In [105]:
import random
from collections import defaultdict
from typing import List, Mapping, Optional, Sequence

import numpy as np


## Q1

In [139]:
# Monte-Carlo simulation
def prob_rain_more_than_n(
    p: Sequence[float],
    n: int,
    trials: int = 10000,
    seed: Optional[int] = None,
) -> float:
    """Estimate by simulation the probability of more than ``n`` rainy days.

    Every trial draws one uniform number per day and marks the day as rainy
    when the draw falls below that day's probability; days are therefore
    simulated independently of each other.

    Args:
        p: Per-day rain probabilities; ``len(p)`` is the number of days.
        n: Threshold. A trial is a success when it has strictly more than
            ``n`` rainy days.
        trials: Number of simulated trials. Defaults to 10000, the value that
            used to be hard-coded.
        seed: Optional seed. When given, the draws come from a dedicated
            generator, so the estimate is reproducible and the global NumPy
            random state is left untouched. When ``None`` the global
            ``np.random`` state is used, exactly as before.

    Returns:
        The fraction of trials with more than ``n`` rainy days.

    Raises:
        ValueError: If ``trials`` is not positive.
    """
    if trials <= 0:
        raise ValueError("trials must be a positive integer")

    total_days = len(p)
    day_probs = np.asarray(p, dtype=float)

    # One row per trial, one column per day, uniform draws in [0, 1).
    if seed is None:
        draws = np.random.rand(trials, total_days)
    else:
        draws = np.random.default_rng(seed).random((trials, total_days))

    # A day is rainy when its draw is below that day's probability; the
    # comparison broadcasts the per-day probabilities across all trials.
    rainy_days_per_trial = np.sum(draws < day_probs, axis=1)

    # Count how many trials had more than n rainy days
    successful_trials = np.count_nonzero(rainy_days_per_trial > n)

    # Return the fraction of successful trials as the probability
    return float(successful_trials / trials)

In [140]:
# Helper that draws one uniformly distributed probability per day
def generate_probabilities(min_value: float, max_value: float, days: int) -> List[float]:
    """Return ``days`` values drawn with ``random.uniform(min_value, max_value)``.

    The draws use the global state of the ``random`` module, so seeding that
    module beforehand makes the result repeatable.
    """
    samples: List[float] = []
    for _ in range(days):
        samples.append(random.uniform(min_value, max_value))
    return samples

# Seed both random sources so the printed estimates are reproducible on a
# fresh "Restart & Run All": the `random` module feeds generate_probabilities
# and the global NumPy generator feeds the Monte-Carlo draws.
RANDOM_SEED = 42
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Test case 1: Probability of rain is 0.3 on each day
p_1 = [0.3] * 365
n_1 = 100
result_1 = prob_rain_more_than_n(p_1, n_1)
print(f"Test Case 1 - Probability of more than {n_1} rainy days: {result_1:.6f}")

# Test case 2: Probability of rain alternates between 0.1 and 0.9
p_2 = [0.1, 0.9] * (365 // 2) + [0.5]  # 182 (0.1, 0.9) pairs plus a final 0.5 -> 365 days
n_2 = 180
result_2 = prob_rain_more_than_n(p_2, n_2)
print(f"Test Case 2 - Probability of more than {n_2} rainy days: {result_2:.6f}")

# Test case 3: Each day's probability is drawn uniformly between 0.1 and 0.5
p_3 = generate_probabilities(0.1, 0.5, 365)  # 365 probabilities between 0.1 and 0.5
n_3 = 100
result_3 = prob_rain_more_than_n(p_3, n_3)
print(f"Test Case 3 - Probability of more than {n_3} rainy days: {result_3:.6f}")



Test Case 1 - Probability of more than 100 rainy days: 0.852900
Test Case 2 - Probability of more than 180 rainy days: 0.636900
Test Case 3 - Probability of more than 100 rainy days: 0.892600


## Q2

In [107]:
# Sample pronunciation dictionary: each word maps to its list of phonemes.
# The pronunciations are written as space-separated strings and split into
# lists; THEIR/THERE are homophones and TOMATO has two variants.
pronunciation_dict = {
    word: phones.split()
    for word, phones in (
        ("ABACUS", "AE B AH K AH S"),
        ("BOOK", "B UH K"),
        ("THEIR", "DH EH R"),
        ("THERE", "DH EH R"),
        ("TOMATO_1", "T AH M AA T OW"),
        ("TOMATO_2", "T AH M EY T OW"),
    )
}

In [111]:
# Preprocess the dictionary to create a phoneme-to-words map
def preprocess_pronunciation_dict(pronunciation_dict):
    """Invert a word -> phonemes mapping into a phoneme-sequence -> words index.

    Args:
        pronunciation_dict: Mapping from a word to its sequence of phonemes.

    Returns:
        A ``defaultdict(list)`` keyed by the phonemes as a tuple. Each value
        lists, in the mapping's iteration order, every word that has exactly
        that pronunciation, so homophones share one key.
    """
    phoneme_to_words = defaultdict(list)
    for word, phonemes in pronunciation_dict.items():
        phoneme_to_words[tuple(phonemes)].append(word)
    return phoneme_to_words


# Function to find all combinations of words with the given phoneme sequence
def find_word_combos_with_pronunciation(
    phonemes: Sequence[str],
    dictionary: Optional[Mapping[str, Sequence[str]]] = None,
) -> List[Sequence[str]]:
    """List every way to split ``phonemes`` into consecutive dictionary words.

    Args:
        phonemes: The phoneme sequence to segment.
        dictionary: Optional word -> phonemes mapping to search. When omitted,
            the module-level ``pronunciation_dict`` is used, as before.

    Returns:
        A list of word lists. Each word list's pronunciations, concatenated in
        order, equal ``phonemes``. Shorter first words come first; homophones
        appear in dictionary order. The result is ``[]`` when no segmentation
        exists and ``[[]]`` (one empty combination) for an empty input.
    """
    if dictionary is None:
        dictionary = pronunciation_dict
    phoneme_to_words = preprocess_pronunciation_dict(dictionary)

    sequence = tuple(phonemes)
    total = len(sequence)
    # No span longer than the longest known pronunciation can match a word.
    longest = max(map(len, phoneme_to_words), default=0)

    # Results per start offset. Without this cache the same suffix is solved
    # again for every path that reaches it, which is exponential even when the
    # final answer is empty.
    solved = {}

    def find_combinations(start):
        if start == total:
            return [[]]  # Reached the end: one way to finish, with no more words
        if start in solved:
            return solved[start]
        results = []
        for end in range(start + 1, min(total, start + longest) + 1):
            words = phoneme_to_words.get(sequence[start:end])
            if not words:
                continue
            tails = find_combinations(end)
            for word in words:
                for rest in tails:
                    # Build a fresh list so cached tails are never shared with callers.
                    results.append([word] + rest)
        solved[start] = results
        return results

    return find_combinations(0)

In [116]:
# Example usage: the same three phonemes repeated twice
phonemes_1 = ["DH", "EH", "R"] * 2
combinations_1 = find_word_combos_with_pronunciation(phonemes_1)
# Show one word combination per line
for combo in combinations_1:
    print(combo)

['THEIR', 'THEIR']
['THEIR', 'THERE']
['THERE', 'THEIR']
['THERE', 'THERE']


In [114]:
# Example usage: phonemes that appear in no dictionary entry
phonemes_2 = "X Y Z".split()
combinations_2 = find_word_combos_with_pronunciation(phonemes_2)

# An empty result means the sequence cannot be split into known words
if not combinations_2:
    print("No words match the phoneme given !")
else:
    for combo in combinations_2:
        print(combo)

No words match the phoneme given !


In [128]:
# Example usage: the two TOMATO pronunciations back to back
phonemes_3 = "T AH M AA T OW T AH M EY T OW".split()
combinations_3 = find_word_combos_with_pronunciation(phonemes_3)

# An empty result means the sequence cannot be split into known words
if not combinations_3:
    print("No words match the phoneme given !")
else:
    for combo in combinations_3:
        print(combo)

['TOMATO_1', 'TOMATO_2']


#### Experimental NLP approach

In [121]:
# Example word to phoneme data: a list of (word, phoneme list) pairs, built
# from space-separated pronunciation strings
data = [
    (word, phones.split())
    for word, phones in (
        ("THEIR", "DH EH R"),
        ("THERE", "DH EH R"),
        ("ABACUS", "AE B AH K AH S"),
        ("TOMATO", "T AH M AA T OW"),
        # Add more words and phonemes
    )
]


In [122]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Split the (word, phonemes) pairs of `data` into two parallel lists once,
# so each list is built a single time and reused for fitting and conversion.
phoneme_texts = [entry[1] for entry in data]
word_texts = [entry[0] for entry in data]

# Tokenize phonemes: fit on the phoneme lists, then convert those same lists
phoneme_tokenizer = Tokenizer()
phoneme_tokenizer.fit_on_texts(phoneme_texts)
phoneme_sequences = phoneme_tokenizer.texts_to_sequences(phoneme_texts)

# Tokenize words: fit on the word strings, then convert those same strings
word_tokenizer = Tokenizer()
word_tokenizer.fit_on_texts(word_texts)
word_sequences = word_tokenizer.texts_to_sequences(word_texts)


In [126]:
# import numpy as np
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

# # Define model parameters
# embedding_size = 64
# hidden_size = 128
# num_phonemes = len(phoneme_tokenizer.word_index) + 1  # Vocabulary size for phonemes
# num_words = len(word_tokenizer.word_index) + 1  # Vocabulary size for words

# # Encoder
# encoder_inputs = Input(shape=(None,))  # Variable length input
# encoder_embedding = Embedding(input_dim=num_phonemes, output_dim=embedding_size)(encoder_inputs)  # Embed phoneme indices
# encoder_lstm = LSTM(hidden_size, return_state=True)
# encoder_outputs, state_h, state_c = encoder_lstm(encoder_embedding)
# encoder_states = [state_h, state_c]

# # Decoder
# decoder_inputs = Input(shape=(None,))
# decoder_embedding = Embedding(input_dim=num_words, output_dim=embedding_size)(decoder_inputs)  # Embed word indices
# decoder_lstm = LSTM(hidden_size, return_sequences=True, return_state=True)
# decoder_outputs, _, _ = decoder_lstm(decoder_embedding, initial_state=encoder_states)
# decoder_dense = Dense(num_words, activation='softmax')
# decoder_outputs = decoder_dense(decoder_outputs)

# # Seq2Seq model
# model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
# model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

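# # Train the model
# # Note: `phoneme_sequences` and `word_sequences` must be tokenized and padded to equal-length arrays first.
# # NOTE(review): the call below passes only the two inputs and no target array, and the model is
# # compiled with 'categorical_crossentropy' - confirm the targets and loss before re-enabling.
# model.fit([phoneme_sequences, word_sequences], epochs=50, batch_size=64)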

