In [4]:
# code
# Import necessary libraries
import re
import numpy as np
import pandas as pd
from tqdm import tqdm 

# Read the 'big.txt' file, which presumably contains text data
with open('big.txt', 'r') as fd:
    lines = fd.readlines()
    words = []
    
    # Tokenize the text by extracting words (alphanumeric sequences) and converting them to lowercase
    for line in lines:
        words += re.findall('\w+', line.lower())

# Function to find N-grams
def get_pairs(words, n):
    # Increment N by 1 to create N-grams
    n = n + 1
    data = []
    
    # Loop through the words to extract N-grams
    for i in range(len(words) - n):
        data.append(' '.join(words[i:i + n]))
    
    return data

# Function to calculate occurrence probabilities
def get_prob_dist(data):
    a = np.array(data)
    
    # Find unique N-grams and their counts using NumPy
    pair, count = np.unique(a, return_counts=True)
    unique_pairs = list(set(data))
    
    prob_dist = []
    
    # Create a list of N-gram, preceding words, following word, and frequency
    for i in range(len(unique_pairs)):
        prob_dist.append([unique_pairs[i], ' '.join(unique_pairs[i].split(' ')[:-1]), unique_pairs[i].split(' ')[-1], count[i]])
    
    return prob_dist

# Generate N-grams (four-grams in this case)
data = get_pairs(words, 4)

# Calculate occurrence probabilities for the N-grams
prob_dist = get_prob_dist(data)

### Sentence Generation

In [5]:
data = get_pairs(words, 4)
prob_dist = get_prob_dist(data)

# Create a DataFrame to store the word pairs and their frequencies
df = pd.DataFrame(prob_dist, columns=['seq', 'inp', 'out', 'freq'])

# Function to predict next word based on an input sequence
def predict(word):
    if len(df[df['inp'] == word]):
        # Filter DataFrame to find matching input sequences
        df_ = df[df['inp'] == word]
        # Sort by frequency and take the top results, then retrieve the 'out' values
        return df_.sort_values(by='freq').head()['out'].values
    else:
        print('Seq is not present')

# Predict the next word for the given input sequence
predict('this is a beautiful')

# Predict the next word for another input sequence
predict('the is a beautiful')

# Function to predict the next 'n' words in a sequence
def pred_seq(seq, n):
    output = []
    output.append(seq)

    for i in range(n):
        pred = predict(seq)
        # Update the input sequence by removing the first word and adding the predicted word
        seq = ' '.join(seq.split(' ')[1:]) + ' ' + pred[0]
        output.append(pred[0])

    return ' '.join(output)

# Predict the next 50 words in a sequence starting with 'of the united states'
pred_seq('of the united states', 50)

Seq is not present


'of the united states demanded the immediate ratification of the treaty the democrats and populists took exception in the senate where southern interests were intrenched then after the senate was won over a democratic president james buchanan vetoed the bill still the issue lived the republicans strong among the farmers of the northwest favored'

### Context word Prediction

In [6]:
def get_prob_dist(data):
    # Initialize an empty list to store probability distribution data
    prob_dist = []    
    # Convert input data to a NumPy array
    a = np.array(data)
    # Find unique pairs and their counts from the input data
    pairs, counts = np.unique(a, return_counts = True)
    
    # Iterate through each unique pair
    for i in range(len(pairs)):
        
        # Extract left, right, and middle sequences from the pair
        left_seq   = ' '.join(pairs[i].split(' ')[:len(pairs[i].split(' '))//2])
        right_seq  = ' '.join(pairs[i].split(' ')[len(pairs[i].split(' '))//2 + 1: ])
        middle_seq = pairs[i].split(' ')[len(pairs[i].split(' '))//2]
        
        # Store the pair and its components along with frequency in the probability distribution list
        prob_dist.append([pairs[i],left_seq, right_seq, middle_seq, counts[i]])
        
    return prob_dist
        
# Generate sequences of three words (3-grams)   
data = get_pairs(words,3)
# Calculate probability distribution of the generated sequences
prob_dist = get_prob_dist(data)
# Create a DataFrame from the probability distribution data
df = pd.DataFrame(prob_dist, columns = ['seq','left_seq','right_seq','output','freq'])
# Display the first few rows of the DataFrame
df.head()

Unnamed: 0,seq,left_seq,right_seq,output,freq
0,0 05 grm novarsenbillon,0 05,novarsenbillon,grm,1
1,0 25 u and,0 25,and,u,1
2,0 45 grm given,0 45,given,grm,1
3,0 5 to 2,0 5,2,to,1
4,0 6 grm all,0 6,all,grm,1


In [7]:
def predict(word):
    # Split the input word into left and right sequences
    left_seq = word.split('_')[0].strip()
    right_seq = word.split('_')[1].strip()

    # Filter the DataFrame to find matches for the left and right sequences
    df_ = df[df['left_seq'] == left_seq]
    df_ = df_[df_['right_seq'] == right_seq]

    # Sort the matches by frequency in descending order and extract the top outputs
    return list(df_.sort_values(by='freq', ascending=False).head()['output'].values)

# Predict words based on the given left and right sequences
predict('the _ states')

[]