# Import necessary libraries

In [None]:
#pip install pandas nltk gensim scikit-learn python-docx

In [None]:
import random
import re
import string
import time
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
from docx import Document
from gensim.models import Word2Vec
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.util import ngrams
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [None]:
import nltk

# Fetch the NLTK resources needed by the stop-word list and the lemmatizer.
for resource in ('stopwords', 'wordnet'):
    nltk.download(resource)

# English stop words as a set, so membership checks are cheap.
stop_words = set(stopwords.words('english'))
# Shared lemmatizer instance used by the preprocessing cell.
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Dataset

In [None]:
# Relative path of the Word document that serves as the text corpus.
doc_path = "wikipedia.docx"
# Open the document with python-docx; its paragraphs are read during preprocessing.
doc = Document(doc_path)

# Preprocessing

In [None]:
# Extract the raw text of every paragraph in the document
text_data = [paragraph.text for paragraph in doc.paragraphs]
# Convert to lowercase
text_data = [text.lower() for text in text_data]
# Remove bracketed spans such as citation markers ("[1]", "[citation needed]")
text_data = [re.sub(r"\[.*?\]", "", text) for text in text_data]
# Keep only words made up entirely of the letters a-z.
# NOTE(review): a word is dropped as a whole if it contains any other character,
# so words with attached punctuation ("actor,") or digits are discarded too.
english_alphabet = set(string.ascii_lowercase)
text_data = [
    ' '.join(word for word in text.split() if all(char in english_alphabet for char in word))
    for text in text_data
]
# Drop paragraphs that are empty after filtering
text_data = [text for text in text_data if text]
# Remove stop words and lemmatize the remaining words
processed_text_data = []
for text in text_data:
    lemmatized_words = [lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words]
    # A paragraph consisting only of stop words would otherwise become an empty
    # row, which comes back from the CSV as NaN.
    if lemmatized_words:
        processed_text_data.append(' '.join(lemmatized_words))
# Create dataframe
df = pd.DataFrame({"Text": processed_text_data})
output_path = "output.csv"
# Write without the index column, then reload from disk
df.to_csv(output_path, index=False)
# keep_default_na=False keeps real words such as "null" or "nan" as text
# instead of letting read_csv turn them into missing values.
df = pd.read_csv(output_path, keep_default_na=False)

In [None]:
# Display the cleaned corpus: one preprocessed paragraph per row in the "Text" column.
df

Unnamed: 0,Text
0,robert
1,robert english film television theatre actor g...
2,starred alongside play written mark appeared e...
3,career
4,guest starring role television series bill por...
...,...
1220,top sport film
1221,legacy
1222,decade since release cemented reputation class...
1223,paul newman reprised role fast eddie film colo...


# Train-test split

In [None]:
# Drop missing rows before splitting: values read back from the CSV as missing
# are float NaN, which an `is not None` check does not remove.
corpus = df['Text'].dropna().astype(str)
# 80/20 split with a fixed seed so the held-out sentences are the same on every run.
train_data, test_data = train_test_split(corpus, test_size=0.2, random_state=42)
# Plain lists of strings, so both splits have the same type downstream.
train_data = train_data.tolist()
test_data = test_data.tolist()

# Evaluation

In [None]:
# Column-oriented store of evaluation metrics, one entry per model.
results = {
    'Model': [],
    'Accuracy': [],
    'Avg Keystrokes Saved': [],
    'Avg Completion Time': [],
    'Context-Capture Score': []
}


def append_results(model_name, accuracy, keystrokes_saved, completion_time, context_score):
    """Record the metrics of one model in the module-level ``results`` dict.

    If ``model_name`` has already been recorded, its row is overwritten instead
    of a second row being added, so re-running an evaluation cell does not
    produce duplicate rows in the final table.

    Args:
        model_name: Label stored in the 'Model' column.
        accuracy: Value stored in the 'Accuracy' column.
        keystrokes_saved: Value stored in the 'Avg Keystrokes Saved' column.
        completion_time: Value stored in the 'Avg Completion Time' column.
        context_score: Value stored in the 'Context-Capture Score' column.
    """
    row = {
        'Model': model_name,
        'Accuracy': accuracy,
        'Avg Keystrokes Saved': keystrokes_saved,
        'Avg Completion Time': completion_time,
        'Context-Capture Score': context_score,
    }
    if model_name in results['Model']:
        # Same model evaluated again: replace its existing row in place.
        position = results['Model'].index(model_name)
        for column, value in row.items():
            results[column][position] = value
    else:
        for column, value in row.items():
            results[column].append(value)



1.   N-gram model



In [None]:
def build_ngram_model(corpus, n=3):
    """Count which word follows each (n-1)-word context in ``corpus``.

    Args:
        corpus: Iterable of sentences; each is tokenised with ``str.split()``.
        n: Size of the n-gram window (context of ``n - 1`` words plus the
            following word). Defaults to 3.

    Returns:
        A ``defaultdict(Counter)`` mapping each context tuple to a ``Counter``
        of the words observed immediately after it.
    """
    successors = defaultdict(Counter)
    for sentence in corpus:
        words = sentence.split()
        # Slide a window of n words over the sentence.
        for start in range(len(words) - n + 1):
            target_pos = start + n - 1
            context = tuple(words[start:target_pos])
            successors[context].update([words[target_pos]])
    return successors

In [None]:
def predict_ngram(model, prefix, n=3):
    """Return the most frequent next word for ``prefix`` under an n-gram model.

    Args:
        model: Mapping from context tuples of ``n - 1`` words to a ``Counter``
            of following words, in the shape returned by ``build_ngram_model``.
        prefix: Whitespace-separated text typed so far.
        n: Order of the model; must match the ``n`` the model was built with.
            Defaults to 3, i.e. the last two words of ``prefix`` are the context.

    Returns:
        The most common continuation of the context, or ``None`` when the
        context was never seen (or has no recorded continuations).
    """
    context_size = n - 1
    tokens = prefix.split()
    # tokens[-0:] would return the whole list, so the unigram case (n=1)
    # needs an explicit empty context.
    context = tuple(tokens[-context_size:]) if context_size > 0 else ()
    # .get() avoids inserting an empty Counter into a defaultdict model.
    followers = model.get(context)
    if not followers:
        return None
    return followers.most_common(1)[0][0]

In [None]:
# Reuse the train/test split made earlier in the notebook. Rebuilding both lists
# from the full df['Text'] column would make train and test identical, so the
# model would be scored on sentences it was fitted on.
train_data = [str(text) for text in train_data if pd.notna(text)]
test_data = [str(text) for text in test_data if pd.notna(text)]
ngram_model = build_ngram_model(train_data)
start_time = time.time()
ngram_predictions = []
valid_actuals = []

for text in test_data:
    words = text.split()
    if len(words) > 2:  # Need a two-word context plus the word to predict
        prefix = " ".join(words[:-1])
        prediction = predict_ngram(ngram_model, prefix)
        if prediction:  # Only append if prediction is not None
            ngram_predictions.append(prediction)
            valid_actuals.append(words[-1])  # Only append the actual last word if valid

completion_time = time.time() - start_time
# NOTE(review): sentences whose context was never seen in training are left out,
# so accuracy is measured only over the cases where the model produced a guess.
if valid_actuals and ngram_predictions:  # Check if there are any valid predictions and actuals
    accuracy = accuracy_score(valid_actuals, ngram_predictions)
    # Mean length of the predicted words, whether or not the prediction was correct.
    keystrokes_saved = sum(len(pred) for pred in ngram_predictions) / len(ngram_predictions)
    append_results('N-gram', accuracy, keystrokes_saved, completion_time, 1)

2. Word2Vec

In [None]:
# Train a skip-gram (sg=1) Word2Vec model on the tokenised training sentences.
# A fixed seed together with a single worker thread makes training repeatable;
# with several workers, thread scheduling changes the result from run to run.
# (Reproducibility across interpreter launches also needs PYTHONHASHSEED set.)
word2vec_model = Word2Vec(
    [text.split() for text in train_data],
    vector_size=250,
    window=5,
    sg=1,
    epochs=50,
    min_count=1,
    workers=1,
    seed=42,
)

In [None]:
def predict_word2vec(model, prefix):
    """Suggest a word based on the last word of ``prefix``.

    Args:
        model: Object exposing ``wv.key_to_index`` (vocabulary lookup) and
            ``wv.most_similar(word)`` (ranked neighbours), e.g. a trained Word2Vec.
        prefix: Whitespace-separated text typed so far.

    Returns:
        The top-ranked neighbour of the last word in ``prefix``, or ``None`` if
        the prefix is empty or its last word is not in the vocabulary.
    """
    tokens = prefix.split()
    if not tokens:
        return None
    last_word = tokens[-1]
    if last_word not in model.wv.key_to_index:
        return None
    nearest = model.wv.most_similar(last_word)
    return nearest[0][0]

# Word2Vec evaluation

In [None]:

# Score Word2Vec on the same task as the n-gram model: guess the final word of
# each test sentence from the words that precede it.
start_time = time.time()
word2vec_predictions, valid_actuals_w2v = [], []

for text in test_data:
    words = text.split()
    if len(words) <= 1:
        continue  # no preceding word to predict from
    prefix = " ".join(words[:-1])
    prediction = predict_word2vec(word2vec_model, prefix)
    if not prediction:
        continue  # nothing predicted, so the sentence is left out
    word2vec_predictions.append(prediction)
    valid_actuals_w2v.append(words[-1])

completion_time = time.time() - start_time
# Record metrics only when at least one prediction was made.
if valid_actuals_w2v and word2vec_predictions:
    accuracy = accuracy_score(valid_actuals_w2v, word2vec_predictions)
    total_chars = sum(map(len, word2vec_predictions))
    keystrokes_saved = total_chars / len(word2vec_predictions)
    append_results('Word2Vec', accuracy, keystrokes_saved, completion_time, 2)

In [None]:
# Collect the per-model metrics into a table. Ending the cell with the frame
# itself uses the notebook's rich table display, which does not wrap columns
# the way print() does.
results_df = pd.DataFrame(results)
results_df

      Model  Accuracy  Avg Keystrokes Saved  Avg Completion Time  \
0    N-gram  0.924569              6.201509             0.016903   
1  Word2Vec  0.084440              7.241935             0.658448   
2  Word2Vec  0.096774              7.197343             2.220603   
3  Word2Vec  0.052182              7.051233             0.774965   
4  Word2Vec  0.090133              7.257116             0.794732   

   Context-Capture Score  
0                      1  
1                      2  
2                      2  
3                      2  
4                      2  
