In [54]:
!pip install python-docx
import nltk
nltk.download('punkt')
nltk.download('stopwords')



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [55]:
#import statements
import docx
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import string
import pandas as pd
from docx import Document

In [56]:
#Read the document
file_path = "/content/wikipedia.docx"

doc = docx.Document(file_path)


# Read the DOCX file and extract text
def read_docx(doc):
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + " "
    return text
text = read_docx(doc)


# LSTM Model: Sentence Completion
### Preprocessing for LSTM

In [57]:
#Preprocessing for LSTM

text_data = [paragraph.text for paragraph in doc.paragraphs]  # Extract text from paragraphs
text_data = [text.lower() for text in text_data] # Convert text to lowercase
text_data = [re.sub(r"\[.*?\]", "", text) for text in text_data] # Remove special characters and words between them using regex
english_alphabet = set(string.ascii_lowercase) # Remove words not in the English alphabet
text_data = [' '.join([word for word in text.split() if all(char in english_alphabet for char in word)]) for text in text_data]
text_data = [text.strip() for text in text_data if text.strip()] # Remove leading/trailing whitespaces

# Create a DataFrame with the cleaned text data
df = pd.DataFrame({"Text": text_data})
output_path = "output.csv"  # Save the cleaned text data to a CSV file
df.to_csv(output_path, index=False) # Set index=False to exclude the index column in the output

print("Text data cleaned and saved to:", output_path)


Text data cleaned and saved to: output.csv


In [58]:
import torch
import pandas as pd
from collections import Counter

class TextDataset(torch.utils.data.Dataset):
    def __init__(self, args):
        self.args = args
        self.words = self.load_words()
        self.unique_words = self.get_unique_words()

        self.index_to_word = {index: word for index, word in enumerate(self.unique_words)}
        self.word_to_index = {word: index for index, word in enumerate(self.unique_words)}

        self.word_indexes = [self.word_to_index[w] for w in self.words]

    def load_words(self):
        train_df = pd.read_csv('/content/output.csv')
        text = train_df['Text'].str.cat(sep=' ')
        return text.split(' ')

    def get_unique_words(self):
        word_counts = Counter(self.words)
        return sorted(word_counts.keys(), key=word_counts.get, reverse=True)

    def __len__(self):
        return len(self.word_indexes) - self.args

    def __getitem__(self, index):
        return (
            torch.tensor(self.word_indexes[index:index + self.args]),
            torch.tensor(self.word_indexes[index + 1:index + self.args + 1])
        )


###Building the LSTM Model

In [59]:
from torch import nn

class LSTMModel(nn.Module):
    def __init__(self, dataset):
        super(LSTMModel, self).__init__()
        self.lstm_size = 128
        self.embedding_dim = 128
        self.num_layers = 3

        n_vocab = len(dataset.unique_words)
        self.embedding = nn.Embedding(
            num_embeddings=n_vocab,
            embedding_dim=self.embedding_dim,
        )
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.lstm_size,
            num_layers=self.num_layers,
            dropout=0.2,
        )
        self.fc = nn.Linear(self.lstm_size, n_vocab)

    def forward(self, x, prev_state):
        embed = self.embedding(x)
        output, state = self.lstm(embed, prev_state)
        logits = self.fc(output)

        return logits, state

    def init_state(self, sequence_length):
        return (
            torch.zeros(self.num_layers, sequence_length, self.lstm_size),
            torch.zeros(self.num_layers, sequence_length, self.lstm_size)
        )


In [60]:
from torch.utils.data import DataLoader, random_split

# Hyperparameters
sequence_length = 10
batch_size = 64
learning_rate = 0.001
num_epochs = 5

# Create the dataset
dataset = TextDataset(sequence_length)

# Split the dataset into training and validation sets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# Create the model
model = LSTMModel(dataset)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0

    for batch in train_loader:
        inputs, targets = batch

        optimizer.zero_grad()

        hidden = model.init_state(sequence_length)
        outputs, _ = model(inputs, hidden)

        loss = criterion(outputs.view(-1, len(dataset.unique_words)), targets.view(-1))
        loss.backward()

        optimizer.step()

        total_loss += loss.item()

    # Calculate average loss for the epoch
    average_loss = total_loss / len(train_loader)

    # Print the epoch and average loss
    print(f"Epoch [{epoch+1}/{num_epochs}], Average Loss: {average_loss:.4f}")

    # Validation loop
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for batch in val_loader:
            inputs, targets = batch

            hidden = model.init_state(sequence_length)
            outputs, _ = model(inputs, hidden)

            loss = criterion(outputs.view(-1, len(dataset.unique_words)), targets.view(-1))
            val_loss += loss.item()

    # Calculate average validation loss for the epoch
    average_val_loss = val_loss / len(val_loader)

    # Print the epoch and average validation loss
    print(f"Epoch[{epoch+1}/{num_epochs}], Validation Loss: {average_val_loss: .4f}")


Epoch [1/5], Average Loss: 6.8169
Epoch[1/5], Validation Loss:  6.6144
Epoch [2/5], Average Loss: 6.4623
Epoch[2/5], Validation Loss:  6.3164
Epoch [3/5], Average Loss: 6.1956
Epoch[3/5], Validation Loss:  6.1031
Epoch [4/5], Average Loss: 6.0412
Epoch[4/5], Validation Loss:  5.9756
Epoch [5/5], Average Loss: 5.9224
Epoch[5/5], Validation Loss:  5.8620


###LSTM Model - Output Generation

In [61]:
def lstmmodel(input_sentence):
  try:
    input_indexes = [dataset.word_to_index[word] for
                    word in input_sentence.split()]
    input_tensor = torch.tensor(input_indexes,
                                dtype=torch.long).unsqueeze(0)
    model.eval()
    hidden = model.init_state(len(input_indexes))
    outputs, _ = model(input_tensor, hidden)
    predicted_index = torch.argmax(outputs[0, -1, :]).item()
    predicted_word = dataset.index_to_word[predicted_index]
    return predicted_word

  except KeyError:
        return None


#TESTING!!!!!!

In [62]:
try:
  input_sentence="he was in"
  # Preprocess the input sentence
  input_indexes = [dataset.word_to_index[word] for word in input_sentence.split()]
  input_tensor = torch.tensor(input_indexes,dtype=torch.long).unsqueeze(0)
      # Generate the next word
  model.eval()
  hidden = model.init_state(len(input_indexes))
  outputs, _ = model(input_tensor, hidden)
  predicted_index = torch.argmax(outputs[0, -1, :]).item()
  predicted_word = dataset.index_to_word[predicted_index]

      # Print the predicted word
  print("Input Sentence:", input_sentence)
  print("Predicted Next Word:", predicted_word)

  #return predicted_word
except KeyError:
      # Return None if there is no suitable prediction
      print("None")

Input Sentence: he was in
Predicted Next Word: the


#TRIE Model: Word Completion

###Building the TRIE Model

In [63]:
#TRIE Model
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_end_of_word = False

class Trie:
    def __init__(self):
        self.root = TrieNode()

    def insert_word(self, word):
        node = self.root
        for char in word:
            if char not in node.children:
                node.children[char] = TrieNode()
            node = node.children[char]
        node.is_end_of_word = True

    def search_word(self, prefix):
        node = self.root
        for char in prefix:
            if char not in node.children:
                return []
            node = node.children[char]
        return self._collect_words_with_prefix(node, prefix)

    def _collect_words_with_prefix(self, node, prefix):
        results = []
        if node.is_end_of_word:
            results.append(prefix)
        for char, child_node in node.children.items():
            results.extend(self._collect_words_with_prefix(child_node, prefix + char))
        return results




def preprocess_words(text):
    stop_words = set(stopwords.words('english'))

    # Tokenize the text
    words = word_tokenize(text)

    # Apply preprocessing steps
    preprocessed_words = []
    for word in words:
        # Convert to lowercase
        word = word.lower()

        # Remove non-alphabetic characters
        word = re.sub(r'[^a-zA-Z]', '', word)

        # Remove stop words
        if word not in stop_words and word != "unk":
            preprocessed_words.append(word)

    return preprocessed_words


def build_trie(words):
    trie = Trie()
    for word in words:
        trie.insert_word(word)
    return trie

preprocessed_words = preprocess_words(text)
trie = build_trie(preprocessed_words)

#TESTING!!!!!

In [64]:
prefix = "ser"
word_completions = trie.search_word(prefix)
print("Word Completions for prefix '{}': {}".format(prefix, word_completions))

Word Completions for prefix 'ser': ['series', 'serious', 'serve', 'served', 'serves', 'servant', 'servants', 'service', 'services', 'serviced', 'servicemen', 'serving', 'serb', 'serbs', 'serbia', 'sergeant']


#Getting the Best Suggestions And Predictions

###Sort Model Suggestions

In [65]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

def tf_idf_relevance(user_input, suggestions, corpus):

  #user_input = preprocess_text(user_input)
  #suggestions = [preprocess_text(s) for s in suggestions]
  #corpus = [preprocess_text(doc) for doc in corpus]

  vectorizer = TfidfVectorizer()

  vectorizer.fit(corpus)

  user_input_vector = vectorizer.transform([user_input])
  suggestions_vectors = vectorizer.transform(suggestions)
  corpus_vectors = vectorizer.transform(corpus)

  cosine_similarities = user_input_vector.dot(suggestions_vectors.T).toarray()[0]

  ranked_suggestions = sorted(zip(suggestions, cosine_similarities), key=lambda x: x[1], reverse=True)

  return dict(ranked_suggestions)

##Combine outputs of Trie and LSTM Model

In [67]:
user_input = "He was act"
user_input = user_input.lower()
# Tokenize the user input
tokens = user_input.split()

# Extract prefix for Trie model (last word)
trie_prefix = tokens[-1]

# Extract input sequence for LSTM model (all words)
lstm_input_sequence = " ".join(tokens)

# Generate suggestions from Trie model using prefix
trie_suggestions = trie.search_word(trie_prefix)

# Generate predictions from LSTM model using input sequence
lstm_predictions = lstmmodel(lstm_input_sequence)

#print("lstm_predictions",lstm_predictions)
#print("trie_suggestions",trie_suggestions)



combined_suggestions = trie_suggestions + [lstm_predictions]

# Assign weights to suggestions based on source
trie_weight = 0.5
lstm_weight = 0.7

# Calculate relevance scores based on weights
relevance_scores = {}
for suggestion in combined_suggestions:
    relevance_scores[suggestion] = 0
    if suggestion in trie_suggestions:
        relevance_scores[suggestion] += trie_weight
    if suggestion == lstm_predictions:
        relevance_scores[suggestion] += lstm_weight

# Get frequency of each suggestion in the dataset
word_frequencies = {}
for word in preprocessed_words:
    word_frequencies[word] = word_frequencies.get(word, 0) + 1


# Sort suggestions based on relevance and frequency
top_suggestions = sorted(combined_suggestions, key=lambda x: (relevance_scores.get(x, 0), word_frequencies.get(x, 0)), reverse=True)[:5]

train_df = pd.read_csv('/content/output.csv')
text = train_df['Text'].str.cat(sep=' ')
ranked = tf_idf_relevance(user_input,top_suggestions,preprocessed_words)
# Display top suggestions to the user
print("Input Sentence:", user_input)
print("Top Suggestions:", top_suggestions)
#print("Ranked:", ranked)

Input Sentence: he was act
Top Suggestions: ['of', 'action', 'actors', 'actor', 'act']


###TRAIL!!!!!

In [None]:
from collections import Counter

def calculate_frequency(dataset):
    # Count the occurrences of each suggestion in the dataset
    suggestion_counts = Counter(dataset)
    return suggestion_counts

# Example dataset
dataset = preprocessed_words
suggestion_frequencies = calculate_frequency(dataset)
print(suggestion_frequencies)

def sort_suggestions(trie_suggestions, trie_prefix, suggestion_frequencies):
    def suggestion_key(suggestion):
        length_weight = 0.3
        prefix_weight = 0.5
        frequency_weight = 0.2
        return (length_weight * len(suggestion) +
                prefix_weight * (suggestion.startswith(trie_prefix)) +
                frequency_weight * suggestion_frequencies.get(suggestion, 0))

    sorted_suggestions = sorted(trie_suggestions, key=suggestion_key, reverse=True)
    return sorted_suggestions

# Example usage
trie_prefix = "act"
sorted_suggestions = sort_suggestions(trie_suggestions, trie_prefix, suggestion_frequencies)
print(sorted_suggestions)


In [None]:
user_input = "He was ser"
user_input = user_input.lower()
# Tokenize the user input
tokens = user_input.split()

# Extract prefix for Trie model (last word)
trie_prefix = tokens[-1]

# Extract input sequence for LSTM model (all words)
lstm_input_sequence = " ".join(tokens)

# Generate suggestions from Trie model using prefix
trie_suggestions = trie.search_word(trie_prefix)
print(trie_suggestions)
