<a href="https://colab.research.google.com/github/Saadkhalid913/ML-Practice/blob/main/ImprovedNLPModel.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Text preprocessing: NLTK provides the stopword corpus and the WordNet lemmatizer.
import nltk 
from nltk.stem import wordnet, WordNetLemmatizer
import re 
# Download the corpora that nltk.corpus.stopwords and WordNetLemmatizer read from.
nltk.download("stopwords")
nltk.download('wordnet')

# Numerics / data handling.
import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 

# Splitting, label encoding, the neural network and evaluation.
from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score

# English stopword list, loaded once at notebook level; CleanFeatures filters against it.
EnglishStopwords = nltk.corpus.stopwords.words("english")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [None]:
# Load the "<sentence>;<label>" file.
# header=None: without it pandas consumes the first line as column names, which
# silently drops one sample (the recorded run shows 14399 + 1600 = 15999 rows).
# NOTE(review): this assumes train.txt has no header line - confirm against the file.
data = pd.read_csv("train.txt", sep = ";", header = None, names = ["text", "label"])

# Keep both as 2D (n, 1) arrays: CleanFeatures flattens the text column and
# OneHotEncoder expects a 2D label column.
x = data.iloc[ : , : -1].values
y = data.iloc[ : , -1 : ].values

# Fixed random_state so the 90/10 split (and every downstream number) is
# reproducible across Restart & Run All.
trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.1, random_state=42)

print(trainX.shape)
print(trainY.shape)
print(testX.shape)
print(testY.shape)

(14399, 1)
(14399, 1)
(1600, 1)
(1600, 1)


In [None]:
def CleanFeatures(features):
  '''
    Normalizes raw text so it can be tokenized.

    features: numpy array of strings (any shape) or a plain sequence of
              strings; it is flattened to 1D before processing.

    For every sentence: each character that is not a letter a-z/A-Z
    (digits and punctuation included) is replaced by a space, the text is
    lower-cased and split on whitespace, English stopwords are dropped and
    the remaining words are lemmatized with WordNetLemmatizer.

    returns: list of cleaned sentences (words joined by single spaces),
             in the order of the flattened input.
  '''

  lemma = WordNetLemmatizer()
  # Build the lookup set once instead of once per word.
  stopwords = set(EnglishStopwords)
  # np.asarray accepts lists as well as ndarrays, so calling this on an
  # already-cleaned list no longer fails on a missing .flatten().
  sentences = np.asarray(features).flatten()

  cleaned = []
  for sentence in sentences:
    words = re.sub("[^a-zA-Z]", " ", sentence).lower().split()
    kept = [lemma.lemmatize(word) for word in words if word not in stopwords]
    cleaned.append(" ".join(kept))

  return cleaned

# Replace the raw (n, 1) training text array with a flat list of cleaned sentences.
trainX = CleanFeatures(features = trainX)

In [None]:
def Tokenize(sentences, maxlen = 50):
  '''
    Fits a Keras Tokenizer on the given sentences and converts them to
    fixed-length integer sequences.

    sentences: 1D sequence of strings used both to build the vocabulary
               and to produce the returned sequences
    maxlen:    length every sequence is padded / truncated to
               (50 by default)

    returns: (sequences, tokenizer) where sequences is an int32 array of
             shape (len(sentences), maxlen) and tokenizer is the fitted
             Tokenizer, to be reused on unseen data
  '''
  tokenizer = tf.keras.preprocessing.text.Tokenizer()
  tokenizer.fit_on_texts(sentences)
  sequences = tokenizer.texts_to_sequences(sentences)
  sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = maxlen, dtype='int32')
  return sequences, tokenizer
def TokenizeTestData(testData, tokenizerObject, maxlen = 50):
  '''
    Converts sentences to padded integer sequences using an already
    fitted tokenizer (the vocabulary is NOT refitted).

    testData:        1D array of sentences
    tokenizerObject: fitted Keras Tokenizer
    maxlen:          length every sequence is padded / truncated to
                     (50 by default); must match the length the model
                     was trained with

    returns: int32 array of shape (len(testData), maxlen)
  '''
  # NOTE(review): the tokenizer is expected to have been created without an
  # oov_token, in which case Keras drops unseen words - confirm at the call site.
  sequences = tokenizerObject.texts_to_sequences(testData)
  return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = maxlen, dtype='int32')


In [None]:
# Fit the vocabulary on the cleaned training sentences; trainX becomes the padded
# integer matrix and the fitted tokenizer is kept for the test data.
trainX, tokenizer = Tokenize(sentences = trainX)

In [None]:
# Embedding input size: one slot per known word plus one extra, since Keras word
# indices start at 1 and pad_sequences fills with 0 by default.
num_words = 1 + len(tokenizer.index_word)

In [None]:
# Learn the label categories from the training labels, then turn the (n, 1)
# label column into a dense one-hot matrix.
encoder = OneHotEncoder()
encoder.fit(trainY)
trainY = encoder.transform(trainY).toarray()

In [None]:
def CreateModel(vocab_size = None, embedding_dim = 480, sequence_length = 50, num_classes = 6):
  '''
    Builds and compiles the classifier: an Embedding layer, flattened and
    followed by two ReLU Dense layers (128 and 64 units) and a softmax
    output.

    vocab_size:      number of rows in the embedding table; when None the
                     notebook-level num_words is used
    embedding_dim:   size of each word vector (480 by default)
    sequence_length: length of the padded input sequences (50 by default);
                     must match the maxlen used when tokenizing
    num_classes:     number of output classes (6 by default); must match
                     the width of the one-hot label matrix

    returns: compiled tf.keras Sequential model (adam optimizer,
             categorical cross-entropy loss, accuracy metric)
  '''
  if vocab_size is None:
    # Preserve the original behavior of reading the global set after tokenizing.
    vocab_size = num_words

  model = tf.keras.models.Sequential()
  model.add(tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length))
  # Flatten needs the fixed input_length above so the Dense layer sees a known width.
  model.add(tf.keras.layers.Flatten())
  model.add(tf.keras.layers.Dense(units = 128, activation="relu"))
  model.add(tf.keras.layers.Dense(units = 64, activation="relu"))
  model.add(tf.keras.layers.Dense(units = num_classes, activation="softmax"))
  model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])
  return model 

In [None]:
# Build the network, print its layer/parameter table, then train on the full
# training set (no validation split) for 25 epochs in mini-batches of 32.
ann = CreateModel()
ann.summary()
ann.fit(x = trainX, y = trainY, batch_size = 32, epochs = 25)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 480)           6143040   
                                                                 
 flatten (Flatten)           (None, 24000)             0         
                                                                 
 dense (Dense)               (None, 128)               3072128   
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 6)                 390       
                                                                 
Total params: 9,223,814
Trainable params: 9,223,814
Non-trainable params: 0
_________________________________________________________________
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Ep

<keras.callbacks.History at 0x7f4499ec8710>

In [None]:
# ann.save_weights("HIGH_PARAM_MODEL")

In [None]:
# Apply the same cleaning to the held-out text, then map it to padded integer
# sequences with the tokenizer fitted on the training data.
testX = TokenizeTestData(CleanFeatures(testX), tokenizer)

In [None]:
# One-hot encode the held-out labels with the encoder fitted on the training labels.
testY = encoder.transform(testY).toarray()

# Check that features and labels still line up row for row before predicting
# (replaces two bare `.shape` expressions that were evaluated but never shown).
assert len(testX) == len(testY), "test features and labels have different row counts"

# One row of class probabilities per test sentence.
result = ann.predict(testX)

In [None]:
# Collapse one-hot labels and predicted probability rows to class indices.
y_truth = np.argmax(testY, axis = 1)
y_preds = np.argmax(result, axis = 1)

# Accuracy is the mean of the element-wise match vector, so it stays correct for
# any test-set size instead of dividing by a hard-coded 1600.
correct_preds = y_preds == y_truth 
print(np.mean(correct_preds))



0.83875
