<a href="https://colab.research.google.com/github/Saadkhalid913/ML-Practice/blob/main/FINAL_NLP_MODEL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import nltk 
from nltk.stem import wordnet, WordNetLemmatizer
import re 
# Fetch the NLTK corpora this notebook relies on: the stopword list read at
# the end of this cell, and WordNet (presumably needed by WordNetLemmatizer
# -- TODO confirm).
nltk.download("stopwords")
nltk.download('wordnet')

import matplotlib.pyplot as plt 
import numpy as np 
import pandas as pd 

from sklearn.model_selection import train_test_split 
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
import tensorflow as tf
from tensorflow.keras.layers import Dense
from sklearn.metrics import accuracy_score

# English stopword list from NLTK; CleanFeatures drops every token found in it.
EnglishStopwords = nltk.corpus.stopwords.words("english")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# Fixed seed so the train/test partition (and every metric computed from it)
# is identical after a kernel restart.
SEED = 42

# train.txt is read as a header-less, semicolon-separated table: every column
# except the last is treated as a feature, the last column as the label.
data = pd.read_csv("train.txt", sep = ";", header=None)
x = data.iloc[ : , : -1].values
# The `-1 :` slice keeps the labels two-dimensional (n, 1).
y = data.iloc[ : , -1 : ].values

# Hold out 10% of the rows for evaluation.
trainX, testX, trainY, testY = train_test_split(x, y, test_size=0.1, random_state=SEED)

print(trainX.shape)
print(trainY.shape)
print(testX.shape)
print(testY.shape)

(14400, 1)
(14400, 1)
(1600, 1)
(1600, 1)


In [None]:
def CleanFeatures(features):
  '''
    Normalises raw text before tokenization.

    For every sentence: each character that is not an ASCII letter
    (digits and punctuation included) is replaced by a space, the text
    is lowercased and split on whitespace, English stopwords are
    dropped, the remaining words are lemmatized and re-joined with
    single spaces.

    features: array-like of strings of any shape (e.g. an (n, 1) numpy
              column or a plain list); it is flattened before processing
    returns:  list of cleaned sentences, one per input string, in
              flattened order
  '''

  lemma = WordNetLemmatizer()
  # Build the lookup set once; membership is tested for every single word.
  stopwords = set(EnglishStopwords)
  # np.asarray lets plain lists through, so feeding this function its own
  # output (e.g. when a cell is re-run) does not fail on `.flatten()`.
  sentences = np.asarray(features).flatten()
  cleaned = []
  for sentence in sentences:
      words = re.sub("[^a-zA-Z]", " ", sentence).lower().split()
      kept = [lemma.lemmatize(word) for word in words if word not in stopwords]
      cleaned.append(" ".join(kept))

  return cleaned

# Clean the training text; trainX is rebound to the list of cleaned sentences
# that CleanFeatures returns.
trainX = CleanFeatures(trainX)

In [None]:
def Tokenize(sentences, maxlen = 150):
  ''' 
    Fits a new Keras Tokenizer on the given sentences and encodes them
    as padded integer sequences.

    sentences: 1D sequence of strings
    maxlen:    length passed to pad_sequences for every encoded
               sentence (150 by default)
    returns:   (sequences, tokenizer) - the int32 padded sequences and
               the fitted tokenizer, which has to be reused to encode
               any data the model sees later
  '''
  tokenizer = tf.keras.preprocessing.text.Tokenizer()
  tokenizer.fit_on_texts(sentences)
  sequences = tokenizer.texts_to_sequences(sentences)
  sequences = tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = maxlen, dtype='int32')
  return sequences, tokenizer
def TokenizeTestData(testData, tokenizerObject, maxlen = 150):
  '''
    Encodes sentences with an already fitted tokenizer (no re-fitting)
    and pads them.

    testData:        1D array of sentences
    tokenizerObject: fitted tokenizer exposing texts_to_sequences
    maxlen:          length passed to pad_sequences (150 by default);
                     it has to match the input length the model was
                     built with
    returns:         int32 padded sequences
  '''
  sequences = tokenizerObject.texts_to_sequences(testData)
  return tf.keras.preprocessing.sequence.pad_sequences(sequences, maxlen = maxlen, dtype='int32')


In [None]:
# Fit the tokenizer on the cleaned training sentences and rebind trainX to the
# padded sequences; the fitted tokenizer is kept to encode later inputs.
trainX , tokenizer = Tokenize(trainX)

In [None]:
# Vocabulary size handed to the Embedding layer in CreateModel: one slot per
# word known to the tokenizer plus one extra (presumably for index 0, the
# padding value -- TODO confirm).
num_words = len(tokenizer.index_word) + 1

In [None]:
# One-hot encode the training labels. The fitted encoder is reused on the test
# labels later; .toarray() turns the encoder output into a dense array.
encoder = OneHotEncoder()
trainY = encoder.fit_transform(trainY).toarray()

In [None]:
def CreateModel(vocab_size = None, embedding_dim = 240, sequence_length = 150, num_classes = 6):
  '''
    Builds and compiles the classifier: an embedding layer, flattened,
    followed by two ReLU dense layers (64 and 48 units, with dropout
    between them) and a softmax output layer.

    vocab_size:      input dimension of the embedding; when None the
                     notebook-level `num_words` is used
    embedding_dim:   size of each embedding vector
    sequence_length: length of the padded input sequences
    num_classes:     number of units in the softmax output layer
    returns:         compiled tf.keras Sequential model (adam optimizer,
                     categorical cross-entropy loss, accuracy metric)
  '''
  if vocab_size is None:
    # Resolved at call time so the default follows the fitted tokenizer.
    vocab_size = num_words

  model = tf.keras.models.Sequential([
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=sequence_length),
      tf.keras.layers.Flatten(),
      tf.keras.layers.Dense(units = 64, activation="relu"),
      tf.keras.layers.Dropout(rate = 0.15),
      tf.keras.layers.Dense(units = 48, activation="relu"),
      tf.keras.layers.Dense(units = num_classes, activation="softmax"),
  ])
  model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])
  return model 

In [None]:
# Build the network, print its layer summary and train it on the encoded
# training sentences and one-hot labels.
ann2 = CreateModel()
ann2.summary()
ann2.fit(trainX, trainY, epochs = 5, batch_size = 64)
# Save the trained weights under the name "test".
ann2.save_weights("test")

Model: "sequential_5"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_5 (Embedding)     (None, 150, 240)          3063840   
                                                                 
 flatten_5 (Flatten)         (None, 36000)             0         
                                                                 
 dense_15 (Dense)            (None, 64)                2304064   
                                                                 
 dropout_5 (Dropout)         (None, 64)                0         
                                                                 
 dense_16 (Dense)            (None, 48)                3120      
                                                                 
 dense_17 (Dense)            (None, 6)                 294       
                                                                 
Total params: 5,371,318
Trainable params: 5,371,318
No

In [None]:
# Apply the same cleaning as the training text, then encode with the tokenizer
# that was fitted on the training sentences (it is not re-fitted here).
testX = CleanFeatures(testX)
testX = TokenizeTestData(testX, tokenizer)

In [None]:
# One-hot encode the test labels with the encoder fitted on the training labels.
testY = encoder.transform(testY).toarray()
# Sanity check that actually fires: one label row per encoded test sentence.
assert np.shape(testX)[0] == np.shape(testY)[0], "testX and testY row counts differ"
# Per-class scores from the softmax output layer, one row per test sentence.
result = ann2.predict(testX)

In [None]:
# Collapse the one-hot rows and the predicted score rows to class indices.
y_truth = np.argmax(testY, axis = 1)
y_preds = np.argmax(result, axis = 1)

correct_preds = y_preds == y_truth 
# The mean of the boolean mask is the accuracy for any test-set size,
# with no hard-coded sample count.
print(np.mean(correct_preds))





0.838125


In [None]:
def MakePred(s):
  '''
    Predicts the emotion label for one raw sentence.

    The sentence goes through the same steps as the evaluation data:
    CleanFeatures, then TokenizeTestData with the notebook-level
    `tokenizer`, then `ann2.predict`.

    s:       raw sentence (str)
    returns: label (str) of the class with the highest predicted score
  '''
  cleaned = CleanFeatures(np.array([[s]]))
  encoded = TokenizeTestData(cleaned, tokenizer)
  scores = ann2.predict(encoded)
  best = np.argmax(scores, axis = 1)[0]
  # Take the label names from the fitted encoder instead of a hand-written
  # list, so index -> label always follows the one-hot column order the
  # model was trained against.
  return str(encoder.categories_[0][best])



In [None]:
## PRED PIPELINE 

# Hand-written sentences used to eyeball the model's behaviour.
preds = [
         "I had a horrible day today",
         "thats crazy bro",
         "i am not having a good time rn, im so sad",
         "i need to work harder",
         "im having the best day ever",
         "evan is such a great guy",
         "i wish that I had the courage to read more",
         "i wish I believed in myself",
         "im scared",
         "I wish that I could be born in a world where justice was upheld. I wish I could live in a world where I feel the need to prove my worth"
]

for sen in preds:
  # Deliberately not named `result`: that notebook-level name holds the
  # test-set predictions read by the accuracy cell, and overwriting it
  # would break that cell when it is re-run.
  predicted_emotion = MakePred(sen)
  print(f"{sen} -- {predicted_emotion}")


I had a horrible day today -- sadness
thats crazy bro -- sadness
i am not having a good time rn, im so sad -- sadness
i need to work harder -- sadness
im having the best day ever -- joy
evan is such a great guy -- joy
i wish that I had the courage to read more -- sadness
i wish I believed in myself -- fear
im scared -- fear
I wish that I could be born in a world where justice was upheld. I wish I could live in a world where I feel the need to prove my worth -- joy


In [None]:
import pickle 
# Persist the fitted tokenizer to tokenizer.pkl by serialising it straight
# into the open binary file.
with open("tokenizer.pkl", "wb") as pickle_file:
  pickle.dump(tokenizer, pickle_file)

In [None]:
# Scratch check of axis semantics: summing over axis 0 adds the rows together,
# leaving one total per column.
arr = np.array([[1, 1], [2, 2]])

arr.sum(axis = 0)

array([3, 3])

In [None]:
# Shell out to pip3 to list the packages it sees, with their versions.
!pip3 freeze

absl-py==0.12.0
alabaster==0.7.12
albumentations==0.1.12
altair==4.1.0
appdirs==1.4.4
argcomplete==1.12.3
argon2-cffi==21.1.0
arviz==0.11.4
astor==0.8.1
astropy==4.3.1
astunparse==1.6.3
atari-py==0.2.9
atomicwrites==1.4.0
attrs==21.2.0
audioread==2.1.9
autograd==1.3
Babel==2.9.1
backcall==0.2.0
beautifulsoup4==4.6.3
bleach==4.1.0
blis==0.4.1
bokeh==2.3.3
Bottleneck==1.3.2
branca==0.4.2
bs4==0.0.1
CacheControl==0.12.10
cached-property==1.5.2
cachetools==4.2.4
catalogue==1.0.0
certifi==2021.10.8
cffi==1.15.0
cftime==1.5.1.1
chardet==3.0.4
charset-normalizer==2.0.7
click==7.1.2
cloudpickle==1.3.0
cmake==3.12.0
cmdstanpy==0.9.5
colorcet==2.0.6
colorlover==0.3.0
community==1.0.0b1
contextlib2==0.5.5
convertdate==2.3.2
coverage==3.7.1
coveralls==0.5
crcmod==1.7
cufflinks==0.17.3
cvxopt==1.2.7
cvxpy==1.0.31
cycler==0.11.0
cymem==2.0.6
Cython==0.29.24
daft==0.0.4
dask==2.12.0
datascience==0.10.6
debugpy==1.0.0
decorator==4.4.2
defusedxml==0.7.1
descartes==1.1.0
dill==0.3.4
distributed==1.25.3


In [None]:
from pip._internal.utils.misc import get_installed_distributions
import sys
#import numpy as np # imported to test whether numpy shows up, which it does!

def get_imported_packages():
    """Return (name, version) pairs for installed packages that are currently imported.

    A package is reported only when a key of ``sys.modules`` is exactly equal
    to the distribution key, so packages whose import name differs from their
    distribution name are not picked up.

    NOTE(review): ``get_installed_distributions`` is a pip-internal helper and
    may be missing in newer pip releases -- confirm against the pip version
    in use.

    Returns:
        list of ``(name, version)`` tuples, sorted so that the generated
        requirements file is stable between runs.
    """
    installed = {package.key: package.version for package in get_installed_distributions()}

    imported_modules = set(sys.modules.keys())

    # pip itself is never reported; discard() does not raise when it is absent.
    imported_modules.discard('pip')

    return sorted((m, installed[m]) for m in imported_modules if installed.get(m))


def generate_requirements(filepath:str, modules):
    """Write one ``name==version`` line per (name, version) pair to `filepath`.

    The file is opened in write mode, so any existing content is replaced.
    """
    with open(filepath, 'w') as out:
        out.writelines(f"{name}=={release}\n" for name, release in modules)


# Write requirements.txt from the (name, version) pairs that
# get_imported_packages() returns.
generate_requirements('requirements.txt', get_imported_packages())

'2.7.0'