<a href="https://colab.research.google.com/github/SOUMEE2000/ML-guidelines/blob/main/NLP_using_Deep_Learning(Tensorflow).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Reference URL kept as this notebook's source (presumably the tutorial the examples follow — confirm)
source="https://towardsdatascience.com/natural-language-processing-with-tensorflow-e0a701ef5cef"

## **Examples of using TensorFlow**

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [None]:
# Two toy sentences used to build the demo vocabulary
sentences = [
    "how are you",
    "you are so nice",
]

# num_words=100 caps encoding at the most frequent words; each word gets an integer code
tokenizer = Tokenizer(num_words=100)
tokenizer.fit_on_texts(sentences)

# Show the word -> integer code mapping learned from `sentences`
print(tokenizer.word_index)

{'are': 1, 'you': 2, 'how': 3, 'so': 4, 'nice': 5}


In [None]:
# Encode each fitted sentence as a list of integer codes taken from `word_index`
tokenizer.texts_to_sequences(sentences)

[[3, 1, 2], [2, 1, 4, 5]]

In [None]:
# These sentences contain words ("not", "but", "who", "cares") that were not in
# the fitted vocabulary. With no OOV token configured, such words are dropped
# from the encoded sequences (the second sentence encodes to an empty list).
sentences1=["you are not so nice","but who cares"]
tokenizer.texts_to_sequences(sentences1)

[[2, 1, 4, 5], []]

In [None]:
# Rebuild the tokenizer with an out-of-vocabulary placeholder so unseen words
# are encoded as "<oov>" instead of disappearing from the sequence
tokenizer = Tokenizer(
    num_words=100,
    oov_token="<oov>",
)
tokenizer.fit_on_texts(sentences)

# Vocabulary now includes the "<oov>" entry
print(tokenizer.word_index)
# Unseen words in `sentences1` map to the "<oov>" code
print(tokenizer.texts_to_sequences(sentences1))

{'<oov>': 1, 'are': 2, 'you': 3, 'how': 4, 'so': 5, 'nice': 6}
[[3, 2, 1, 5, 6], [1, 1, 1]]


# **IMDB Dataset Sentiment Analysis**

In [None]:
!unzip IMDB.zip

Archive:  IMDB.zip
  inflating: IMDB Dataset.csv        


In [None]:
import pandas as pd
# Load the extracted CSV; later cells read its `review` and `sentiment` columns
df=pd.read_csv("IMDB Dataset.csv")

In [None]:
# Encode the sentiment label as a binary target: positive -> 1, negative -> 0.
label_codes = df["sentiment"].map({"positive": 1, "negative": 0})

# Fail loudly on anything that is not "positive"/"negative" (including missing
# values) instead of silently skipping rows and ending up misaligned.
unexpected = df.loc[label_codes.isna(), "sentiment"].unique()
if len(unexpected) > 0:
    raise ValueError(f"Unexpected sentiment labels: {list(unexpected)}")

# Kept as a plain list of ints, as before, alongside the new DataFrame column
feature = label_codes.astype(int).tolist()
df["feature"] = feature

In [None]:
# Keep only letters and '#'; every other character becomes a space, then
# lower-case the text.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the reviews completely unchanged.
df['review_processed'] = (
    df['review']
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .str.lower()
)

# Removing Stopwords Begin
import nltk
# Tokenizer data for word_tokenize: older NLTK releases load 'punkt', newer
# ones look for 'punkt_tab'. Requesting both keeps the cell working on either
# (an unavailable package only prints a message).
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk import word_tokenize

# Making custom list of words to be removed
add_words = ['movie','film','one','make','even','the']

# A set gives constant-time membership checks; the stop-word filter tests
# every token of every review against this collection.
stop_words = set(stopwords.words('english'))
stop_words.update(add_words)

# Function to remove stop words 
def remove_stopwords(rev):
    """Return `rev` with every token found in `stop_words` removed.

    The text is split with `word_tokenize`; the surviving tokens are joined
    back together with single spaces.
    """
    kept_tokens = []
    for token in word_tokenize(rev):
        if token in stop_words:
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)

# Strip stop words from every cleaned review
df['review_processed'] = df['review_processed'].map(remove_stopwords)

# Drop very short words (one or two characters long)
df['review_processed'] = df['review_processed'].map(
    lambda text: ' '.join(word for word in text.split() if len(word) > 2)
)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Modules for TF-IDF vectorisation and the train/test split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# TF-IDF vectorizer limited to 2500 features
cv = TfidfVectorizer(max_features=2500)

# Fit on the processed reviews and convert the sparse result to a dense array
# NOTE(review): the vectorizer is fitted on all reviews before the split below,
# so held-out reviews influence the vocabulary/IDF weights — consider fitting
# on the training portion only.
tfidf_sparse = cv.fit_transform(df["review_processed"])
X = tfidf_sparse.toarray()
y = df["feature"].values

# Hold out 20% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [None]:
# Inspect the shape of the training labels before reshaping them
y_train.shape

(40000,)

In [None]:
# Labels become a column vector (n_samples, 1); features are kept as a 2-D
# (n_samples, n_features) matrix. Sizes are inferred from the data instead of
# hard-coding 40000 / 2500, so the cell still works if the split or
# max_features changes.
y_train = y_train.reshape(-1, 1)
X_train = X_train.reshape(len(X_train), -1)

In [None]:
# Display the reshaped training labels (long arrays are shown abbreviated)
y_train

array([[[0],
        [0],
        [1],
        ...,
        [1],
        [0],
        [1]]])

In [None]:
# Import Keras under its own name: aliasing it as `tf` would rebind the name
# that already refers to the top-level tensorflow package.
from tensorflow import keras

# Simple feed-forward classifier on top of the TF-IDF features
model = keras.models.Sequential()
model.add(keras.layers.Flatten())
model.add(keras.layers.Dense(784, activation="sigmoid"))  # hidden layer
# Output layer: 2 units because there are two classes (positive / negative)
model.add(keras.layers.Dense(2, activation="softmax"))

# Adam optimizer; sparse categorical cross-entropy is used with the integer
# 0/1 class labels produced earlier.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

In [None]:
# Fit the network on the training split: 14 epochs, mini-batches of 35 rows.
# NOTE(review): `fit` presumably returns a training-history object rather than
# the model itself, so the name `trained_model` may mislead — confirm before
# using it for prediction (later cells correctly call `model.predict`).
trained_model= model.fit(X_train, y_train, epochs=14, batch_size=35)

In [None]:
# Predictions for the held-out reviews; the output layer has 2 softmax units,
# so each row holds one score per class
ypred = model.predict(X_test)

In [None]:
# `ypred` holds, for each review, the probabilities associated with each class.
# We want the larger of the two so that we know which class is more probable.
# (Inspect `ypred` here: `y_pred` is only created in the next cell, so
# referencing it at this point fails on a fresh kernel.)
ypred[0]

In [None]:
import numpy as np

# For every row of class probabilities, take the index of the largest value;
# that index is the predicted class (0 = negative, 1 = positive).
# Vectorised over all rows at once instead of looping in Python.
y_pred = np.argmax(ypred, axis=1)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Confusion matrix of true vs. predicted labels on the test split
cm = confusion_matrix(y_test, y_pred)
# Share of test reviews whose predicted label matches the true label
accuracy = accuracy_score(y_test, y_pred)

print(cm)
print("The model accuracy is", accuracy)

[[4539  496]
 [ 757 4208]]
The model accuracy is 0.8747


# **Some Results**

In [None]:
# Accuracy (%) obtained with different architectures; D = Dense(units, activation)
# (flatten, (D,784,relu),(D,2,softmax))    87.84%
# (flatten, (D,10, relu),(D,784,relu),(D,2,softmax))    85.1%
# (flatten, (D,784,relu),(D,2,sigmoid))    87.88%
# (flatten, (D,784,sigmoid),(D,2,softmax))    87.74%