<a href="https://colab.research.google.com/github/WafulaNasombi/cnn-sentiment-analysis/blob/main/cnn.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Reshape
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


## Loading the data

In [None]:
# Load the IMDB reviews CSV. nrows caps the read at 49,000 rows and
# on_bad_lines='skip' drops malformed lines instead of raising on them.
df = pd.read_csv('IMDB Dataset.csv', encoding='UTF-8', on_bad_lines='skip', nrows=49000)
# preview the first rows
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# column names of the loaded frame
df.columns

Index(['review', 'sentiment'], dtype='object')

In [None]:
# show one raw review (row label 1000) before any cleaning
df.loc[1000, 'review']

"This movie is awful, I can't even be bothered to write a review on this garbage! All i will say it is one of the most boring films I've ever seen.<br /><br />And the acting is very bad. The boy who plays the main character really annoys me, he's got the same expression on his face through out the movie. I just want to slap him! Basically 80% of the movie is slow motion shots of skateboarders, weird music, and utter sh*t..<br /><br />Apparently I've got to write at least 10 lines of text to submit this comment, so I'll use up a few more lines by saying the lead character has got one of those faces you just want to slap!<br /><br />Meh i give up..THIS MOVIE SUCKS !!!!"

In [None]:
# class balance: number of reviews per sentiment label
df['sentiment'].value_counts()


Unnamed: 0_level_0,count
sentiment,Unnamed: 1_level_1
positive,24517
negative,24483


## Text Processing

In [None]:
# download the NLTK stop-word corpus
nltk.download('stopwords')
# keep the English stop words in a set so membership tests are cheap
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Inspect the stop words that preprocess_text removes.
# NOTE(review): the set includes negations such as 'not' and 'no'; dropping
# them may discard sentiment signal - confirm this is intended.
print(stop_words)

{'to', 'am', 'between', 'myself', 'it', 'shan', 'not', 'than', 'should', 'them', 'they', 'by', 'out', 'you', 'no', 'same', 'after', 'wasn', 'over', 'had', 'have', 'be', 'all', 'can', 'once', 'ain', 'those', 'only', 'are', "wasn't", 'in', 're', "you'd", 'these', 'few', 'very', "haven't", 'just', 'does', 'itself', 'some', 'mightn', 'weren', 'their', 'haven', 'because', 'her', 'into', 'herself', 'such', 'nor', "hadn't", "shouldn't", 'ours', 've', 'ma', "won't", 'our', 'against', 'down', 'so', "you've", 'hers', 'did', 'at', "should've", 'were', 'didn', 'most', 'this', 'up', 'yours', "doesn't", "hasn't", 'his', 'through', 'above', 'where', 'himself', 'will', 'being', "she's", 'each', 'has', 'she', 'll', 't', "mightn't", 'and', 'is', "isn't", 'hasn', 'other', 'y', 'but', 'theirs', 's', 'until', 'while', 'of', 'whom', 'which', 'why', 'an', 'for', 'now', 'i', "mustn't", 'a', 'with', 'below', 'couldn', 'when', 'been', 'your', 'themselves', 'don', 'do', 'shouldn', 'yourself', 'who', 'if', 'as', 

In [None]:
# Compare two token patterns on the same sample string:
#   \w+  matches runs of word characters, so trailing punctuation is dropped
#   \S+  matches runs of non-whitespace, so punctuation stays attached
example = "hello_phill 123!! abcd"
matches_w = re.compile(r'\w+').findall(example)
matches_s = re.compile(r'\S+').findall(example)
print(matches_w, matches_s, sep='\n')

['hello_phill', '123', 'abcd']
['hello_phill', '123!!', 'abcd']


In [None]:
def preprocess_text(text) -> str:
  """
  Clean a raw review and return it as space-separated lower-case words.

  Steps, in order:
    1. lower-case the text
    2. replace HTML tags, links and @mentions with a space
    3. delete apostrophes (so "can't" stays one token, "cant")
    4. replace every remaining non-letter with a space
    5. drop tokens found in the module-level ``stop_words`` set

  Tags and punctuation are replaced by a space rather than deleted so that
  neighbouring words are not glued together (e.g. "seen.<br /><br />And"
  must not become "seenand"). Links and mentions are removed *before*
  punctuation is stripped, because their patterns rely on '@', ':', '/'
  and '.' still being present.

  NOTE(review): ``stop_words`` contains negations such as 'not' and 'no';
  removing them may hurt sentiment quality - confirm this is intended.

  Args:
    text: raw review text (str).

  Returns:
    The cleaned text; tokens are separated by exactly one space and there
    is no leading or trailing whitespace.
  """
  # lower case
  text = text.lower()
  # html tags -> space, so the words on either side stay separate
  text = re.sub(r'<.*?>', ' ', text)
  # links; the dot after "www" is escaped so it only matches a literal '.'
  text = re.sub(r'http\S+|www\.\S+', ' ', text)
  # mentions
  text = re.sub(r'@\w+', ' ', text)
  # drop apostrophes so contractions remain a single token
  text = text.replace("'", '')
  # every other non-alphabetical character becomes a word boundary
  text = re.sub(r'[^a-z\s]', ' ', text)
  # remove stop words; split()/join also collapses repeated whitespace
  return ' '.join(word for word in text.split() if word not in stop_words)

# clean every review, overwriting the raw text in the 'review' column
df['review'] = df['review'].map(preprocess_text)

In [None]:
# show the same review (row label 1000) after cleaning
df.loc[1000, 'review']

'movie awful cant even bothered write review garbage say one boring films ive ever seenand acting bad boy plays main character really annoys hes got expression face movie want slap basically movie slow motion shots skateboarders weird music utter shtapparently ive got write least lines text submit comment ill use lines saying lead character got one faces want slapmeh give upthis movie sucks'

## Tokenization and padding

In [None]:
# Fit a word-level tokenizer on the cleaned reviews.
# NOTE: word_index lists every distinct word seen while fitting, so the size
# printed below is the full vocabulary and is not limited by num_words=5000.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['review'])
word_index = tokenizer.word_index
print(f'Vocabulary size: {len(word_index)}')

Vocabulary size: 211599


In [None]:
# Convert each review to a list of word indices. The lists still differ in
# length here; padding afterwards gives the CNN its fixed-length input.
X = tokenizer.texts_to_sequences(df['review'])

In [None]:
# pad (or truncate) every sequence to exactly max_len entries
max_len = 500
X = pad_sequences(X, maxlen=max_len)

In [None]:
# encode labels as a numpy array: positive -> 1, negative -> 0
# (a label outside this mapping would come out as NaN)
y = df['sentiment'].map({'positive': 1, 'negative': 0}).values

## Split data into training and test

In [None]:
# hold out 20% of the samples for testing; random_state fixes the shuffle
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Building the CNN model

In [None]:
# sanity check: (number of training reviews, padded sequence length)
print(X_train.shape)

(39200, 500)


In [None]:
# start from an empty Sequential model
model = Sequential()

In [None]:
# Build the whole network in a single expression so that re-running this cell
# always produces a fresh model instead of appending a second copy of every
# layer to the existing one.
model = Sequential([
  # Embedding layer to learn word embeddings. input_dim is derived from the
  # tokenizer's num_words cap (+1 keeps the original table size of 5001).
  # The deprecated `input_length` argument is omitted: the layer works on
  # whatever sequence length it is fed.
  Embedding(input_dim=tokenizer.num_words + 1, output_dim=128),

  # 1D convolutional layer: 128 filters sliding over windows of 3 tokens
  Conv1D(filters=128, kernel_size=3, activation='relu'),

  # GlobalMaxPooling to reduce dimensionality (one value per filter)
  GlobalMaxPooling1D(),

  # fully connected layers
  Dense(10, activation='relu'),
  Dense(1, activation='sigmoid'),  # output layer for binary classification
])

# compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])



## Train the CNN Model

In [None]:
# Validate on a slice of the training data rather than on the test set, so the
# test set stays untouched until the final evaluation.
# Early stopping is added because the recorded run shows val_loss rising after
# epoch 2 while the training loss keeps falling (overfitting); the weights
# from the best epoch are restored when training stops.
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)
history = model.fit(
  X_train,
  y_train,
  epochs=5,
  batch_size=64,
  validation_split=0.1,
  callbacks=[early_stopping],
)

Epoch 1/5
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 329ms/step - accuracy: 0.7634 - loss: 0.4751 - val_accuracy: 0.8765 - val_loss: 0.2922
Epoch 2/5
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m276s[0m 353ms/step - accuracy: 0.9257 - loss: 0.1992 - val_accuracy: 0.8826 - val_loss: 0.2848
Epoch 3/5
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m262s[0m 354ms/step - accuracy: 0.9768 - loss: 0.0883 - val_accuracy: 0.8861 - val_loss: 0.3166
Epoch 4/5
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m281s[0m 385ms/step - accuracy: 0.9965 - loss: 0.0252 - val_accuracy: 0.8832 - val_loss: 0.3889
Epoch 5/5
[1m613/613[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 317ms/step - accuracy: 0.9998 - loss: 0.0053 - val_accuracy: 0.8824 - val_loss: 0.4450


## Evaluating the Model

In [None]:
# score the model on the test split and report both metrics
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss:.4f}', f'Test Accuracy: {accuracy * 100:.2f}%', sep='\n')

[1m307/307[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 43ms/step - accuracy: 0.8768 - loss: 0.4518
Test Loss: 0.4450
Test Accuracy: 88.24%


## Make a prediction on custom input

In [None]:
def user_review():
  """Prompt on standard input for a review and return the typed text as-is."""
  return input('Enter a review:\n')

In [None]:
def predict_sentiment(review=None):
  """
  Classify a single review as 'Positive' or 'Negative' with the trained model.

  Args:
    review: raw review text. When omitted (None) the text is read
      interactively via user_review(), which is the original behaviour.
      Passing a string lets the function run without a prompt, e.g. under
      Restart & Run All.

  Returns:
    'Positive' if the model output is above 0.5, otherwise 'Negative'.
  """
  if review is None:
    review = user_review()
  # clean the input with the same function used on the training reviews
  cleaned = preprocess_text(review)
  # tokenize and pad the input to the length the model was trained on
  sequence = tokenizer.texts_to_sequences([cleaned])
  padded_sequence = pad_sequences(sequence, maxlen=max_len)
  # one input row and one sigmoid unit, so take element [0][0]
  prediction = model.predict(padded_sequence)[0][0]

  # interpret the prediction
  return 'Positive' if prediction > 0.5 else 'Negative'

In [None]:
# prompt for a review and display the predicted label
predict_sentiment()

Enter a review:
That movie was awesome! From the directors to the actors, those guys knew what they were doing! kudos
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 96ms/step


'Positive'

In [None]:
# try a second, mixed-sentiment review
predict_sentiment()

Enter a review:
I love and hate this movie ...has good artist though script is shady
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step


'Positive'

## Save the model

In [None]:
# write the trained model to 'my_model.keras'
model.save('my_model.keras')

## Extended Explanation

### Stemming

In [None]:
from nltk.stem import PorterStemmer

# stem a handful of sample words, one result per line
stemmer = PorterStemmer()
print(
  *(stemmer.stem(sample) for sample in ('running', 'better', 'studies', 'studying')),
  sep='\n',
)

run
better
studi
studi


### Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

# WordNet data used by the lemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()
# Lemmatize the sample words, one result per line. The first two pass an
# explicit part of speech; the last two rely on the lemmatizer's default.
print(
  *(
    lemmatizer.lemmatize(sample, **pos_kwargs)
    for sample, pos_kwargs in (
      ('running', {'pos': 'v'}),
      ('better', {'pos': 'a'}),
      ('studies', {}),
      ('studying', {}),
    )
  ),
  sep='\n',
)

run
good
study
studying


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


## Loading the Saved Model

In [None]:
from tensorflow.keras.models import load_model

In [None]:
# load the model back from 'my_model.keras' into a separate variable
my_model = load_model('my_model.keras')

  saveable.load_own_variables(weights_store.get(inner_path))


## Using the Loaded Model for Predictions

In [None]:
# example data
input_review = 'That movie was awesome! From the directors to the actors, those guys knew what they were doing! kudos'

# clean the text with the same function used on the training reviews
input_review = preprocess_text(input_review)

# tokenize and pad the input
sequence = tokenizer.texts_to_sequences([input_review])
padded_input = pad_sequences(sequence, maxlen=max_len)

# one input row and one output unit, so take element [0][0]
prediction = my_model.predict(padded_input)[0][0]

# scores above 0.5 are reported as positive
print('Positive' if prediction > 0.5 else 'Negative')

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Positive


## Challenge

* Instead of padded token sequences and a CNN, use TF-IDF features with a Naive Bayes classifier.