In [1]:
import gensim.downloader as api

In [2]:
glv=api.load("glove-wiki-gigaword-50")



In [3]:
!kaggle datasets download lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 19% 5.00M/25.7M [00:00<00:00, 35.0MB/s]
100% 25.7M/25.7M [00:00<00:00, 115MB/s] 


In [4]:
!unzip -o imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

In [22]:
print(np.__version__)
print(pd.__version__)
print(tf.__version__)

1.26.4
2.1.4
2.17.0


In [6]:
df=pd.read_csv("/content/IMDB Dataset.csv")

In [None]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [7]:
df.sentiment=df.sentiment.map({"positive":1,"negative":0})

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [9]:
# Install SpaCy
!pip install -U spacy

# Install the en_core_web_lg model
!python -m spacy download en_core_web_lg


Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [10]:
df.shape

(50000, 2)

In [None]:
df.sentiment.value_counts()

sentiment
1    25000
0    25000
Name: count, dtype: int64

In [11]:
import re
def remove_html_tags(text):
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)
df['review']=df['review'].apply(remove_html_tags)

In [14]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)#+ for one or more of a character here it is space
    return text.strip().lower()

In [12]:
def remove_whitespace(text):
    return  " ".join(text.split())
df['review']=df['review'].apply(remove_whitespace)


In [13]:
import inflect

# Assuming you want to replace numbers with words in the 'email' column
p = inflect.engine()
def replace_numbers_with_words(text):
    words = []
    for word in text.split():
        # Check if the word is a number
        if word.isdigit():
            # Convert the number to its word representation
            words.append(p.number_to_words(word))
        else:
            words.append(word)
    # Join the words back into a single string
    return ' '.join(words)

# Apply the function to the 'email' column
df['review']=df['review'].apply(replace_numbers_with_words)


In [20]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df['review'])
sequences = tokenizer.texts_to_sequences(df['review'])

In [None]:
tokenizer.word_index

## FOR MACHINE LEARNING PART WHERE KUCH NHI LGTA VOCAB LENGTH,YE SAB SO DO JUST YE
## WALA FUNCTION

In [21]:
import pickle
with open('tokenizer.pkl', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [18]:

import spacy
nlp = spacy.load("en_core_web_lg") # if this fails then run "python -m spacy download en_core_web_lg" to download that model

def preprocess_and_vectorize(text):
    # remove stop words and lemmatize the text
    doc = nlp(text)
    filtered_tokens = []
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        filtered_tokens.append(token.lemma_)#lemma means lemmatisation


    return glv.get_mean_vector(filtered_tokens)



In [23]:
#this query takes few minutes, so go get some walk :)

df['vector'] = df['review'].apply(lambda text: preprocess_and_vectorize(text))

In [25]:
from sklearn.model_selection import train_test_split


#Do the 'train-test' splitting with test size of 20% with random state of 2022 and stratify sampling too
X_train, X_test, y_train, y_test = train_test_split(
    df.vector.values,
    df.sentiment,
    test_size=0.2, # 20% samples will go to test dataset
    random_state=2022,
    stratify=df.sentiment
          )

In [26]:
print("Shape of X_train before reshaping: ", X_train.shape)
print("Shape of X_test before reshaping: ", X_test.shape)


X_train_2d = np.stack(X_train)
X_test_2d =  np.stack(X_test)
# as ml model take 2D
print("Shape of X_train after reshaping: ", X_train_2d.shape)
print("Shape of X_test after reshaping: ", X_test_2d.shape)

Shape of X_train before reshaping:  (40000,)
Shape of X_test before reshaping:  (10000,)
Shape of X_train after reshaping:  (40000, 50)
Shape of X_test after reshaping:  (10000, 50)


In [27]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

#1. creating a GradientBoosting model object
clf = GradientBoostingClassifier()

#2. fit with all_train_embeddings and y_train
clf.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred = clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      5000
           1       0.76      0.76      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000



In [64]:
import pickle
with open('moviereview.pkl','wb') as file:
  pickle.dump(clf,file)


In [62]:
test_news = [
    "great film"
]

test_news_vectors = [preprocess_and_vectorize(n) for n in test_news]
clf.predict(test_news_vectors)

array([1])

In [71]:
from xgboost import XGBClassifier
xgb= XGBClassifier()

#2. fit with all_train_embeddings and y_train
xgb.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred= clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      5000
           1       0.76      0.76      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000

[[3831 1169]
 [1202 3798]]


In [72]:
print(type(xgb))


<class 'xgboost.sklearn.XGBClassifier'>


In [74]:
test_news = [
    "great film"
]

test_news_vectors = [preprocess_and_vectorize(n) for n in test_news]
xgb.predict(test_news_vectors)

array([1])

In [75]:
from sklearn.ensemble import RandomForestClassifier
rf= RandomForestClassifier()

#2. fit with all_train_embeddings and y_train
rf.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
y_pred= clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      5000
           1       0.76      0.76      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000

[[3831 1169]
 [1202 3798]]


In [76]:
test_news = [
    "great film"
]

test_news_vectors = [preprocess_and_vectorize(n) for n in test_news]
rf.predict(test_news_vectors)

array([1])

In [34]:
from sklearn.tree import DecisionTreeClassifier
rf=  DecisionTreeClassifier()

#2. fit with all_train_embeddings and y_train
rf.fit(X_train_2d, y_train)


#3. get the predictions for all_test_embeddings and store it in y_pred
rf= clf.predict(X_test_2d)


#4. print the classfication report
print(classification_report(y_test, y_pred))

from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

              precision    recall  f1-score   support

           0       0.76      0.77      0.76      5000
           1       0.76      0.76      0.76      5000

    accuracy                           0.76     10000
   macro avg       0.76      0.76      0.76     10000
weighted avg       0.76      0.76      0.76     10000

[[3831 1169]
 [1202 3798]]


In [33]:
import pickle
with open('moviereview.pkl','wb') as file:
  pickle.dump(rf,file)


FileNotFoundError: [Errno 2] No such file or directory: 'moviereview.pklrb'

AttributeError: 'numpy.ndarray' object has no attribute 'predict'

In [None]:
max_length = max(len(seq) for seq in sequences)  # Maximum length of sequences
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size

In [None]:
vocab_size,max_length

(125344, 2466)

In [None]:
X = pad_sequences(sequences, maxlen=max_length)
len(X)

50000

In [None]:
embedding_dim = 50
embedding_matrix = np.zeros((vocab_size, embedding_dim))

word_index = tokenizer.word_index
for word, i in word_index.items():
    if word in glv:
        embedding_matrix[i] = glv[word]

# Define the model
model = Sequential()
model.add(Embedding(input_dim=vocab_size,
                    output_dim=embedding_dim,
                    weights=[embedding_matrix],
                    input_length=max_length,
                    trainable=False))  # Use pre-trained embeddings as-is
model.add(LSTM(units=64))
model.add(Dense(1, activation='sigmoid'))

In [None]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Example training data
y = np.array(df['sentiment'])
model.fit(X, y, epochs=20,batch_size=128)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.src.callbacks.History at 0x790d463bbe80>

In [None]:

glv["boring"]

array([-0.035674, -0.41765 , -0.44104 , -0.35455 , -0.28748 , -0.25642 ,
        0.13326 ,  0.020065, -0.66321 ,  0.12049 , -1.0205  , -0.029451,
       -0.25333 ,  0.56927 ,  0.51631 ,  0.11365 ,  0.33299 ,  1.074   ,
        0.070584, -0.75984 , -0.2908  ,  0.56506 ,  0.54392 ,  0.66245 ,
        1.1402  , -0.56075 , -0.99398 ,  1.1002  ,  1.1428  , -0.1684  ,
        1.3748  , -0.45543 ,  0.38396 , -0.011839, -0.087978,  0.32026 ,
        0.28684 ,  0.88902 , -0.18163 , -0.54099 , -0.16129 , -0.38781 ,
        0.042476,  1.313   ,  0.20165 ,  0.12567 ,  0.37838 , -0.041809,
        0.37791 ,  0.98184 ], dtype=float32)

In [None]:
glv.most_similar("good")

[('better', 0.9284391403198242),
 ('really', 0.9220623970031738),
 ('always', 0.9165270924568176),
 ('sure', 0.903351366519928),
 ('something', 0.9014206528663635),
 ('think', 0.8982065320014954),
 ('way', 0.8953989744186401),
 ('thing', 0.894504964351654),
 ('little', 0.8941226005554199),
 ('very', 0.8919912576675415)]

In [None]:
df['review'].iloc[3]

"Basically there's a family where a little boy (Jake) thinks there's a zombie in his closet & his parents are fighting all the time.This movie is slower than a soap opera... and suddenly, Jake decides to become Rambo and kill the zombie.OK, first of all when you're going to make a film you must Decide if its a thriller or a drama! As a drama the movie is watchable. Parents are divorcing & arguing like in real life. And then we have Jake with his closet which totally ruins all the film! I expected to see a BOOGEYMAN similar movie, and instead i watched a drama with some meaningless thriller spots.3 out of ten just for the well playing parents & descent dialogs. As for the shots with Jake: just ignore them."

In [None]:
def preprocess_text(text, tokenizer, max_length):

    sequence = tokenizer.texts_to_sequences([text])

    padded_sequence = pad_sequences(sequence, maxlen=max_length)
    return padded_sequence

def predict_sentiment(text, model, tokenizer, max_length):

    preprocessed_text = preprocess_text(text, tokenizer, max_length)

    prediction = model.predict(preprocessed_text)

    sentiment = 'Positive' if prediction[0] > 0.5 else 'Negative'
    return sentiment

user_input = input("Enter your movie review: ")
sentiment = predict_sentiment(user_input, model, tokenizer, max_length)
print(f"The sentiment of the review is: {sentiment}")

Enter your movie review: worst
The sentiment of the review is: Negative


In [None]:
model.save("movie_review.keras")

In [None]:
from tensorflow.keras.models import load_model

# Load the model from the .keras file
loaded_model = load_model("movie_review.keras")


In [None]:
def preprocess_text(text, tokenizer, max_length):

    sequence = tokenizer.texts_to_sequences([text])

    padded_sequence = pad_sequences(sequence, maxlen=max_length)
    return padded_sequence

def predict_sentiment(text, model, tokenizer, max_length):

    preprocessed_text = preprocess_text(text, tokenizer, max_length)

    prediction = loaded_model.predict(preprocessed_text)

    sentiment = 'Positive' if prediction[0] > 0.5 else 'Negative'
    return sentiment

user_input = input("Enter your movie review: ")
sentiment = predict_sentiment(user_input, model, tokenizer, max_length)
print(f"The sentiment of the review is: {sentiment}")

Enter your movie review: movie was awesome
The sentiment of the review is: Positive


In [None]:
from google.colab import files
files.download('movie_review.keras')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>