In [None]:
# Fetch the IMDB 50k movie-review archive with the Kaggle CLI; the zip is written to the working directory.
# NOTE(review): presumably requires Kaggle API credentials to be configured beforehand — confirm.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
Downloading imdb-dataset-of-50k-movie-reviews.zip to /content
 97% 25.0M/25.7M [00:00<00:00, 82.5MB/s]
100% 25.7M/25.7M [00:00<00:00, 71.4MB/s]


In [None]:
!unzip imdb-dataset-of-50k-movie-reviews.zip

Archive:  imdb-dataset-of-50k-movie-reviews.zip
  inflating: IMDB Dataset.csv        


In [None]:
import pandas as pd

# Read the extracted review table into memory, then preview the first rows.
csv_path = 'IMDB Dataset.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [None]:
# Encode the string labels as integers: positive -> 1, negative -> 0.
# The dtype guard makes the cell safe to re-run: mapping an already-encoded
# (numeric) column through this dict would turn every label into NaN.
if not pd.api.types.is_numeric_dtype(df['sentiment']):
  df['sentiment']=df['sentiment'].map({'positive':1,'negative':0})
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [None]:
import numpy as np
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
# Sentence/word tokenizer data for word_tokenize. Recent NLTK releases load the
# 'punkt_tab' resource instead of the pickled 'punkt' models, so fetch both to
# stay compatible with old and new versions.
nltk.download('punkt')
nltk.download('punkt_tab')
# Stop-word lists used when cleaning the reviews (kept last so the cell's
# displayed return value is still the one from this call).
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
import re
from nltk.stem import WordNetLemmatizer
# Download the WordNet corpus (the WordNetLemmatizer imported above relies on it).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
lemmatizer=WordNetLemmatizer()
# Built once at definition time. The original rebuilt this set from the NLTK
# corpus for every single token, which dominated the runtime on 50k reviews.
_ENGLISH_STOPWORDS=frozenset(stopwords.words('english'))

def proprocess_text(text):
  """Normalise one raw review into a space-joined string of lemmas.

  Steps, in order: lowercase the text, strip HTML tags, replace every
  character that is not a letter or whitespace with a space, tokenise with
  NLTK, drop English stop words and lemmatise the remaining tokens.

  Args:
    text: raw review string.

  Returns:
    The cleaned review as a single string of tokens separated by spaces.
  """
  text=text.lower()
  # Replace tags with a space (not the empty string) so that text such as
  # "great<br />movie" does not fuse into the single token "greatmovie".
  text=re.sub(r'<.*?>',' ',text)
  # Raw string: '\s' in a normal string literal is an invalid escape sequence.
  text=re.sub(r'[^a-zA-Z\s]',' ',text)
  words=nltk.word_tokenize(text)
  words=[lemmatizer.lemmatize(word) for word in words if word not in _ENGLISH_STOPWORDS]
  return ' '.join(words)



In [None]:
# Clean every raw review and store the result next to the original text.
cleaned_reviews = df['review'].apply(proprocess_text)
df['cleaning_review'] = cleaned_reviews

In [None]:
def handle_negations(text):
  """Rewrite stand-alone negation words as the single token 'not'.

  Whole-word occurrences of 'no', 'never' and 'ont' are replaced; words that
  merely contain them (e.g. 'nothing') are left untouched.

  Args:
    text: input string.

  Returns:
    The string with the matched words replaced by 'not'.
  """
  # Fix: the original alternative r"\bno/b" had a mistyped word boundary, so it
  # matched the literal text "no/b" and never the word "no".
  # NOTE(review): 'ont' is kept exactly as in the original pattern; it looks
  # like a typo for another word — confirm the intended spelling.
  return re.sub(r"\b(?:ont|no|never)\b",'not',text)


In [None]:
# Collapse negation words in the already-cleaned text and write the result back to the same column.
negation_normalised = df['cleaning_review'].apply(handle_negations)
df['cleaning_review'] = negation_normalised

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from collections.abc import Sequence

In [None]:
from collections.abc import Sequence
# Import the class under an alias. The original rebound the name `Tokenizer`
# to an instance, so running this cell a second time failed because the
# instance is not callable.
from tensorflow.keras.preprocessing.text import Tokenizer as KerasTokenizer

# Fit the vocabulary on the cleaned reviews (num_words=5000 caps it by word
# frequency), convert each review to a list of word ids, then pad/truncate
# every list to exactly 100 ids.
tokenizer=KerasTokenizer(num_words=5000)
tokenizer.fit_on_texts(df['cleaning_review'])
sequences=tokenizer.texts_to_sequences(df['cleaning_review'])
x=pad_sequences(sequences,maxlen=100)

# Backward-compatible aliases: other cells refer to the fitted tokenizer as
# `Tokenizer`, and the original cell bound the id lists to `Sequence`.
Tokenizer=tokenizer
Sequence=sequences

In [None]:
# Sanity check of the padded feature matrix: (number of reviews, padded length).
x.shape

(50000, 100)

In [None]:
# Target vector: the sentiment column of the review table.
y=df['sentiment']
# Should have one entry per row of x.
y.shape

(50000,)

In [None]:
from sklearn.model_selection import train_test_split #Fixed typo in module name

In [None]:
# Hold out 20% of the samples for testing; the fixed random_state makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Display the four shapes as a quick consistency check.
(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

((40000, 100), (10000, 100), (40000,), (10000,))

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,LSTM,Dense,Dropout,Input

In [None]:
# Two stacked LSTMs over learned word embeddings, ending in a single sigmoid
# unit for the binary sentiment label.
model=Sequential([
    # 100 token ids per review, matching maxlen=100 used in pad_sequences.
    Input(shape=(100,)),
    # Vocabulary of 5000 ids mapped to 100-dimensional vectors. The
    # `input_length` argument was dropped: it is deprecated in Keras 3 and
    # redundant here because the Input layer already fixes the sequence length.
    Embedding(5000,100),
    # return_sequences=True passes the full sequence on to the second LSTM.
    LSTM(128,return_sequences=True),
    Dropout(0.4),
    LSTM(64),
    Dropout(0.2),
    Dense(1,activation='sigmoid')
])



In [None]:
# Print the layer-by-layer architecture of the model defined above.
model.summary()

In [None]:
from tensorflow.keras.optimizers import Adam


In [None]:
# Binary classification set-up: Adam optimiser, binary cross-entropy loss, accuracy as the reported metric.
optimizer = Adam(learning_rate=0.001)
model.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

In [None]:
from tensorflow.keras.callbacks import EarlyStopping

In [None]:
# Stop training once validation loss has not improved for 3 consecutive epochs,
# and restore the weights from the best epoch.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:
# Train for at most 10 epochs, holding out 20% of the training set for validation;
# early stopping decides the actual number of epochs.
# NOTE(review): the saved log starts at ~92% training accuracy in epoch 1, which
# suggests this cell was re-run on an already-trained model — rebuild the model
# before fitting if a clean run is wanted.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
)

Epoch 1/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 13ms/step - accuracy: 0.9236 - loss: 0.2022 - val_accuracy: 0.8685 - val_loss: 0.3179
Epoch 2/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 13ms/step - accuracy: 0.9440 - loss: 0.1574 - val_accuracy: 0.8627 - val_loss: 0.3749
Epoch 3/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 13ms/step - accuracy: 0.9607 - loss: 0.1173 - val_accuracy: 0.8530 - val_loss: 0.3977
Epoch 4/10
[1m1000/1000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 12ms/step - accuracy: 0.9667 - loss: 0.1013 - val_accuracy: 0.8553 - val_loss: 0.4607


In [None]:
# Score the trained model on the held-out test split; evaluate() yields the
# loss followed by the compiled metric (accuracy).
test_metrics = model.evaluate(x_test, y_test)
loss, accuracy = test_metrics
print(f'loss:{loss},accuracy:{accuracy}')

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.8697 - loss: 0.3097
loss:0.3059264123439789,accuracy:0.8726000189781189


In [None]:
from sklearn.metrics import classification_report
# Turn the sigmoid scores into hard 0/1 labels with a 0.5 threshold.
y_pred=model.predict(x_test)
y_pred=(y_pred>0.5).astype(int)
# Fix: the heading was "\classification Report:". "\c" is not an escape
# sequence, so the backslash was printed literally instead of a newline.
print("\nClassification Report:")
print(classification_report(y_test,y_pred))

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step
\classification Report:
              precision    recall  f1-score   support

           0       0.88      0.86      0.87      4961
           1       0.87      0.89      0.88      5039

    accuracy                           0.87     10000
   macro avg       0.87      0.87      0.87     10000
weighted avg       0.87      0.87      0.87     10000



In [None]:
predection=model.predict(x_test)
# Fix: the model ends in a single sigmoid unit, so `predection` has one column
# and np.argmax(predection, axis=1) returned 0 for every sample. Threshold the
# score instead; ravel() keeps the 1-D shape that argmax produced.
y_pred=(predection>0.5).astype(int).ravel()
# NOTE(review): this displays the full label Series `y`; `y_pred` may have been
# intended — confirm.
y

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step


Unnamed: 0,sentiment
0,1
1,1
2,1
3,0
4,1
...,...
49995,1
49996,0
49997,0
49998,0


In [None]:
# Persist the trained model to 'model.h5' in the working directory; later cells reload it from this path.
tf.keras.models.save_model(model,'model.h5')



In [None]:
from google.colab import files
# Send the saved model file to the browser as a download (Colab-specific helper).
files.download('model.h5')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import tensorflow as tf
# Reload the model from disk; this rebinds the global `model` used by the prediction helpers.
model=tf.keras.models.load_model('model.h5')



In [None]:
import pickle

# `Tokenizer` refers to the fitted tokenizer instance here (the name was
# rebound when the tokenizer was fitted). Expose it under a lowercase name
# and persist that same object for the inference cells.
tokenizer = Tokenizer
with open('tokenizer.pkl', mode='wb') as tokenizer_file:
  pickle.dump(tokenizer, tokenizer_file)

In [None]:
def preprocess_given_text(text,tokenizer):
  """Turn one raw review into the padded id sequence the model expects.

  The text goes through the same cleaning as the training data
  (proprocess_text, then handle_negations) before it is tokenised. The
  original skipped this step, so inference inputs did not match the text the
  tokenizer and the model were fitted on.

  Args:
    text: raw review string.
    tokenizer: fitted tokenizer exposing texts_to_sequences.

  Returns:
    Array of shape (1, 100) holding the word ids of the review, padded or
    truncated to length 100.
  """
  cleaned_text=handle_negations(proprocess_text(text))
  sequence=tokenizer.texts_to_sequences([cleaned_text])
  padded_sequence=pad_sequences(sequence,maxlen=100)
  return padded_sequence

In [None]:
def predict_sentiment(text):
  """Describe the predicted sentiment of one review as a sentence.

  Uses the module-level `tokenizer` and `model`.

  Args:
    text: review string.

  Returns:
    "Positive sentiment with confidence X%" when the model's score is above
    50%, otherwise "Negative sentiment with confidence Y%" where Y is the
    complement of the score.
  """
  model_input=preprocess_given_text(text,tokenizer)
  scores=model.predict(model_input)
  # Single sigmoid output for the single input row, scaled to a percentage.
  positive_confidence=scores[0][0]*100
  if positive_confidence > 50 :
    return f"Positive sentiment with confidence {positive_confidence:.2f}%"
  return f"Negative sentiment with confidence {100- positive_confidence:.2f}%"

In [None]:
# Smoke test of the prediction helper on a clearly positive sentence.
predict_sentiment("I loved every minute of this film; it was truly a work of art.")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 196ms/step


'Positive sentiment with confidence 88.59%'

In [None]:
!pip install gradio



In [None]:
import gradio as gr
import pickle
import matplotlib.pyplot as plt
# Reload the saved model from disk for the web UI.
model=tf.keras.models.load_model('model.h5')

# Restore the tokenizer saved earlier in this notebook.
# NOTE: pickle.load can execute arbitrary code — only load files this notebook wrote itself.
with open('tokenizer.pkl','rb') as f:
  tokenizer=pickle.load(f)

def classify_sentiment(text):
  """Classify one review for the Gradio interface.

  Uses the module-level `tokenizer` and `model`.

  Args:
    text: review string entered by the user.

  Returns:
    Tuple (sentiment, positive_confidence, negative_confidence). `sentiment`
    is "Positive" or "Negative" when that class scores above 70%, otherwise
    "Neutral". The two confidences are percentages formatted as strings with
    two decimals.
  """
  # Reuse the shared helper so the UI and predict_sentiment prepare text the same way.
  padded_sequence=preprocess_given_text(text,tokenizer)
  prediction=model.predict(padded_sequence)

  positive_confidence= prediction[0][0]*100
  negative_confidence= (1 - prediction[0][0]) * 100

  if positive_confidence > 70 :
    sentiment="Positive"
  elif negative_confidence > 70 :
    sentiment="Negative"
  else:
    # Neither class is confident enough. The original had a separate 40-60%
    # branch that also produced "Neutral", so it was redundant.
    sentiment="Neutral"

  # Return exactly three values, one per output component of the interface.
  # The original also returned "sentiment_distribution.png", a file that was
  # never created and had no matching output component.
  return sentiment,f"{positive_confidence:.2f}",f"{negative_confidence:.2f}"


# Web UI: one text box feeding classify_sentiment, and three text boxes for the
# label and the two confidence values.
iface = gr.Interface(
    fn=classify_sentiment,
    inputs=gr.Textbox(label="Enter your review here"),
    outputs=[
        gr.Textbox(label="Sentiment analysis result"),
        gr.Textbox(label="Positive Confidence"),
        gr.Textbox(label="Negative Confidence"),

    ],
    title=" Movie review sentiment analysis",
    description="Enter a review to analyze its sentiment."
    )

# debug=True keeps this cell running so errors and logs stay visible; interrupt the cell to stop the server.
iface.launch(debug=True)



Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
Running on public URL: https://b37e912ca7793c2c41.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://b37e912ca7793c2c41.gradio.live


