In [2]:
import pandas as pd
import numpy as np
import string
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder
import pickle

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, GlobalMaxPooling1D, Dropout
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical


In [3]:
# Fetch the NLTK resources needed by the text-preprocessing cell:
#   - 'stopwords' : English stop-word list read via stopwords.words('english')
#   - 'wordnet'   : lexical database backing WordNetLemmatizer
#   - 'punkt_tab' : tokenizer tables that newer NLTK releases load in nltk.word_tokenize
#   - 'punkt'     : tokenizer models used by nltk.word_tokenize on older NLTK releases
# Requesting both punkt variants keeps a fresh "Restart & Run All" working
# whichever NLTK version is installed; packages that are already present are
# only reported as up-to-date.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt_tab')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to C:\Users\Qaswa
[nltk_data]     Chaudhary\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Qaswa
[nltk_data]     Chaudhary\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Qaswa
[nltk_data]     Chaudhary\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Read the tab-separated review dataset and preview its first rows
df = pd.read_csv('dataset.tsv', sep='\t')
df.head()

Unnamed: 0,sentiment,review
0,1,But it's really slow to catch on . For me a mo...
1,1,"To me, it's controversial how this movie has s..."
2,1,"The movie is a tricky and the story is good, b..."
3,0,I can not believe that this movie received all...
4,2,Usually the debate is wether godfather or Good...


In [5]:
# (number of rows, number of columns) of the loaded frame
df.shape

(32745, 2)

In [6]:
# Count of missing values in each column
df.isna().sum()

sentiment    0
review       0
dtype: int64

In [7]:
# Text preprocessing
lemmatization = WordNetLemmatizer()
stop_words    = set(stopwords.words('english'))

# Translation table that deletes every ASCII punctuation character. It is built
# once here instead of on every call, because the function runs once per review.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def text_preprocessing(text):
    """Normalize one review into a space-joined string of lemmatized tokens.

    Steps, in order: lowercase, delete ASCII punctuation, tokenize with
    ``nltk.word_tokenize``, drop English stop words, then lemmatize each
    remaining token with ``WordNetLemmatizer``.

    Args:
        text: Raw review text. A missing value (``None``/NaN) is treated as
            empty text and any other non-string value is converted with
            ``str()``, so a single malformed row cannot abort the ``apply``.

    Returns:
        str: The surviving tokens joined by single spaces ('' if none survive).
    """
    if not isinstance(text, str):
        text = '' if pd.isna(text) else str(text)

    text  = text.lower().translate(_PUNCT_TABLE)
    token = nltk.word_tokenize(text)
    # NOTE(review): punctuation is removed before the stop-word check, so a
    # contraction such as "don't" reaches this point as "dont"; stop-word
    # entries that contain an apostrophe presumably never match - confirm
    # whether that is intended.
    token = [lemmatization.lemmatize(word) for word in token if word not in stop_words]
    return ' '.join(token)


# Cleaned version of every review, stored next to the raw text
df['Clean_Text'] = df['review'].apply(text_preprocessing)

In [8]:

# --- 2. Preprocessing ---
# Vocabulary cap for the tokenizer and fixed length of every padded sequence
max_words, max_len = 5000, 200

# Build the word index from the cleaned reviews; "<OOV>" stands in for unknown words
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df['Clean_Text'])

# Encode each review as integer ids, then pad/truncate at the end to max_len
X = pad_sequences(
    tokenizer.texts_to_sequences(df['Clean_Text']),
    maxlen=max_len,
    padding='post',
    truncating='post',
)


In [9]:
# TF-IDF representation of the cleaned reviews: unigrams and bigrams, at most 5000 features
vectorizer = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2),
)

# Learn the vocabulary and IDF weights, and produce the sparse document-term matrix
X_tfidf = vectorizer.fit_transform(df['Clean_Text'])

# Dense copy of the same matrix
# NOTE(review): neither X_tfidf nor X_tfidf_dense is read by the other cells
# visible in this notebook, and the dense array can be large - confirm it is
# still needed.
X_tfidf_dense = X_tfidf.toarray()


In [10]:
# label encoding 

le = LabelEncoder()
y  = le.fit_transform(df['sentiment'])

In [11]:
# Train/test split: hold out 20% of the samples, seeded so the partition is repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [12]:
# One output unit per distinct sentiment label found by the encoder
num_classes = len(le.classes_)

# 1D-CNN text classifier: embedding -> convolution -> global max pooling -> dense head
model = Sequential()
# `input_length` is intentionally not passed: Keras 3 deprecates the argument
# (it only triggers a warning) and the sequence length is taken from the
# padded input when the model is first called.
model.add(Embedding(input_dim=max_words, output_dim=128))
model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.5))
model.add(Dense(10, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))

# Use sparse categorical crossentropy for integer labels
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])



In [13]:
from tensorflow.keras.callbacks import EarlyStopping

# Halt training once validation loss has gone 3 epochs without improving,
# and roll the model back to the weights from its best epoch
early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train for at most 5 epochs in batches of 32, validating on 10% of the training data
model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=5,
    validation_split=0.1,
    callbacks=[early_stop],
)


Epoch 1/5
[1m737/737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 48ms/step - accuracy: 0.4350 - loss: 1.0295 - val_accuracy: 0.5916 - val_loss: 0.8420
Epoch 2/5
[1m737/737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 46ms/step - accuracy: 0.6380 - loss: 0.7811 - val_accuracy: 0.6229 - val_loss: 0.7888
Epoch 3/5
[1m737/737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 47ms/step - accuracy: 0.7132 - loss: 0.6568 - val_accuracy: 0.6275 - val_loss: 0.8033
Epoch 4/5
[1m737/737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 47ms/step - accuracy: 0.7818 - loss: 0.5370 - val_accuracy: 0.6134 - val_loss: 0.8484
Epoch 5/5
[1m737/737[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 46ms/step - accuracy: 0.8305 - loss: 0.4403 - val_accuracy: 0.6183 - val_loss: 0.9511


<keras.src.callbacks.history.History at 0x21890f5dc10>

In [14]:
# --- 7. Predictions ---
# Per-class scores for every test sample, then the index of the highest-scoring class
y_pred = model.predict(X_test)
y_pred_labels = np.argmax(y_pred, axis=1)



[1m205/205[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 11ms/step


In [15]:
# Original sentiment labels in the order the encoder numbered them
print(f"Classes: {le.classes_}")


Classes: [0 1 2]


In [16]:
# Fraction of test samples whose predicted class equals the true label, shown as a percentage
test_accuracy = accuracy_score(y_test, y_pred_labels)
print("Accuracy:", test_accuracy*100)

Accuracy: 61.71934646510918


In [17]:
# Save model to an HDF5 file. One call is enough: the second, identical save
# only rewrote the same file.
model.save("model.h5")

# Save tokenizer so that inference can reuse the exact word index fitted in this notebook.
# NOTE: pickle files should only ever be loaded back from a trusted source.
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)


