In [2]:
import numpy as np
import pandas as pd

In [3]:
# Absolute locations of the tab-separated train/test files (Colab Drive layout).
train_path = "/content/drive/MyDrive/Colab Notebooks/sentiment analysis/train.tsv"
test_path = "/content/drive/MyDrive/Colab Notebooks/sentiment analysis/test.tsv"

In [4]:
# Read the tab-separated training file, then preview five randomly chosen rows.
train_data = pd.read_csv(train_path, delimiter="\t")
train_data.sample(n=5)

Unnamed: 0,PhraseId,SentenceId,Phrase,Sentiment
152719,152720,8333,will probably sink the film for anyone who doe...,1
43532,43533,2107,Daughter From Danang,2
45802,45803,2226,as smart,2
65882,65883,3341,to portray themselves in the film,2
66067,66068,3351,an opportunity to strongly present some profou...,1


In [5]:
# Normalise case so that words differing only in capitalisation compare equal
# in the later stop-word filtering and vectorisation steps.
train_data['Phrase'] = train_data['Phrase'].str.lower()
# Preview five random phrases to eyeball the result.
train_data['Phrase'].sample(5)

Unnamed: 0,Phrase
46004,absurd
129536,"waited in a doctor 's office , emergency room ..."
90983,plumbed by martin scorsese
6445,eerie atmosphere
69553,to enjoy yourselves without feeling conned


In [6]:
import string
# Translation table built once instead of on every call; it deletes every
# character listed in string.punctuation.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(x):
    """Return ``x`` with every ``string.punctuation`` character removed.

    Parameters
    ----------
    x : str
        Text to clean. Any non-string value (for example the float NaN that
        pandas uses for a missing phrase) yields ``''`` instead of raising
        ``AttributeError``, matching the guard in ``remove_stopwords``.

    Returns
    -------
    str
        The input without punctuation; whitespace is left untouched.
    """
    if not isinstance(x, str):
        return ''
    return x.translate(_PUNCTUATION_TABLE)

In [7]:
# Strip punctuation from every phrase, one element at a time.
train_data['Phrase'] = train_data['Phrase'].map(remove_punctuation)

In [8]:
# Number of missing phrases remaining in the column.
train_data['Phrase'].isna().sum()

np.int64(0)

In [9]:
# Substitute an empty string for any missing phrase so later text steps
# always receive a str.
train_data['Phrase'] = train_data['Phrase'].fillna(value='')

In [10]:
import nltk
from nltk.corpus import stopwords

# Make sure the NLTK stop-word corpus is available locally, then keep the
# English list as a set so membership checks are constant-time.
# NOTE(review): the English list presumably contains negators such as "not";
# dropping them may hurt a sentiment task — confirm before relying on it.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [11]:

def remove_stopwords(text, stopword_set=None):
    """Drop stop words from a whitespace-tokenised string.

    Parameters
    ----------
    text : str
        Phrase to filter. Any non-string value yields ``''``.
    stopword_set : collection of str, optional
        Words to discard. When omitted, the notebook-level ``stop_words``
        set is used, which is exactly the original behaviour; passing it
        explicitly lets the function be reused or tested without that global.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (so runs of whitespace
        in the input are collapsed).
    """
    if not isinstance(text, str):  # extra safety check, e.g. NaN phrases
        return ''
    if stopword_set is None:
        # Looked up at call time so the default tracks the current global.
        stopword_set = stop_words
    return ' '.join(token for token in text.split() if token not in stopword_set)

# Filter stop words out of every phrase, one element at a time.
train_data['Phrase'] = train_data['Phrase'].map(remove_stopwords)


In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Convert text into numerical TF-IDF features.
# max_features caps the vocabulary at the 1000 most frequent terms; raising it
# (e.g. to 5000) may improve accuracy at the cost of a wider matrix.
tfidf = TfidfVectorizer(max_features=1000)
# Keep the sparse matrix that fit_transform returns. Densifying it with
# .toarray() would allocate a 156060 x 1000 float64 array (~1.2 GB) for no
# benefit: train_test_split and MultinomialNB both accept sparse input.
X = tfidf.fit_transform(train_data['Phrase'])

# Target labels, taken from the 'Sentiment' column of the training file.
y = train_data['Sentiment']

# NOTE(review): the vectoriser is fitted on all rows before any train/test
# split, so its IDF weights also see held-out rows — consider fitting it on
# the training portion only.
print("TF-IDF shape:", X.shape)


TF-IDF shape: (156060, 1000)


In [18]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier (fit() hands back the estimator itself).
model = MultinomialNB().fit(X_train, y_train)

# Predict labels for the held-out rows.
y_pred = model.predict(X_test)

# Report overall accuracy followed by the per-class precision/recall/F1 table.
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.5464564910931693

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.02      0.03      1416
           1       0.49      0.08      0.14      5527
           2       0.55      0.96      0.70     15639
           3       0.54      0.21      0.31      6707
           4       0.61      0.04      0.07      1923

    accuracy                           0.55     31212
   macro avg       0.59      0.26      0.25     31212
weighted avg       0.55      0.55      0.45     31212



In [19]:
feature_names = tfidf.get_feature_names_out()

# For every sentiment class, list the ten vocabulary terms with the highest
# log-probability. argsort is ascending, so the most probable term is printed last.
for class_label, log_probs in zip(model.classes_, model.feature_log_prob_):
    top10 = np.argsort(log_probs)[-10:]
    print(f"\nTop words for Sentiment {class_label}:")
    print([feature_names[idx] for idx in top10])


Top words for Sentiment 0:
['characters', 'even', 'worst', 'dull', 'one', 'like', 'nt', 'film', 'bad', 'movie']

Top words for Sentiment 1:
['characters', 'story', 'bad', 'little', 'much', 'one', 'like', 'film', 'movie', 'nt']

Top words for Sentiment 2:
['much', 'life', 'lrb', 'story', 'nt', 'like', 'rrb', 'one', 'movie', 'film']

Top words for Sentiment 3:
['comedy', 'like', 'characters', 'love', 'story', 'one', 'funny', 'good', 'movie', 'film']

Top words for Sentiment 4:
['story', 'great', 'good', 'performance', 'performances', 'one', 'best', 'funny', 'movie', 'film']
