In [21]:
import pandas as pd
import numpy as np

In [22]:
#Inserting the dataset

In [24]:
df = pd.read_csv("movie_reviews_large.csv", encoding='ISO-8859-1')

In [25]:
df

Unnamed: 0,review,sentiment
0,The movie was absolutely fantastic! I loved ev...,positive
1,Terrible movie. Waste of time.,negative
2,Amazing direction and brilliant acting!,positive
3,Poor script and bad acting ruined it.,negative
4,"An outstanding experience, a must-watch!",positive
5,Not my cup of tea. Boring and slow.,negative
6,A truly heartwarming and emotional film.,positive
7,I didnt enjoy the plot at all.,negative
8,Great cast and very engaging!,positive
9,Felt like it would never end. Dull.,negative


In [26]:
df.head()

Unnamed: 0,review,sentiment
0,The movie was absolutely fantastic! I loved ev...,positive
1,Terrible movie. Waste of time.,negative
2,Amazing direction and brilliant acting!,positive
3,Poor script and bad acting ruined it.,negative
4,"An outstanding experience, a must-watch!",positive


In [27]:
#Data cleaning

In [28]:
import re
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [29]:
def clean_text(text):
    text = text.lower()
    
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    text = re.sub(r'\d+', '', text)
    
    stop_words = set(stopwords.words('english'))
    words = text.split()
    words = [word for word in words if word not in stop_words]
    
    stemmer = PorterStemmer()
    words = [stemmer.stem(word) for word in words]
    
    return ' '.join(words)


In [30]:
df['cleaned_review'] = df['review'].apply(clean_text)
df[['review', 'cleaned_review']].head()

Unnamed: 0,review,cleaned_review
0,The movie was absolutely fantastic! I loved ev...,movi absolut fantast love everi moment
1,Terrible movie. Waste of time.,terribl movi wast time
2,Amazing direction and brilliant acting!,amaz direct brilliant act
3,Poor script and bad acting ruined it.,poor script bad act ruin
4,"An outstanding experience, a must-watch!",outstand experi mustwatch


In [31]:
#Importing and Applying TF-IDF Vectorizer

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(df['cleaned_review'])
print("TF-IDF matrix shape:", X.shape)

TF-IDF matrix shape: (50, 138)


In [33]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(df['sentiment'])
print("positive =", le.transform(['positive'])[0])
print("negative =", le.transform(['negative'])[0])

positive = 1
negative = 0


In [34]:
#Import and train the model

In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)
from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))

print("\nClassification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.6

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.50      0.60         6
           1       0.50      0.75      0.60         4

    accuracy                           0.60        10
   macro avg       0.62      0.62      0.60        10
weighted avg       0.65      0.60      0.60        10



In [38]:
#predicting sentiment of new reviews

In [39]:
def predict_review_sentiment(review_text):
    cleaned = clean_text(review_text)
    
    vector = tfidf.transform([cleaned])
    
    prediction = model.predict(vector)[0]
    
    sentiment = le.inverse_transform([prediction])[0]
    
    return sentiment


In [40]:
print(predict_review_sentiment("This movie was a masterpiece! I loved every second."))

positive


In [41]:
print(predict_review_sentiment("It was boring and the plot was weak. Waste of time."))

negative
