In [1]:
#importing necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords

#Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
#load the data
data = pd.read_csv('imdb_data.csv')

#preprocess the data
def clean_text(text : str):
    #remove special character, number and extra spaces
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'\d', '', text)
    text = text.lower()
    
    text = ' '.join([word for word in text.split() if word not in stop_words])
    
    return text

data['cleaned_review'] = data['review'].apply(clean_text)

data['sentiment'] = data['sentiment'].map({'positive': 1, 'negative': 0})



In [6]:
#split the data into training and test size
X = data['cleaned_review']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [7]:
#Convert the text data to TF-IDF feature
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)


In [8]:
#train the model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)


In [13]:
#make prediction
y_pred = model.predict(X_test_tfidf)


  (0, 5)	0.0639708175374691
  (0, 63)	0.07525375446489435
  (0, 132)	0.09004575350055136
  (0, 133)	0.05097058345253197
  (0, 211)	0.08095828113004118
  (0, 226)	0.08529135457917432
  (0, 281)	0.06908256409397066
  (0, 287)	0.05739719256928741
  (0, 295)	0.09418842662501578
  (0, 299)	0.08088371011068239
  (0, 503)	0.14523719118975786
  (0, 533)	0.10050546446845468
  (0, 537)	0.08893146521239909
  (0, 605)	0.06021252120859947
  (0, 687)	0.0715217418769738
  (0, 710)	0.12505271036998777
  (0, 751)	0.09026939919356683
  (0, 852)	0.05486741715271179
  (0, 929)	0.08717188870011247
  (0, 944)	0.09238810185551712
  (0, 983)	0.03837393930511749
  (0, 987)	0.09703178743878457
  (0, 993)	0.054627525173989125
  (0, 1203)	0.08206792473921475
  (0, 1224)	0.11196389290236995
  :	:
  (0, 4286)	0.07220632743390942
  (0, 4297)	0.09551444044690306
  (0, 4430)	0.1432079691554263
  (0, 4467)	0.09717092133005334
  (0, 4468)	0.08430542424205864
  (0, 4472)	0.09115568429999821
  (0, 4474)	0.0398379143902582

In [10]:
accuracy = accuracy_score(y_test, y_pred)

print(f"Accuracy : {accuracy * 100:.2f}%")
print("\nClassification Report: ")
print(classification_report(y_test, y_pred))

Accuracy : 89.21%

Classification Report: 
              precision    recall  f1-score   support

           0       0.90      0.88      0.89      4961
           1       0.88      0.90      0.89      5039

    accuracy                           0.89     10000
   macro avg       0.89      0.89      0.89     10000
weighted avg       0.89      0.89      0.89     10000



In [17]:
def predict_sentiment(review):
    #clean the input review
    cleaned_review = clean_text(review)
    
    #transform
    review_tfidf = vectorizer.transform([cleaned_review])
    
    sentiment = model.predict(review_tfidf)
    
    if sentiment == 1:
        return "Positive"
    else:
        return "Negative"
   
review_input = input("Enter your review: ")
print(f"Review: {review_input}")
print(f"Sentiment: {predict_sentiment(review_input)}")    

Review: very bad movie i don't like it
Sentiment: Negative
