In [1]:
# Open the original CSV file
with open('IMDB.csv', 'r', encoding='utf-8') as file:
    lines = file.readlines()

# Filter out problematic lines
filtered_lines = [line for line in lines if len(line.split(',')) == 2]

# Save the filtered lines to a new CSV file
with open('filtered.csv', 'w', encoding='utf-8') as file:
    file.writelines(filtered_lines)


In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

df = pd.read_csv('filtered.csv')

In [4]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def preprocess_text(text):
  if isinstance(text, str):
    text = re.sub('<[^<]+?>', ' ', text)
    text = text.lower()

    tokens = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]

    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
  else:
      return ''

df['Preprocessed_Text'] = df['review'].apply(preprocess_text)

print(df['Preprocessed_Text'].head(10))


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


0    fantastic movie three prisoner become famous ....
1    worst movie saw worldfest also received least ...
2    protocol implausible movie whose saving grace ...
3    unmarried woman named stella ( bette midler ) ...
4    probably worst movie ever seen life ! ! stupid...
5    oh no one attack japanese ghost girl movie ......
6    's terrific funny movie n't make smile . pity ...
7    one finest movie ever seen .... stark scenery ...
8    movie get 10 lot gore it.who care plot acting....
9    movie well directed . almost totally disregard...
Name: Preprocessed_Text, dtype: object


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=1000)

X_tfidf = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])

print("Shape of TF-IDF matrix:", X_tfidf.shape)


Shape of TF-IDF matrix: (165, 1000)


In [6]:
feature_names = tfidf_vectorizer.get_feature_names_out()
print("Feature names:", feature_names)

Feature names: ['10' '100' '16' '17' '25' '30' '42' '80' '90' 'abbey' 'ability' 'able'
 'absolutely' 'accident' 'account' 'accuracy' 'achievement' 'acid'
 'across' 'act' 'acted' 'acting' 'action' 'actor' 'actress' 'actual'
 'actually' 'add' 'added' 'addict' 'advice' 'advise' 'afternoon'
 'afterwards' 'agent' 'ago' 'air' 'alien' 'almost' 'alone' 'along'
 'already' 'also' 'although' 'always' 'amazing' 'american' 'amitabh'
 'amount' 'amuses' 'annoying' 'another' 'anymore' 'anyone' 'anything'
 'anyway' 'apart' 'appeal' 'appearance' 'appears' 'applause' 'appreciate'
 'archie' 'argument' 'ariauna' 'around' 'art' 'asked' 'assaulted'
 'attempt' 'attractive' 'audience' 'australian' 'average' 'avoid' 'award'
 'away' 'awful' 'back' 'bad' 'badly' 'banker' 'based' 'bat' 'beautiful'
 'became' 'become' 'becomes' 'bed' 'beginning' 'behind' 'believe' 'best'
 'better' 'big' 'bike' 'bill' 'birthday' 'bit' 'black' 'blah' 'bollywood'
 'book' 'bored' 'boring' 'box' 'boy' 'boyfriend' 'break' 'brings'
 'briti

In [7]:
from sklearn.metrics import accuracy_score, classification_report
df.dropna(inplace=True)

X_tfidf = tfidf_vectorizer.fit_transform(df['Preprocessed_Text'])
y = df['sentiment']

X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

logistic_regression_model = LogisticRegression()

logistic_regression_model.fit(X_train, y_train)

y_pred_train = logistic_regression_model.predict(X_train)
y_pred_test = logistic_regression_model.predict(X_test)

train_accuracy = accuracy_score(y_train, y_pred_train)
test_accuracy = accuracy_score(y_test, y_pred_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

print("Classification Report for Test Data:")
print(classification_report(y_test, y_pred_test))


Train Accuracy: 1.0
Test Accuracy: 0.8787878787878788
Classification Report for Test Data:
              precision    recall  f1-score   support

    negative       1.00      0.67      0.80        12
    positive       0.84      1.00      0.91        21

    accuracy                           0.88        33
   macro avg       0.92      0.83      0.86        33
weighted avg       0.90      0.88      0.87        33



In [8]:
import random
random.seed(42)
random_indices = random.sample(range(len(df)), 5)
random_examples = df.iloc[random_indices]

for index, row in random_examples.iterrows():
  example_text = row['review']
  preprocessed_example_text = preprocess_text(example_text)
  example_text_vectorized = tfidf_vectorizer.transform([preprocessed_example_text])
  predicted_sentiment = logistic_regression_model.predict(example_text_vectorized)[0]
  actual_sentiment = row['sentiment']
  print("Example Text:", example_text)
  print("Actual Sentiment:", actual_sentiment)
  print("Predicted Sentiment:", predicted_sentiment)
  print("-------------------------------")


Example Text: I really like Ryan Reynolds and Hope Davis and I actually had high hopes watching this last night on DVD. Mainly as I try to avoid reviews until I watch something myself and form my own opinion Big mistake! My 2 /10 is for the first segment which in fairness is actually quite decent and if they had made the movie about the characters in section 1 alone it may have risen above the 5/10 mark.Once it moved into TV 'reality show' territory it stank to high heaven. Ryan Reynolds captured the essence of an actor on the edge wonderfully but as a gay TV writer and famous games creator / devoted family man he was definitely less effective. From the blurb on the box I expected a flashback thriller along the lines of 'Memento' - unfortunately this is nowhere near that standard of movie.
Actual Sentiment: negative
Predicted Sentiment: negative
-------------------------------
Example Text: Steven what have you done you have hit an all new low. It is weird since Steven's last film shad