In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [7]:
df_imdb = pd.read_csv('IMDB Dataset.csv')

df = df_imdb.iloc[:len(df_imdb) // 5]

print("First lines of the DataFrame:")
df.head()



First lines of the DataFrame:


Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [8]:
print("DataFrame size:", df.shape)
print("Column types:")
print(df.dtypes)


DataFrame size: (10000, 2)
Column types:
review       object
sentiment    object
dtype: object


In [9]:

if df.isnull().values.any():
    print("NaN values found, removing them.")
    df = df.dropna()
else:
    print("No NaN values found.")

No NaN values found.


In [12]:
print(f"First 5 reviews and their sentiments:\n")
for index, row in df.head().iterrows():
    print(f"Review: {row['review']}\nSentiment: {row['sentiment']}\n")

First 5 reviews and their sentiments:

Review: One of the other reviewers has mentioned that after watching just 1 Oz episode you'll be hooked. They are right, as this is exactly what happened with me.<br /><br />The first thing that struck me about Oz was its brutality and unflinching scenes of violence, which set in right from the word GO. Trust me, this is not a show for the faint hearted or timid. This show pulls no punches with regards to drugs, sex or violence. Its is hardcore, in the classic use of the word.<br /><br />It is called OZ as that is the nickname given to the Oswald Maximum Security State Penitentary. It focuses mainly on Emerald City, an experimental section of the prison where all the cells have glass fronts and face inwards, so privacy is not high on the agenda. Em City is home to many..Aryans, Muslims, gangstas, Latinos, Christians, Italians, Irish and more....so scuffles, death stares, dodgy dealings and shady agreements are never far away.<br /><br />I would sa

In [14]:
def count_words(text):
    return len(text.split())

df['words count'] = df['review'].apply(count_words)

print("DataFrame with words count:")
df[['review', 'words count']].head()

DataFrame with words count:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['words count'] = df['review'].apply(count_words)


Unnamed: 0,review,words count
0,One of the other reviewers has mentioned that ...,307
1,A wonderful little production. <br /><br />The...,162
2,I thought this was a wonderful way to spend ti...,166
3,Basically there's a family where a little boy ...,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",230


In [15]:
df

Unnamed: 0,review,sentiment,words count
0,One of the other reviewers has mentioned that ...,positive,307
1,A wonderful little production. <br /><br />The...,positive,162
2,I thought this was a wonderful way to spend ti...,positive,166
3,Basically there's a family where a little boy ...,negative,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,230
...,...,...,...
9995,"Fun, entertaining movie about WWII German spy ...",positive,148
9996,Give me a break. How can anyone say that this ...,negative,218
9997,This movie is a bad movie. But after watching ...,negative,228
9998,This is a movie that was probably made to ente...,negative,136


Preprocessing
--
1. create a function called simple_preprocessing. The function will take a text as argument and aply the following:

(you may need module re to clean the data)
- make the text lower case
- remove the html br tags
- remove urls
- remove hashtags and @ symbol
- remove punctuations
- tokenize the text using nltk or spaCy
- remove stopwords using nltk or spaCy
- return a string as the preprocessed text.

In [18]:
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import spacy

nlp = spacy.load('en_core_web_sm')

def simple_preprocessing(text):
    text = text.lower()
    text = re.sub(r'<br\s*/?>', ' ', text)
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[@#]\w+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    doc = nlp(text)
    tokens = [token.text for token in doc]
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return ' '.join(tokens)

In [19]:
df['review'] = df['review'].apply(simple_preprocessing)

df['review']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(simple_preprocessing)


0       one reviewers mentioned watching 1 oz episode ...
1       wonderful little production    filming techniq...
2       thought wonderful way spend time hot summer we...
3       basically family little boy jake thinks zombie...
4       petter matteis love time money visually stunni...
                              ...                        
9995    fun entertaining movie wwii german spy julie a...
9996    give break anyone say good hockey movie know m...
9997    movie bad movie watching endless series bad ho...
9998    movie probably made entertain middle school ea...
9999    smashing film filmmaking shows intense strange...
Name: review, Length: 10000, dtype: object

In [24]:
duplicates = df.duplicated(subset='review').sum()
print(f"Number of duplicated reviews: {duplicates}")

if duplicates > 0:
    df = df.drop_duplicates(subset='review')
    print(f"Duplicates removed. New number of duplicated reviews: {df.duplicated(subset='review').sum()}")


Number of duplicated reviews: 17
Duplicates removed. New number of duplicated reviews: 0


In [25]:
from nltk.stem import PorterStemmer

def stemming(text):
    stemmer = PorterStemmer()
    tokens = text.split()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return ' '.join(stemmed_tokens)

df['review'] = df['review'].apply(stemming)

df['review'].head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['review'] = df['review'].apply(stemming)


0    one review mention watch 1 oz episod hook righ...
1    wonder littl product film techniqu unassum old...
2    thought wonder way spend time hot summer weeke...
3    basic famili littl boy jake think zombi closet...
4    petter mattei love time money visual stun film...
Name: review, dtype: object

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

lb = LabelBinarizer()
df['sentiment'] = lb.fit_transform(df['sentiment'])


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['sentiment'] = lb.fit_transform(df['sentiment'])


In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer


tfidf_vectorizer = TfidfVectorizer()

X_tfidf = tfidf_vectorizer.fit_transform(df['review'])

print("Shape of TF-IDF Matrix:", X_tfidf.shape)


Shape of TF-IDF Matrix: (9983, 49769)


In [31]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X_tfidf, df['sentiment'], test_size=0.3, random_state=42)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("Y_train shape:", Y_train.shape)
print("Y_test shape:", Y_test.shape)



X_train shape: (6988, 49769)
X_test shape: (2995, 49769)
Y_train shape: (6988,)
Y_test shape: (2995,)


In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Instantiate the model
model = LogisticRegression()

# Train the model
model.fit(X_train, Y_train)

# Predict the test set results
Y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(Y_test, Y_pred)
print("Accuracy Score:", accuracy)

# Check further with a confusion matrix
conf_matrix = confusion_matrix(Y_test, Y_pred)
print("Confusion Matrix:\n", conf_matrix)

# Print classification report for further evaluation
class_report = classification_report(Y_test, Y_pred)
print("Classification Report:\n", class_report)


Accuracy Score: 0.8754590984974958
Confusion Matrix:
 [[1271  217]
 [ 156 1351]]
Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.85      0.87      1488
           1       0.86      0.90      0.88      1507

    accuracy                           0.88      2995
   macro avg       0.88      0.88      0.88      2995
weighted avg       0.88      0.88      0.88      2995



In [35]:
new_reviews = ["I loved this movie!", "This movie was a bad comedy movie!"]

for review in new_reviews:
    review = simple_preprocessing(review)
    review = stemming(review)
    new_reviews_tfidf = tfidf_vectorizer.transform(new_reviews)


new_reviews_pred = model.predict(new_reviews_tfidf)

print("Predictions for the new reviews:")
for review, sentiment in zip(new_reviews, new_reviews_pred):
    print(f"Review: {review} - Sentiment: {'Positive' if sentiment == 1 else 'Negative'}")


Predictions for the new reviews:
Review: I loved this movie! - Sentiment: Positive
Review: This movie was a bad comedy movie! - Sentiment: Negative


In [36]:
complex_reviews = [
    "I thought the movie was quite enjoyable, although some scenes were unnecessarily prolonged.",
    "The plot was innovative and captivating, but the character development lacked depth.",
    "Despite the brilliant cinematography, the storyline was predictable and dull.",
    "The film's special effects were top-notch, yet the dialogue was stilted and awkward.",
    "It was an absolute masterpiece with perfect pacing, though it might not appeal to everyone."
]

for review in complex_reviews:
    review_cleaned = simple_preprocessing(review)
    review_stemmed = stemming(review_cleaned)
    review_tfidf = tfidf_vectorizer.transform([review_stemmed])
    review_pred = model.predict(review_tfidf)
    print(f"Review: {review} - Sentiment: {'Positive' if review_pred[0] == 1 else 'Negative'}")

Review: I thought the movie was quite enjoyable, although some scenes were unnecessarily prolonged. - Sentiment: Positive
Review: The plot was innovative and captivating, but the character development lacked depth. - Sentiment: Negative
Review: Despite the brilliant cinematography, the storyline was predictable and dull. - Sentiment: Negative
Review: The film's special effects were top-notch, yet the dialogue was stilted and awkward. - Sentiment: Negative
Review: It was an absolute masterpiece with perfect pacing, though it might not appeal to everyone. - Sentiment: Positive


In [None]:
# Suggestions to improve accuracy:
# 1. Consider using a more complex model such as SVM or RandomForest which might capture nuances better.
# 2. Increase the dataset size for training to help the model learn more diverse examples.
# 3. Utilize more advanced text preprocessing techniques like lemmatization and POS tagging.
# 4. Experiment with different parameters for the vectorizer and the model.
# 5. Use ensemble methods to combine predictions from multiple models.
