<a href="https://colab.research.google.com/github/Sharyupatil01/Sentiment-Analysis-Movie-Review/blob/main/Sentiment_Analysis_of_Movie_Reviews.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
#Imported all the required Python libraries
import pandas as pd
import re
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Loading of csv file

file_path="/content/sample_data/IMDB Dataset.csv"

# try except block is a good practice to handle potential errors

try:
  data=pd.read_csv(file_path)
  print("Dataset loaded successfully")
  print("\n First 5 row of the dataset")
  print(data.head())
  print("\nInformation about the dataset")
  data.info()
except FileNotFoundError:
  print(f"File not found at path {file_path} was not found . Please make sure its in the same directory")
  exit()

Dataset loaded successfully

 First 5 row of the dataset
                                              review sentiment
0  One of the other reviewers has mentioned that ...  positive
1  A wonderful little production. <br /><br />The...  positive
2  I thought this was a wonderful way to spend ti...  positive
3  Basically there's a family where a little boy ...  negative
4  Petter Mattei's "Love in the Time of Money" is...  positive

Information about the dataset
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     50000 non-null  object
 1   sentiment  50000 non-null  object
dtypes: object(2)
memory usage: 781.4+ KB


In [3]:
#Verifying that dataset contains the columns we need
if 'review' in data.columns and 'sentiment' in data.columns:
  print("\n The dataset is suitable as it contains 'review' and 'sentiment' columns ")

  #Preprocessing of the text data
  print("\n Preprocessing of the text data")
  # Defining a list of common English "stop words" and a function to clean the text.
  stop_words=set(
      [
         'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd",
        'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers',
        'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
        'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been',
        'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but',
        'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between',
        'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on',
        'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how',
        'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own',
        'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've",
        'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't",
        'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn',
        "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't",
        'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"
      ]
  )
  def preprocess_text(text):
    """
    #this function cleans the text data by :
    1. Ensuring the input is a string and converting it to lowercase .
    2. Removing html tags (<br/>) that are in raw data.
    3. Removing punctutation (like '!' , '?' etc).
    4. Removing common English "stop words" from our defined list .

    """

    text=str(text).lower()
    text = re.sub(r'<br />',' ',text)
    text= ''.join([char for char in text if char not in string.punctuation])
    text= ' '.join([word for word in text.split() if word not in stop_words])
    return text

    # we apply this cleaning function to every review in the 'review' column.
  data['cleaned_review']=data['review'].apply(preprocess_text)

  print("\n First 5 rows of the cleaned_review columns:")

  print(data[['review','cleaned_review']].head())



 The dataset is suitable as it contains 'review' and 'sentiment' columns 

 Preprocessing of the text data

 First 5 rows of the cleaned_review columns:
                                              review                                     cleaned_review
0  One of the other reviewers has mentioned that ...  one reviewers mentioned watching 1 oz episode ...
1  A wonderful little production. <br /><br />The...  wonderful little production filming technique ...
2  I thought this was a wonderful way to spend ti...  thought wonderful way spend time hot summer we...
3  Basically there's a family where a little boy ...  basically theres family little boy jake thinks...
4  Petter Mattei's "Love in the Time of Money" is...  petter matteis love time money visually stunni...


In [4]:
#Converting TEXT to Numbers

#We are well aware that computer can't process the text , only numbers
#We use TF-IDF (Term Frequency-Inverse Document Frequency) to convert each review
# into a numberical vector , where each number represents the importance of a word .

print("\n Converting text to numerical data using TF-IDF")

vectorizer=TfidfVectorizer()
X= vectorizer.fit_transform(data['cleaned_review'])
Y=data['sentiment'] #our target labels (positive/negative)

print(f"Shape of the numerical data (TF-IDF matrix): {X.shape}")



 Converting text to numerical data using TF-IDF
Shape of the numerical data (TF-IDF matrix): (50000, 166536)


In [5]:
#Train a MACHINE LEARNING MODEL
# we split the data into a training ser (80%) and testing set (20%)
#the model learns from the training data and then we test its performance on the unseen data.
# Logistic Regression is a simple and effective model

print("\n Splitting the data into training and testing sets")
X_train,X_test,Y_train,Y_test=train_test_split(X,Y,test_size=0.2,random_state=42)
model=LogisticRegression(max_iter=1000)
model.fit(X_train,Y_train)
print("Model training is completed")




 Splitting the data into training and testing sets
Model training is completed


In [6]:
#Evaluate the model
#after training , we evaluate our model's performance on text data
#the accuracy _ score tells us what precentage of the test reviews our model
# predicated right

print("\n Evaluating the model's performances")
y_pred=model.predict(X_test)
accuracy=accuracy_score(Y_test,y_pred)
print(f"Model Accuracy on the test set :{accuracy:.2f}")

print("\n Classification Report:")
print(classification_report(Y_test,y_pred))




 Evaluating the model's performances
Model Accuracy on the test set :0.90

 Classification Report:
              precision    recall  f1-score   support

    negative       0.91      0.88      0.90      4961
    positive       0.89      0.91      0.90      5039

    accuracy                           0.90     10000
   macro avg       0.90      0.90      0.90     10000
weighted avg       0.90      0.90      0.90     10000



In [8]:
# This is the final and most exciting step! We test the model with a new review it has never seen.
print("\n Testing the model on a new, unseen review...")
new_review = ["The movie was a masterpiece, I absolutely loved every moment of it.Highly Recommended"]

#we must apply the same cleaning and TF-IDF conversion steps to new review
cleaned_new_review=preprocess_text(new_review[0])
new_review_vectorized=vectorizer.transform([cleaned_new_review])
predication=model.predict(new_review_vectorized)
print(f"The predicted sentiment for the new review is: {predication[0]}")





 Testing the model on a new, unseen review...
The predicted sentiment for the new review is: positive
