<a href="https://colab.research.google.com/github/Thejus-1503/Sentiment-Analysis-With-NLP/blob/main/Sentiment_Analysis_With_NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Importing the required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [2]:
# Reading the csv file and putting it into 'df' object.
df = pd.read_csv('/content/drive/MyDrive/IMDB_Dataset.csv')

In [3]:
# Let's understand the type of values in each column of our dataframe 'df'.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17500 entries, 0 to 17499
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   review     17500 non-null  object
 1   sentiment  17500 non-null  object
dtypes: object(2)
memory usage: 273.6+ KB


In [4]:
# Let's understand the data, how it look like.
df.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,positive
1,Not many television shows appeal to quite as m...,positive
2,The film quickly gets to a major chase scene w...,negative
3,Jane Austen would definitely approve of this o...,positive
4,Expectations were somewhat high for me when I ...,negative


In [5]:
# Let's check whether data set consist of any missing values.
print(df.isnull().sum())

review       0
sentiment    0
dtype: int64


In [6]:
# Convert sentiment labels from text to binary values: positive -> 1, negative -> 0
df.sentiment=df.sentiment.map({"positive":1,"negative":0})
df.head()

Unnamed: 0,review,sentiment
0,I really liked this Summerslam due to the look...,1
1,Not many television shows appeal to quite as m...,1
2,The film quickly gets to a major chase scene w...,0
3,Jane Austen would definitely approve of this o...,1
4,Expectations were somewhat high for me when I ...,0


In [7]:
# Lowercasing the text
df['review'] = df['review'].str.lower()
df.head()

Unnamed: 0,review,sentiment
0,i really liked this summerslam due to the look...,1
1,not many television shows appeal to quite as m...,1
2,the film quickly gets to a major chase scene w...,0
3,jane austen would definitely approve of this o...,1
4,expectations were somewhat high for me when i ...,0


In [8]:
# Remove HTML tags and remove non-alphanumeric characters except spaces
import re

def remove_html_tags(text):
    clean = re.compile('<.*?>')
    text = re.sub(clean, '', text)
    text = re.sub(r"[^\w\s]", " ", text)

    return text

# Apply the function to the DataFrame
df['review'] = df['review'].apply(remove_html_tags)
df.head()

Unnamed: 0,review,sentiment
0,i really liked this summerslam due to the look...,1
1,not many television shows appeal to quite as m...,1
2,the film quickly gets to a major chase scene w...,0
3,jane austen would definitely approve of this o...,1
4,expectations were somewhat high for me when i ...,0


In [9]:
# Remove white spaces
def remove_whitespace(text):
    return  " ".join(text.split())
df['review']=df['review'].apply(remove_whitespace)
df.head()

Unnamed: 0,review,sentiment
0,i really liked this summerslam due to the look...,1
1,not many television shows appeal to quite as m...,1
2,the film quickly gets to a major chase scene w...,0
3,jane austen would definitely approve of this o...,1
4,expectations were somewhat high for me when i ...,0


In [10]:
# Remove stop words
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
def remove_stopwords(text):
    words = text.split()
    filtered_words = [word for word in words if word not in stop_words]
    return " ".join(filtered_words)
df['review'] = df['review'].apply(remove_stopwords)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,review,sentiment
0,really liked summerslam due look arena curtain...,1
1,many television shows appeal quite many differ...,1
2,film quickly gets major chase scene ever incre...,0
3,jane austen would definitely approve one gwyne...,1
4,expectations somewhat high went see movie thou...,0


In [11]:
# To check is there any GPU available or not
import torch
is_cuda = torch.cuda.is_available()
# If we have a GPU available, we'll set our device to GPU.
if is_cuda:
    device = torch.device("cuda")
    print("GPU is available")
else:
    device = torch.device("cpu")
    print("GPU not available, CPU used")

GPU not available, CPU used


In [12]:
# Importing train-test-split
from sklearn.model_selection import train_test_split

In [13]:
# Putting review to X
X = df['review']

# Putting sentiment to y
y = df['sentiment']

In [14]:
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.20,random_state = 42)

In [18]:
# TF-IDF Vectorization
from sklearn.feature_extraction.text import TfidfVectorizer

# Example: assuming X_train contains your training review text
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
# Transform the test set using the fitted vectorizer
X_test_tfidf = tfidf.transform(X_test)

In [16]:
# Train Logistic Regression Model
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [19]:
# Make predictions
y_pred = model.predict(X_test_tfidf)

In [20]:
# Check accuracy
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)


Model Accuracy: 0.8722857142857143


In [21]:
# Print classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.85      0.87      1710
           1       0.86      0.89      0.88      1790

    accuracy                           0.87      3500
   macro avg       0.87      0.87      0.87      3500
weighted avg       0.87      0.87      0.87      3500



In [22]:
# Print confusion matrix
from sklearn.metrics import confusion_matrix
conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:")
print(conf_matrix)


Confusion Matrix:
[[1453  257]
 [ 190 1600]]


In [25]:
# Test the model
sample_review = ["This product is amazing! I love it."]
sample_review_tfidf = tfidf.transform(sample_review)
prediction = model.predict(sample_review_tfidf)
print("\nSample Review Prediction:", "Positive" if prediction[0] == 1 else "Negative")


Sample Review Prediction: Positive
