<a href="https://colab.research.google.com/github/Varshitha-55/Sentiment-Analysis-using-NLP/blob/main/SENTIMENT_ANALYSIS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.layers import SimpleRNN, LSTM, Dense, Dropout, Embedding, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the labelled posts; the path is resolved relative to the working directory.
data = pd.read_csv('sentimentdataset.csv')

# Printing shape of the dataset as (rows, columns)
print(data.shape)
# Printing column and row information.
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
data.info()

(732, 2)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 732 entries, 0 to 731
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Text       732 non-null    object
 1   Sentiment  732 non-null    object
dtypes: object(2)
memory usage: 11.6+ KB
None


In [None]:
# Count the missing entries per column before any clean-up
missing_before = data.isna().sum()
print("Null Values:\n", missing_before)

# Discard every row that has at least one missing field
data = data.dropna()

# Recount to confirm nothing is missing any more
missing_after = data.isna().sum()
print("Null Values after dropping:\n", missing_after)

Null Values:
 Text         0
Sentiment    0
dtype: int64
Null Values after dropping:
 Text         0
Sentiment    0
dtype: int64


In [None]:
# Number of rows per distinct Sentiment label, most frequent first
sentiment_counts = data['Sentiment'].value_counts()
sentiment_counts

Unnamed: 0_level_0,count
Sentiment,Unnamed: 1_level_1
Positive,44
Joy,42
Excitement,32
Happy,14
Neutral,14
...,...
Vibrancy,1
Culinary Adventure,1
Mesmerizing,1
Thrilling Journey,1


In [None]:
# The labels carry inconsistent whitespace padding (e.g. ' Happy ' and
# ' Indifference    '). Strip it first so that differently padded spellings of
# one sentiment are not balanced and encoded as separate classes.
data = data.assign(Sentiment=data['Sentiment'].str.strip())

# Check the original category distribution
print("Original Category Distribution:")
print(data['Sentiment'].value_counts())

# Size of the largest category; every category is resampled up to this size
max_size = int(data['Sentiment'].value_counts().max())

# Oversample each category with replacement. A fixed random_state keeps the
# balanced set identical across Restart & Run All.
# NOTE(review): this happens before the train/test split further down, so copies
# of the same row can end up in both partitions and inflate the reported test
# accuracy. Splitting first and oversampling only the training part would avoid
# that, but requires restructuring the later cells.
balanced_df = (
    data.groupby('Sentiment')
    .sample(n=max_size, replace=True, random_state=42)
    .reset_index(drop=True)
)

# Shuffle the dataset to avoid any order bias (seeded for reproducibility)
data = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Check the balanced category distribution
print("\nBalanced Category Distribution (After Oversampling):")
print(data['Sentiment'].value_counts())

Original Category Distribution:
Sentiment
Positive               44
Joy                    42
Excitement             32
Happy                  14
Neutral                14
                       ..
Vibrancy                1
Culinary Adventure      1
Mesmerizing             1
Thrilling Journey       1
Winter Magic            1
Name: count, Length: 279, dtype: int64

Balanced Category Distribution (After Oversampling):
Sentiment
Serenity            44
Happy               44
Blessed             44
Intimidation        44
Positivity          44
                    ..
Motivation          44
Indifference        44
Mischievous         44
Joy in Baking       44
Disgust             44
Name: count, Length: 279, dtype: int64


In [None]:
# NLTK pieces used by the text-cleaning step
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Fetch the stop-word corpus and the punkt / punkt_tab tokenizer data
for resource in ('stopwords', 'punkt', 'punkt_tab'):
    nltk.download(resource)

# English stop words, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))

# Review text cleaning
# Raw-string patterns, compiled once at definition time.
_HTML_TAG_RE = re.compile(r'<.*?>')
# 'A-Z', not 'A-z': the range A-z also covers [ \ ] ^ _ and the backtick,
# which let those characters slip through the clean-up.
_NON_ALNUM_RE = re.compile(r'[^a-zA-Z0-9\s]')
_DIGITS_RE = re.compile(r'\d+')


def clean_reviews(text):
    """Turn one raw post into a list of lower-case tokens without stop words.

    Steps, in order: remove ``<...>`` tags, remove every character that is not
    an ASCII letter, digit or whitespace, remove digit runs, lower-case,
    tokenize with ``word_tokenize`` and drop tokens found in ``stop_words``.

    Parameters
    ----------
    text : str or any
        Raw text. Non-string values are returned unchanged, so applying the
        function to an already tokenized column is a no-op.

    Returns
    -------
    list of str
        The remaining tokens (or ``text`` itself when it is not a string).
    """
    if not isinstance(text, str):
        return text

    # Removed characters are deleted, not replaced by a space, so "well-known"
    # becomes "wellknown".
    text = _HTML_TAG_RE.sub('', text)
    text = _NON_ALNUM_RE.sub('', text)
    text = _DIGITS_RE.sub('', text)

    tokens = word_tokenize(text.lower())
    return [word for word in tokens if word not in stop_words]

# Replace every raw text entry by its cleaned token list
data['Text'] = data['Text'].map(clean_reviews)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [None]:
# Token list stored under row label 10
data.loc[10, 'Text']

['drifting',
 'day',
 'air',
 'nonchalance',
 'indifferent',
 'trivialities',
 'life']

In [None]:
# Sentiment label stored under row label 10
data.loc[10, 'Sentiment']

' Indifference    '

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encoder for the Sentiment labels; later cells fit it on the Sentiment column
# and use inverse_transform to turn predicted codes back into label names.
le = LabelEncoder()

In [None]:
# Learn the label -> integer mapping and overwrite the column with the codes
data['Sentiment'] = le.fit_transform(data['Sentiment'])

In [None]:
# Hold-out split on the tokenized text
from sklearn.model_selection import train_test_split

# 'Text' holds the features, 'Sentiment' the encoded target
X, y = data['Text'], data['Sentiment']

# 80 % train / 20 % test with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# Distinct encoded sentiment codes, in order of first appearance
data['Sentiment'].unique()

array([139,  34, 164, 215, 206,  19, 239,  82, 174, 192, 158, 193, 127,
        11,  31, 120, 219,   6, 184,  93, 277, 218, 126, 247,  39, 171,
       121,  30, 199,  76,  25, 187, 112, 195, 267,  89, 142, 249, 255,
       101,  20,  17,  97,  44,  38,  54, 189,  70, 118, 229, 145, 210,
       107,  90, 172, 104, 221, 140, 159, 211, 208, 100,  43,  74,  40,
       259, 264,   8, 170,  78, 136,  71, 163,  96,  42, 115, 256,  47,
       162, 231, 233,  23, 202, 180,  15,  56,  45, 225, 240, 124,  35,
        14, 212, 133,  75, 169, 168, 144, 182,  91, 238, 250, 220,  28,
        80,  53, 258, 128, 213,  52, 207, 103,  68,   4, 268, 131,  57,
        72,   2,  21, 246, 223, 132,  22, 106, 248,  63, 181, 111, 117,
        77, 244,  83,  46, 166, 176,  13, 102,  69, 214,  32, 167, 194,
        29,  59, 198, 262, 116, 271, 205, 216,  36,  67, 186, 130,  33,
       179, 270,  87,   7, 209, 137,  95,  41,  88,  86, 245, 153, 252,
       217, 129,  49, 155, 156, 253, 109, 278, 261, 269,   0, 25

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')

# TfidfVectorizer expects strings, so join each token list back into one string.
# Entries that are already strings are left alone: joining a string would insert
# a space between every character if this cell were run a second time.
data['Text'] = data['Text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

# Learn the vocabulary / IDF weights and build the sparse feature matrix in one
# pass. The variable name is kept as-is because the split cell below refers to it.
requredTaxt = tfidf.fit_transform(data['Text'])

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 10 % of the TF-IDF rows for evaluation. Stratifying on the label keeps
# every class represented in the test part in proportion to its size; without it
# the per-class test support was very uneven across the many sentiment classes.
X_train, X_test, y_train, y_test = train_test_split(
    requredTaxt,
    data['Sentiment'],
    test_size=0.1,
    random_state=42,
    stratify=data['Sentiment'],
)


In [None]:
# (rows, TF-IDF features) of the training part
X_train.shape

(11048, 2306)

In [None]:
# (rows, TF-IDF features) of the held-out part
X_test.shape

(1228, 2306)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Densify the feature matrices if they are still sparse
if hasattr(X_train, 'toarray'):
    X_train = X_train.toarray()
if hasattr(X_test, 'toarray'):
    X_test = X_test.toarray()

# 1. Fit a one-vs-rest wrapper around k-nearest-neighbours and predict the hold-out set
knn_model = OneVsRestClassifier(KNeighborsClassifier())
knn_model.fit(X_train, y_train)
y_pred_knn = knn_model.predict(X_test)

# Compute the metrics first, then report them
knn_accuracy = accuracy_score(y_test, y_pred_knn)
knn_confusion = confusion_matrix(y_test, y_pred_knn)
knn_report = classification_report(y_test, y_pred_knn)

print("\nKNeighborsClassifier Results:")
print(f"Accuracy: {knn_accuracy:.4f}")
print(f"Confusion Matrix:\n{knn_confusion}")
print(f"Classification Report:\n{knn_report}")


KNeighborsClassifier Results:
Accuracy: 0.9837
Confusion Matrix:
[[1 0 0 ... 0 0 0]
 [0 4 0 ... 0 0 0]
 [0 0 6 ... 0 0 0]
 ...
 [0 0 0 ... 6 0 0]
 [0 0 0 ... 0 8 0]
 [0 0 0 ... 0 0 5]]
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         1
           1       1.00      1.00      1.00         4
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00         2
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00         2
           6       1.00      1.00      1.00         3
           7       1.00      1.00      1.00         4
           8       0.86      1.00      0.92         6
           9       1.00      1.00      1.00         7
          10       1.00      1.00      1.00         6
          11       1.00      1.00      1.00         5
          12       1.00      1.00      1.00         6
          13       1.00      1.00 

In [None]:
# Predict the sentiment label of a single piece of text
def pred(input_resume):
    """Return the predicted sentiment label for one raw text.

    The text is cleaned with ``clean_reviews``, re-joined into a single string,
    vectorized with the already fitted ``tfidf``, converted to a dense array and
    classified by ``knn_model``. The predicted code is mapped back to its label
    with ``le.inverse_transform``.

    Parameters
    ----------
    input_resume : str
        Raw text to classify.

    Returns
    -------
    The label of the single prediction (first element of the decoded array).
    """
    tokens = clean_reviews(input_resume)

    # The vectorizer works on strings, so the tokens are joined back together
    features = tfidf.transform([' '.join(tokens)]).toarray()

    encoded_label = knn_model.predict(features)
    return le.inverse_transform(encoded_label)[0]

In [None]:
# Spot-check: classify a single example sentence with pred()
text=" Reflecting on the past and looking ahead."
pred(text)

' Intimidation    '

In [None]:
# Spot-check: classify a short movie opinion with pred()
text=" The movie was very good."
pred(text)

' Excitement   '

In [None]:
# Spot-check: classify another example sentence with pred()
text=" Enjoying a beautiful day at the park!"
pred(text)

' Love         '

In [None]:
# Spot-check: classify a longer sentence containing punctuation with pred()
text="Organizing a virtual talent show during challenging times, bringing smiles to classmates' faces! "
pred(text)

' Happy '