In [1]:
import numpy as np
import pandas as pd

In [2]:
#reading the dataset
df = pd.read_csv('imbd_dataset.csv',nrows=2000)
#df=pd.read_csv("imdb_labelled.txt",sep='\t',names=['review','sentiment'])

In [3]:
df.head(5)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
df.shape

(2000, 2)

# Performing the text preprocessing

1) Converting review into lowercase

In [5]:
#lowercasing all the review
df['review']=df['review'].str.lower()

In [6]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. <br /><br />the...,positive


2)Removing the html tag

In [7]:
import re
def remov_html_tags(text):
    """
    Removes HTML tags from the input string.

    Parameters
    ----------
    text : str
        The input string containing HTML tags.

    Returns
    -------
    str
        The input string with HTML tags removed.
    """
    pattern = re.compile('<.*?>')
    return pattern.sub('r',text)

In [8]:
df['review']=df['review'].apply(remov_html_tags)

In [9]:
df.head(2)

Unnamed: 0,review,sentiment
0,one of the other reviewers has mentioned that ...,positive
1,a wonderful little production. rrthe filming t...,positive


3)Removing Punctuation(!,#,$,% etc)

In [10]:
import string 
punction_list=string.punctuation

In [11]:
def remov_punction(text):
    """
    Removes HTML tags from the input string.

    Parameters
    ----------
    text : str
        The input string containing punction.

    Returns
    -------
    str
        The input string without the punction.
    """
    return text.translate(str.maketrans('','',punction_list))

In [12]:
df['review']=df['review'].apply(remov_punction)

4) Performing the word tokenization

In [13]:

from nltk.tokenize import word_tokenize
def word_tokenization(text):
    """
    Tokenize the given sentence

    Parameters
    ----------
    text : str
        The input string containing punction.

    Returns
    -------
    list
        The list of tokenize word for given sentences.
    """
    return word_tokenize(text)

In [14]:
df['review']=df['review'].apply(word_tokenization)

In [15]:
df.head(2)

Unnamed: 0,review,sentiment
0,"[one, of, the, other, reviewers, has, mentione...",positive
1,"[a, wonderful, little, production, rrthe, film...",positive


5) Removing the stops word

In [16]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\shiva
[nltk_data]     gaire\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [17]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [18]:
def remov_stop_words(tokenize_list):
    """
    Removes stopWord from the input string.

    Parameters
    ----------
    tokenize_list : List
        The list of strings containing stopwords.

    Returns
    -------
    List
        The list of string without stopwords.
    """
    new_list=[word for word in tokenize_list if word not in stop_words]
    return new_list

In [19]:
df['review']=df['review'].apply(remov_stop_words)

In [20]:
df.head(5)

Unnamed: 0,review,sentiment
0,"[one, reviewers, mentioned, watching, 1, oz, e...",positive
1,"[wonderful, little, production, rrthe, filming...",positive
2,"[thought, wonderful, way, spend, time, hot, su...",positive
3,"[basically, theres, family, little, boy, jake,...",negative
4,"[petter, matteis, love, time, money, visually,...",positive


6) Steamming the data

In [21]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [22]:
def stemming_word(tokenize_word):
    new_list= [stemmer.stem(word) for word in tokenize_word]
    return new_list

In [23]:
df['review']=df['review'].apply(stemming_word)

In [24]:
df.head(5)

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, y...",positive
1,"[wonder, littl, product, rrthe, film, techniqu...",positive
2,"[thought, wonder, way, spend, time, hot, summe...",positive
3,"[basic, there, famili, littl, boy, jake, think...",negative
4,"[petter, mattei, love, time, money, visual, st...",positive


# Vectorization of data

In [25]:
df['sentiment']=df['sentiment'].replace({'negative': 0, 'positive': 1})
df.head(5)

Unnamed: 0,review,sentiment
0,"[one, review, mention, watch, 1, oz, episod, y...",1
1,"[wonder, littl, product, rrthe, film, techniqu...",1
2,"[thought, wonder, way, spend, time, hot, summe...",1
3,"[basic, there, famili, littl, boy, jake, think...",0
4,"[petter, mattei, love, time, money, visual, st...",1


In [26]:
from sklearn.model_selection import train_test_split
X=df['review']
y=df['sentiment']


In [27]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,shuffle=True,random_state=666)

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1,1),lowercase=False)

In [29]:
X_train=[' '.join(tokens) for tokens in X_train]

In [30]:
X_test=[' '.join(tokens) for tokens in X_test]

In [31]:
X_train_vector= vectorizer.fit_transform(X_train)
X_test_vector=vectorizer.transform(X_test)

In [32]:
vectorizer.get_feature_names_out()

array(['007', '02', '0510', ..., 'êtrepeutêtr', 'ís', 'ísnt'],
      dtype=object)

# Building a model

In [33]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

In [34]:
rand_forest=RandomForestClassifier(n_jobs=-1,random_state=666)
svm= SVC(random_state=666)
logistic=LogisticRegression(n_jobs=-1,random_state=666)

In [35]:
rand_forest.fit(X_train_vector,y_train)

In [36]:
logistic.fit(X_train_vector,y_train)

In [37]:
svm.fit(X_train_vector,y_train)

# Model performance Evaluation

In [38]:
from sklearn.metrics import classification_report

For RandomForest

In [39]:
y_pred=rand_forest.predict(X_test_vector)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.78      0.85      0.82       193
           1       0.85      0.78      0.81       207

    accuracy                           0.81       400
   macro avg       0.82      0.82      0.81       400
weighted avg       0.82      0.81      0.81       400



For SVC

In [40]:
y_pred=svm.predict(X_test_vector)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.85      0.85      0.85       193
           1       0.86      0.86      0.86       207

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400



For logistic Regression

In [41]:
y_pred=logistic.predict(X_test_vector)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.84      0.88      0.86       193
           1       0.88      0.85      0.86       207

    accuracy                           0.86       400
   macro avg       0.86      0.86      0.86       400
weighted avg       0.86      0.86      0.86       400



Testing in new data

In [81]:
new_data=[
    'This is very well action movie.',
]

In [82]:
vectorizer_test=TfidfVectorizer(tokenizer=nltk.word_tokenize,
                                stop_words='english',
                                lowercase=True,
                                ngram_range=(1,1)
                               )

In [83]:
test_review_vec=vectorizer.transform(new_data)

In [87]:
predicted_sentiments=logistic.predict(test_review_vec)

In [88]:
# Extract important words for positive and negative sentiments
positive_words = []
negative_words = []

# Iterate over each test review and extract important words based on the predicted sentiment
for i, review in enumerate(new_data):
    if predicted_sentiments[i] == 0:  # Negative sentiment
        important_indices = X_test[i].nonzero()[1]  # Get indices of non-zero TF-IDF values
        for idx in important_indices:
            feature_name = feature_names[idx]
            if model.coef_[0][idx] < 0:  # If coefficient is negative, it's important for negative sentiment
                negative_words.append(feature_name)
    else:  # Positive sentiment
        important_indices = X_test[i].nonzero()[1]  # Get indices of non-zero TF-IDF values
        for idx in important_indices:
            feature_name = feature_names[idx]
            if model.coef_[0][idx] > 0:  # If coefficient is positive, it's important for positive sentiment
                positive_words.append(feature_name)

# Display important words for positive and negative sentiments
print("Important Words for Negative Sentiment:", negative_words)
print("Important Words for Positive Sentiment:", positive_words)

AttributeError: 'str' object has no attribute 'nonzero'