In [1]:
# In this program we Analyzed the Sentiment dataset using Natural Language Processing

import pandas as pd

In [2]:
df_sentiment = pd.read_csv('imdb_labelled.txt', sep='\t', names=['comment', 'label'])

In [3]:
# Comment is a feature which is the text present in the dataset and label is the response or category of the comment

In [4]:
# View first 10 observations of the IMDB comment collection.
# 1 indicates positive sentiment and 0 indicates negative sentiment.
df_sentiment.head(10)

Unnamed: 0,comment,label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1
5,"The rest of the movie lacks art, charm, meanin...",0
6,Wasted two hours.,0
7,Saw the movie today and thought it was a good ...,1
8,A bit predictable.,0
9,Loved the casting of Jimmy Buffet as the scien...,1


In [5]:
# View more information about the sentiment data using describe method
df_sentiment.describe()

Unnamed: 0,label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [8]:
# View more info on the data
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   comment  748 non-null    object
 1   label    748 non-null    int64 
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [6]:
# View the IMDB collection according to responses using group by and describe method
df_sentiment.groupby('label').describe()

Unnamed: 0_level_0,comment,comment,comment,comment
Unnamed: 0_level_1,count,unique,top,freq
label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,10/10,2


In [7]:
# Verify length of the messages and also add it as a new column (feature)
df_sentiment['length'] = df_sentiment['comment'].apply(len)

In [8]:
# View first 5 messages with length
df_sentiment.head()

Unnamed: 0,comment,label,length
0,"A very, very, very slow-moving, aimless movie ...",0,87
1,Not sure who was more lost - the flat characte...,0,99
2,Attempting artiness with black & white and cle...,0,188
3,Very little music or anything to speak of.,0,44
4,The best scene in the movie was when Gerardo i...,1,108


In [9]:
# View the first comment which has linked greater than 50, and index position 0
df_sentiment[df_sentiment['length']>50]['comment'].iloc[0]

'A very, very, very slow-moving, aimless movie about a distressed, drifting young man.  '

In [10]:
# Start text processing with vectorizer. Use the count vectorizer class to convert the collection of text documents to a 
# matrix of tokens. This assigns a numerical value to each word present in the text

from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

In [12]:
# Define a function to get rid of stopwords present in the messages
def message_text_process(mess):
    # Check characters to see if there are punctuations
    no_punctuation = [char for char in mess if char not in string.punctuation]
    # now form the sentence
    no_punctuation = ''.join(no_punctuation)
    # now eliminate any stopwords
    return [word for word in no_punctuation.split() if word.lower not in stopwords.words('english')]

In [13]:
# Verify that the function is working by applying the function and fit the data (comment) into it. Use the technique
# bag of words
import string
from nltk.corpus import stopwords
bag_of_words = CountVectorizer(analyzer=message_text_process).fit(df_sentiment['comment'])

In [14]:
# Apply transform method for the bag of words
comment_bagofwords = bag_of_words.transform(df_sentiment['comment'])

In [15]:
# Appl tfidf transformer and fit the bag of words into it (transformed version)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer().fit(comment_bagofwords)

In [16]:
# Print shape of the tfidf
comment_tfidf = tfidf_transformer.transform(comment_bagofwords)
print(comment_tfidf.shape)

(748, 3458)


In [17]:
# Choose Naive Bayes model to detect sentiment and fit the tfidf data into it
from sklearn.naive_bayes import MultinomialNB
sentiment_detection_model = MultinomialNB().fit(comment_tfidf, df_sentiment['label'])

In [22]:
# Check model for the predicted and expected value say for comment #1 and comment #5 (index 0 & index 4)
comment = df_sentiment['comment'][0]

# transform the comment as bag of words
bag_of_words_for_comment = bag_of_words.transform([comment])

# apply TFIDF transformation
tfidf = tfidf_transformer.transform(bag_of_words_for_comment)

# Predict the response using the predict method of the model and check the actual value present for the first comment in 
# the data set
print('predicted sentiment label ', sentiment_detection_model.predict(tfidf)[0])
print('expected sentiment label ', df_sentiment.label[4])

predicted sentiment label  0
expected sentiment label  0


In [23]:
# The predicted values match the actual value. That proves that the text processing algorithm and model are working properly.