In [123]:
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split

import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\santa_000\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [124]:
df_sentiment=pd.read_csv('https://raw.githubusercontent.com/Santanukolkata/Data_Science/master/datasets/imdb_labelled.tsv',sep='::',names=['Comment','Label'],engine='python')

In [125]:
df_sentiment.head()

Unnamed: 0,Comment,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [126]:
df_sentiment.describe()

Unnamed: 0,Label
count,1000.0
mean,0.5
std,0.50025
min,0.0
25%,0.0
50%,0.5
75%,1.0
max,1.0


In [127]:
df_sentiment.shape

(1000, 2)

In [128]:
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Comment  1000 non-null   object
 1   Label    1000 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 15.8+ KB


In [129]:
df_sentiment.groupby('Label').describe()

Unnamed: 0_level_0,Comment,Comment,Comment,Comment
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,500,499,Not recommended.,2
1,500,498,Definitely worth checking out.,2


In [130]:
vectorizer=CountVectorizer()

In [131]:
#define a function to get rid of stopwords present in the messages
def message_text_process(mess):
    no_punchuation=[char for char in mess if char not in string.punctuation]
    #print(no_punchuation)
    no_punchuation=''.join(no_punchuation)
    #print(no_punchuation)
    return [word for word in no_punchuation.split() if word.lower() not in stopwords.words('english')]
df_sentiment['Comment'].head(1).apply(message_text_process)

0    [slowmoving, aimless, movie, distressed, drift...
Name: Comment, dtype: object

In [132]:

X_train,Y_train,x_label,y_label=train_test_split(df_sentiment['Comment'],df_sentiment['Label'],test_size=.2,random_state=42)

In [133]:
X_train.shape,Y_train.shape,x_label.shape,y_label.shape

((800,), (200,), (800,), (200,))

In [134]:
bagofwords_all=CountVectorizer(analyzer=message_text_process).fit(df_sentiment['Comment'].values)
Comment_bagofwords_train=bagofwords_all.transform(X_train.values)
bagofwords_df=pd.DataFrame(Comment_bagofwords_train.toarray(),columns =bagofwords_all.get_feature_names())
print(bagofwords_df.shape)
tfidf_transformer=TfidfTransformer().fit(Comment_bagofwords_train)
comment_tfidf_train=tfidf_transformer.transform(Comment_bagofwords_train)
print(comment_tfidf_train.shape)

(800, 3259)
(800, 3259)


In [135]:
#Training the models
spam_detection_model=MultinomialNB().fit(comment_tfidf_train,x_label)

In [136]:
Comment_bagofwords_test=bagofwords_all.transform(Y_train.values)
bagofwords_df_test=pd.DataFrame(Comment_bagofwords_test.toarray(),columns =bagofwords_all.get_feature_names())
print(bagofwords_df_test.shape)
tfidf_transformer_test=TfidfTransformer().fit(Comment_bagofwords_test)
tfidf_transformer_test=tfidf_transformer_test.transform(Comment_bagofwords_test)
print(tfidf_transformer_test.shape)

(200, 3259)
(200, 3259)


In [137]:
y_predict=spam_detection_model.predict(tfidf_transformer_test)

In [138]:
confusion_matrix(y_label, y_predict)

array([[74, 17],
       [29, 80]], dtype=int64)

In [139]:
spam_detection_model.predict_proba(tfidf_transformer_test)

array([[0.61035214, 0.38964786],
       [0.72639218, 0.27360782],
       [0.43276864, 0.56723136],
       [0.51335736, 0.48664264],
       [0.21852142, 0.78147858],
       [0.19437155, 0.80562845],
       [0.85085978, 0.14914022],
       [0.52175829, 0.47824171],
       [0.24736772, 0.75263228],
       [0.63516133, 0.36483867],
       [0.4432048 , 0.5567952 ],
       [0.53273438, 0.46726562],
       [0.42842176, 0.57157824],
       [0.50067707, 0.49932293],
       [0.57651597, 0.42348403],
       [0.46902673, 0.53097327],
       [0.3723554 , 0.6276446 ],
       [0.5423995 , 0.4576005 ],
       [0.38404221, 0.61595779],
       [0.38666107, 0.61333893],
       [0.38445292, 0.61554708],
       [0.52401536, 0.47598464],
       [0.70253949, 0.29746051],
       [0.62060671, 0.37939329],
       [0.3029063 , 0.6970937 ],
       [0.69600193, 0.30399807],
       [0.6337699 , 0.3662301 ],
       [0.5143077 , 0.4856923 ],
       [0.73268879, 0.26731121],
       [0.26580388, 0.73419612],
       [0.

In [140]:
print('Testing Accuracy on Training Set:',spam_detection_model.score(Comment_bagofwords_train,x_label))
print('Testing Accuracy on Test Set:',spam_detection_model.score(Comment_bagofwords_test,y_label))

Testing Accuracy on Training Set: 0.96875
Testing Accuracy on Test Set: 0.755


In [141]:
Trainset =pd.DataFrame()

In [142]:
Trainset['Labels']=(y_label)
Trainset['comments']=(Y_train)
Trainset['Predicted']=(y_predict)

In [143]:
y_predict

array([0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
       0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1,
       0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0,
       1, 1], dtype=int64)

In [144]:
type(Train_set)

pandas.core.series.Series

In [145]:
Trainset[Trainset['Predicted']==1].shape

(97, 3)

In [146]:
Trainset

Unnamed: 0,Labels,comments,Predicted
521,0,Watching washing machine twirling around would...,0
737,0,The movie has almost no action scenes in it an...,0
740,1,I just saw this film and I recommend it.,1
660,1,This is a witty and delightful adaptation of t...,0
411,1,Her role was played well.,1
...,...,...,...
408,1,It really created a unique feeling though.,0
332,0,To call this movie a drama is ridiculous!,0
208,0,Another thing I didn't really like is when a c...,0
613,1,Go watch it!,1


In [147]:
Trainset[(Trainset['Predicted']==1) & (Trainset['Labels']==0)].shape

(17, 3)

In [148]:
#False positive
FP=Trainset[(Trainset['Predicted']==1) & (Trainset['Labels']==0)]
FP['Vader_score']=FP['comments'].apply(lambda review: sid.polarity_scores(review))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [149]:
FP

Unnamed: 0,Labels,comments,Predicted,Vader_score
280,0,"It was forced, like everything in this movie.",1,"{'neg': 0.261, 'neu': 0.522, 'pos': 0.217, 'co..."
601,0,There is simply no excuse for something this p...,1,"{'neg': 0.191, 'neu': 0.696, 'pos': 0.113, 'co..."
221,0,There are the usual Hitchcock logic flaws.,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
70,0,"And, quite honestly, often its not very good.",1,"{'neg': 0.22, 'neu': 0.504, 'pos': 0.276, 'com..."
107,0,"I love Lane, but I've never seen her in a movi...",1,"{'neg': 0.329, 'neu': 0.521, 'pos': 0.15, 'com..."
218,0,It's this pandering to the audience that sabot...,1,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound..."
534,0,but the movie makes a lot of serious mistakes.,1,"{'neg': 0.439, 'neu': 0.561, 'pos': 0.0, 'comp..."
583,0,It failed to convey the broad sweep of landsca...,1,"{'neg': 0.145, 'neu': 0.573, 'pos': 0.282, 'co..."
800,0,"In fact, this stinker smells like a direct-to-...",1,"{'neg': 0.253, 'neu': 0.527, 'pos': 0.22, 'com..."
55,0,But I recommend waiting for their future effor...,1,"{'neg': 0.0, 'neu': 0.8, 'pos': 0.2, 'compound..."


In [150]:
FN=Trainset[(Trainset['Predicted']==0) & (Trainset['Labels']==1)].copy()

In [151]:
FN['Vader_score']=FN['comments'].apply(lambda review: sid.polarity_scores(review))

In [152]:
FN['compound']  = FN['Vader_score'].apply(lambda score_dict: score_dict['compound'])

In [153]:
FN

Unnamed: 0,Labels,comments,Predicted,Vader_score,compound
660,1,This is a witty and delightful adaptation of t...,0,"{'neg': 0.0, 'neu': 0.639, 'pos': 0.361, 'comp...",0.9062
76,1,I especially liked the non-cliche choices with...,0,"{'neg': 0.0, 'neu': 0.796, 'pos': 0.204, 'comp...",0.7092
938,1,This movie is great--especially if you enjoy v...,0,"{'neg': 0.0, 'neu': 0.714, 'pos': 0.286, 'comp...",0.4939
986,1,;) Recommend with confidence!,0,"{'neg': 0.0, 'neu': 0.111, 'pos': 0.889, 'comp...",0.7901
621,1,"A mature, subtle script that suggests and occa...",0,"{'neg': 0.088, 'neu': 0.707, 'pos': 0.205, 'co...",0.765
499,1,Later I found myself lost in the power of the ...,0,"{'neg': 0.204, 'neu': 0.796, 'pos': 0.0, 'comp...",-0.3182
261,1,***SPOILERS*** Whatever else can (or can't) be...,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
296,1,Three of the most visually appealing movies i'...,0,"{'neg': 0.0, 'neu': 1.0, 'pos': 0.0, 'compound...",0.0
542,1,"While you don't yet hear Mickey speak, there a...",0,"{'neg': 0.0, 'neu': 0.778, 'pos': 0.222, 'comp...",0.7906
662,1,"However Paul Schrader has indeed made a film ""...",0,"{'neg': 0.0, 'neu': 0.76, 'pos': 0.24, 'compou...",0.6249
