In [60]:
import pandas as pd
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfTransformer,CountVectorizer
from sklearn.metrics import confusion_matrix

In [33]:
df_sentiment=pd.read_csv('imdb_labelled.txt',sep='\t',names=['Comment','Label'])

In [34]:
df_sentiment.head()

Unnamed: 0,Comment,Label
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [35]:
df_sentiment.describe()

Unnamed: 0,Label
count,748.0
mean,0.516043
std,0.500077
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [36]:
df_sentiment.shape

(748, 2)

In [37]:
df_sentiment.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 748 entries, 0 to 747
Data columns (total 2 columns):
Comment    748 non-null object
Label      748 non-null int64
dtypes: int64(1), object(1)
memory usage: 11.8+ KB


In [38]:
df_sentiment.groupby('Label').describe()

Unnamed: 0_level_0,Comment,Comment,Comment,Comment
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
0,362,361,Not recommended.,2
1,386,384,10/10,2


In [39]:
vectorizer=CountVectorizer()

In [40]:
#define a function to get rid of stopwords present in the messages
def message_text_process(mess):
    no_punchuation=[char for char in mess if char not in string.punctuation]
    #print(no_punchuation)
    no_punchuation=''.join(no_punchuation)
    #print(no_punchuation)
    return [word for word in no_punchuation.split() if word.lower() not in stopwords.words('english')]
df_sentiment['Comment'].head(1).apply(message_text_process)

0    [slowmoving, aimless, movie, distressed, drift...
Name: Comment, dtype: object

In [53]:
from sklearn.model_selection import train_test_split
X_train,Y_train,x_label,y_label=train_test_split(df_sentiment['Comment'],df_sentiment['Label'],test_size=.3,random_state=42)

In [54]:
X_train.shape,Y_train.shape,x_label.shape,y_label.shape

((598,), (150,), (598,), (150,))

In [51]:
bagofwords_all=CountVectorizer(analyzer=message_text_process).fit(df_sentiment['Comment'].values)
Comment_bagofwords_train=bagofwords_all.transform(X_train.values)
bagofwords_df=pd.DataFrame(Comment_bagofwords_train.toarray(),columns =bagofwords_all.get_feature_names())
print(bagofwords_df.shape)
tfidf_transformer=TfidfTransformer().fit(Comment_bagofwords_train)
comment_tfidf_train=tfidf_transformer.transform(Comment_bagofwords_train)
print(comment_tfidf_train.shape)

(598, 3257)
(598, 3257)


In [55]:
#Training the models
spam_detection_model=MultinomialNB().fit(comment_tfidf_train,x_label)

In [56]:
Comment_bagofwords_test=bagofwords_all.transform(Y_train.values)
bagofwords_df_test=pd.DataFrame(Comment_bagofwords_test.toarray(),columns =bagofwords_all.get_feature_names())
print(bagofwords_df_test.shape)
tfidf_transformer_test=TfidfTransformer().fit(Comment_bagofwords_test)
tfidf_transformer_test=tfidf_transformer_test.transform(Comment_bagofwords_test)
print(tfidf_transformer_test.shape)

(150, 3257)
(150, 3257)


In [58]:
y_predict=spam_detection_model.predict(tfidf_transformer_test)

In [61]:
confusion_matrix(y_label, y_predict)

array([[52, 24],
       [12, 62]], dtype=int64)

In [64]:
spam_detection_model.predict_proba(tfidf_transformer_test)

array([[0.36330928, 0.63669072],
       [0.510325  , 0.489675  ],
       [0.56268892, 0.43731108],
       [0.55307642, 0.44692358],
       [0.31826974, 0.68173026],
       [0.66749108, 0.33250892],
       [0.28433354, 0.71566646],
       [0.55295919, 0.44704081],
       [0.65378984, 0.34621016],
       [0.48986848, 0.51013152],
       [0.43676572, 0.56323428],
       [0.38875399, 0.61124601],
       [0.44001346, 0.55998654],
       [0.51614057, 0.48385943],
       [0.30574684, 0.69425316],
       [0.5215268 , 0.4784732 ],
       [0.57333348, 0.42666652],
       [0.61460107, 0.38539893],
       [0.56140134, 0.43859866],
       [0.51723719, 0.48276281],
       [0.4421978 , 0.5578022 ],
       [0.4962618 , 0.5037382 ],
       [0.63449048, 0.36550952],
       [0.66806366, 0.33193634],
       [0.39888236, 0.60111764],
       [0.36066495, 0.63933505],
       [0.41577699, 0.58422301],
       [0.6311325 , 0.3688675 ],
       [0.61983299, 0.38016701],
       [0.75996013, 0.24003987],
       [0.

In [75]:
print('Testing Accuracy on Training Set:',spam_detection_model.score(Comment_bagofwords_train,x_label))
print('Testing Accuracy on Test Set:',spam_detection_model.score(Comment_bagofwords_test,y_label))

Testing Accuracy on Training Set: 0.9765886287625418
Testing Accuracy on Test Set: 0.7533333333333333


In [80]:
from sklearn.linear_model import LogisticRegression
logRegg=LogisticRegression(penalty='l2')
logRegg.fit(Comment_bagofwords_train,x_label)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [81]:
print('Testing Accuracy on Training Set:',logRegg.score(Comment_bagofwords_train,x_label))
print('Testing Accuracy on Test Set:',logRegg.score(Comment_bagofwords_test,y_label))

Testing Accuracy on Training Set: 0.9882943143812709
Testing Accuracy on Test Set: 0.7466666666666667


In [82]:
logRegg.predict_proba(tfidf_transformer_test)

array([[0.42470725, 0.57529275],
       [0.5647699 , 0.4352301 ],
       [0.60387943, 0.39612057],
       [0.61689231, 0.38310769],
       [0.45240903, 0.54759097],
       [0.6038814 , 0.3961186 ],
       [0.44104076, 0.55895924],
       [0.62434547, 0.37565453],
       [0.65617793, 0.34382207],
       [0.56034522, 0.43965478],
       [0.49260119, 0.50739881],
       [0.5128492 , 0.4871508 ],
       [0.52072769, 0.47927231],
       [0.55615908, 0.44384092],
       [0.44650995, 0.55349005],
       [0.56196904, 0.43803096],
       [0.61676584, 0.38323416],
       [0.62295279, 0.37704721],
       [0.55401985, 0.44598015],
       [0.62597652, 0.37402348],
       [0.47879891, 0.52120109],
       [0.51649109, 0.48350891],
       [0.65278127, 0.34721873],
       [0.7126288 , 0.2873712 ],
       [0.47972672, 0.52027328],
       [0.48135318, 0.51864682],
       [0.4723314 , 0.5276686 ],
       [0.66099955, 0.33900045],
       [0.66719152, 0.33280848],
       [0.78349373, 0.21650627],
       [0.

In [83]:
y_predict=logRegg.predict(tfidf_transformer_test)
confusion_matrix(y_label, y_predict)

array([[69,  7],
       [38, 36]], dtype=int64)