In [18]:
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import pickle

In [2]:
# Load the tab-separated Formspring dump and preview the first rows.
doc = pd.read_csv(
    "../Datasets/formspring_data.csv",
    sep="\t",
)
doc.head()

Unnamed: 0,userid,post,ques,ans,asker,ans1,severity1,bully1,ans2,severity2,bully2,ans3,severity3,bully3
0,aguitarplayer94,Q: what&#039;s your favorite song? :D<br>A: I ...,what&#039;s your favorite song? :D<br>,I like too many songs to have a favorite,,No,0,,No,0,,No,0,
1,aprilpooh15,Q: <3<br>A: </3 ? haha jk! <33,<3,</3 ? haha jk! <33,,No,0,,No,0,,No,0,
2,aprilpooh15,Q: &quot;hey angel you duh sexy&quot;<br>A: R...,&quot;hey angel you duh sexy&quot;,Really?!?! Thanks?! haha,,No,0,,No,0,,No,0,
3,aprilpooh15,Q: (:<br>A: ;(,(:,;(,,No,0,,No,0,,No,0,
4,aprilpooh15,Q: ******************MEOWWW*******************...,******************MEOWWW*************************,*RAWR*?,,No,0,,No,0,,No,0,


In [3]:
# Dataset integrity check: number of missing values in each column.
doc.isna().sum()

userid           0
post             0
ques             2
ans              3
asker            0
ans1            42
severity1        1
bully1       10864
ans2            38
severity2        0
bully2       10967
ans3            45
severity3        0
bully3       10947
dtype: int64

In [4]:
# Check for bias: how often each severity1 value occurs.
severity1_counts = doc['severity1'].value_counts()
severity1_counts

0       11484
None      214
1         201
2         174
3         140
5         131
6         104
4          98
7          76
8          70
10         52
9          28
Name: severity1, dtype: int64

In [5]:
#cleaning nulls
# Results are assigned back to the column instead of using inplace=True on a
# column selection, so the change is guaranteed to land in `doc`.

# A missing answer label is treated as 'No'; a missing severity1 as 0.
for answer_col in ('ans1', 'ans2', 'ans3'):
    doc[answer_col] = doc[answer_col].fillna('No')
doc['severity1'] = doc['severity1'].fillna(0)

# Malformed severity entries, listed per column; every one of them is mapped to 0.
# The replacement value is always passed explicitly: calling replace() with only
# the value to look for does not write 0 (older pandas forward-fills from the
# previous row instead).
severity_junk = {
    'severity1': ['None'],
    'severity2': ['None', 'o', '0`', 'n/a0'],
    'severity3': ['None', 'o', '`0', 'N/a'],
}
for severity_col, junk_values in severity_junk.items():
    doc[severity_col] = doc[severity_col].replace(junk_values, 0)

# Re-check the per-column null counts after cleaning.
doc.isnull().sum()

userid           0
post             0
ques             2
ans              3
asker            0
ans1             0
severity1        0
bully1       10864
ans2             0
severity2        0
bully2       10967
ans3             0
severity3        0
bully3       10947
dtype: int64

In [6]:
# Collapse the three label sets (ans1-3 / severity1-3) into one record per post:
#   [post text, True if any answer label is 'Yes', highest severity as int]
label_rows = zip(
    doc['post'],
    doc['ans1'], doc['ans2'], doc['ans3'],
    doc['severity1'], doc['severity2'], doc['severity3'],
)
dts = [
    [post, 'Yes' in [a1, a2, a3], max(int(s1), int(s2), int(s3))]
    for post, a1, a2, a3, s1, s2, s3 in label_rows
]

In [7]:
# Spot check: index 2 of a record holds the maximum severity (here for record 5).
dts[5][2]

0

In [8]:
# Normalise the post text in place: strip HTML markup and entities, drop the
# 'Q:' / 'A:' markers, and lower-case the result.
for record in dts:
    # Name the parser explicitly: without it BeautifulSoup picks whichever parser
    # is installed (and warns on every call), so the extracted text could differ
    # between environments.
    text = BeautifulSoup(record[0], 'html.parser').get_text()
    record[0] = text.replace('Q:', '').replace('A:', '').lower()

In [9]:
#-------------------------------------------------------
#End of data cleaning
#-------------------------------------------------------

In [10]:
# Replace each post with its Porter-stemmed tokens, re-joined by single spaces.
ps = PorterStemmer()
for record in dts:
    record[0] = ' '.join(ps.stem(token) for token in word_tokenize(record[0]))

In [11]:
#clean stop words using CountVectorizer itself
# Bag-of-words features limited to the 6000 most frequent terms.
vect = CountVectorizer(stop_words='english', max_features=6000)

# Hand the vectorizer only the text column. Converting the whole of `dts` with
# np.array() first would stringify the bool/int columns and allocate a
# fixed-width unicode array padded to the longest post.
posts = [record[0] for record in dts]

# Dense array kept so X has the same type and shape as before.
X = vect.fit_transform(posts).toarray()

In [12]:
# Binary target: 1 when the record's bullying flag is True, else 0.
# Previously the int conversion was computed but never assigned, so `y` stayed an
# array of the strings 'True' / 'False'. Build it directly from the boolean flags.
# NOTE(review): a model fitted on this `y` predicts 0/1 rather than the strings
# 'False'/'True' -- confirm anything that loads the saved model expects that.
y = np.array([record[1] for record in dts], dtype=int)
y

array([0, 0, 0, ..., 0, 0, 1])

In [13]:
# 75/25 train/test split; the fixed random_state keeps the split reproducible.
split = train_test_split(X, y, train_size=0.75, random_state=42)
X_train, X_test, y_train, y_test = split
X_train.shape

(9579, 6000)

In [14]:
# Fit a multinomial Naive Bayes classifier on the training split.
# Despite the name `gnb`, this is a MultinomialNB, not a GaussianNB; the name is
# kept because later cells refer to it.
gnb=MultinomialNB()
gnb.fit(X_train,y_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [15]:
# Predict on the held-out split and tabulate the predictions against the truth.
y_pred = gnb.predict(X_test)
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)

In [16]:
# Show the confusion matrix computed from y_test and y_pred.
print(cm)

[[2492  223]
 [ 260  219]]


In [17]:
# Per-class precision / recall / F1 on the held-out split.
report = classification_report(y_true=y_test, y_pred=y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.91      0.92      0.91      2715
        True       0.50      0.46      0.48       479

    accuracy                           0.85      3194
   macro avg       0.70      0.69      0.69      3194
weighted avg       0.84      0.85      0.85      3194



In [19]:
# Persist the fitted vectorizer to 'vectorizer.pickle' in the working directory.
with open('vectorizer.pickle', 'wb') as out_file:
    pickle.dump(vect, out_file)

In [20]:
# Persist the trained classifier to 'mnb.pickle' in the working directory.
with open('mnb.pickle', 'wb') as out_file:
    pickle.dump(gnb, out_file)