# Importing necessary packages

In [14]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from unidecode import unidecode
import re
from sklearn.pipeline import Pipeline
import pickle

### Cleaning Function

In [3]:
# Compiled in the same cell as its only consumer so that clean_text() is usable
# as soon as this cell has run (for example when only the "New data" section is
# executed). Matches every character that is NOT an ASCII letter, a space, or
# one of . - ? ! , # @ %
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)


def clean_text(x):
    """Transliterate a string to ASCII and strip disallowed characters.

    Parameters
    ----------
    x : str
        Raw text. Callers in this notebook pass ``str(value)``.

    Returns
    -------
    str
        ``x`` passed through ``unidecode`` and then with every match of the
        module-level ``special_character_removal`` pattern removed (digits,
        newlines and any punctuation outside ``. - ? ! , # @ %``).
    """
    x_ascii = unidecode(x)
    # The pattern is looked up at call time, so re-assigning the module-level
    # name in a later cell still takes effect here.
    return special_character_removal.sub('', x_ascii)

# Loading training data

In [4]:
# Read the labelled training set; the path is relative to the notebook's
# working directory.
train_df = pd.read_csv('train.csv')

# Preprocessing and data cleaning 

In [5]:
# Keep only the raw comment and the `toxic` label. The explicit .copy() makes
# `data` an independent frame, so the column assignment below cannot raise
# SettingWithCopyWarning or write through to train_df.
data = train_df.loc[:, ["comment_text", "toxic"]].copy()

# Matches every character that is NOT an ASCII letter, a space, or one of
# . - ? ! , # @ %  (so digits and newlines are removed by clean_text).
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)

# Fill missing comments BEFORE cleaning. str(NaN) is the literal text "nan",
# so a fillna() applied after the str()/clean step can never fire and missing
# comments would be fed to the model as the word "nan".
data['clean_text'] = (
    data['comment_text']
    .fillna("something")
    .apply(lambda x: clean_text(str(x)))
)

# Splitting into training and testing data

In [6]:
# Features are the cleaned comments (third column of `data`); targets are the
# `toxic` labels (second column).
x = data['clean_text']
y = data['toxic']

# Hold out 33% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x.values,
    y.values,
    test_size=0.33,
    random_state=42,
)

# Last expression of the cell: display the four array shapes as a sanity check.
x_train.shape, y_train.shape, x_test.shape, y_test.shape

((106912,), (106912,), (52659,), (52659,))

# Feature extraction and model training

In [8]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial Naive Bayes, chained
# so that the raw cleaned strings can be passed straight to fit()/predict().
text_clf = Pipeline(
    [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', MultinomialNB()),
    ]
)

# fit() returns the fitted pipeline, which the notebook displays as cell output.
text_clf.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

# Saving trained model

In [9]:
# Save the fitted pipeline to disk so it can be reloaded without retraining.
# `pickle` is already imported in the notebook's import cell.
filename = 'nb_tfidf.sav'

# The context manager guarantees the file is flushed and closed even if
# pickling raises; a bare open() would leave the handle to the garbage collector.
with open(filename, 'wb') as model_file:
    pickle.dump(text_clf, model_file)

# Results on testing data

In [10]:
# Predict labels for the held-out comments.
pred = text_clf.predict(x_test)

# Overall accuracy on the held-out split.
score = metrics.accuracy_score(y_true=y_test, y_pred=pred)
print('Score using TFIDF is: ', score)

# Confusion matrix: rows are true labels, columns are predicted labels.
# Read it alongside the accuracy, which alone does not show per-class errors.
cm = metrics.confusion_matrix(y_true=y_test, y_pred=pred)
print(cm)

Score using TFIDF is:  0.9157978693100894
[[47571     5]
 [ 4429   654]]


# New data

#### This section is only used for predicting on new data. It is not needed for training the model.

In [12]:
# Reload the pipeline saved by the training section.
# NOTE: pickle.load can execute arbitrary code; only load files you produced
# yourself or otherwise trust.
# The context manager closes the file handle as soon as loading finishes.
with open('nb_tfidf.sav', 'rb') as model_file:
    loaded_model = pickle.load(model_file)

In [13]:
# Score only the first 10,000 rows of the unlabelled file.
new_comments = pd.read_csv('test.csv').iloc[:10000, :]['comment_text']

# Fill missing comments BEFORE cleaning: str(NaN) is the literal text "nan",
# so a fillna() applied after the str()/clean step can never fire.
# `new_data` ends up as an array of cleaned strings, which the next cell
# indexes by position.
new_data = (
    new_comments
    .fillna("something")
    .apply(lambda x: clean_text(str(x)))
    .values
)

# Use the pipeline reloaded from disk rather than the in-memory `text_clf`,
# so this section works in a session where the training cells were not run.
predicted_new_data = loaded_model.predict(new_data)


In [16]:
# Collect the cleaned text of every comment predicted as class 1, printing the
# row position of each hit as it is found.
h_detected = []
for position, label in enumerate(predicted_new_data):
    if label == 1:
        print(position)
        h_detected.append(new_data[position])

59
70
126
136
175
193
211
215
256
309
332
339
349
419
481
491
499
544
648
698
751
786
800
803
812
887
933
975
1012
1028
1030
1043
1107
1142
1153
1176
1242
1270
1284
1345
1347
1348
1365
1367
1394
1407
1415
1417
1436
1446
1521
1527
1555
1585
1640
1703
1760
1797
1841
1861
1906
1918
1938
1958
1984
2062
2085
2099
2111
2176
2179
2182
2374
2385
2408
2412
2467
2468
2502
2530
2592
2723
2727
2784
2837
2858
2880
2986
2993
2997
3086
3129
3151
3355
3377
3416
3433
3455
3475
3489
3520
3622
3664
3681
3706
3747
3786
3884
3903
3908
3910
3983
4033
4116
4145
4193
4230
4378
4407
4411
4442
4565
4578
4585
4720
4791
4803
4819
4930
4954
4966
4967
4972
4994
5014
5015
5075
5081
5083
5086
5161
5218
5223
5245
5267
5323
5405
5406
5472
5477
5560
5641
5692
5693
5706
5775
5786
5789
5803
5825
5858
5872
5919
5932
5947
6007
6049
6080
6189
6217
6326
6449
6458
6524
6540
6548
6567
6571
6587
6631
6649
6653
6677
6728
6756
6792
6806
6845
6866
6882
6898
6903
6908
7031
7118
7133
7246
7276
7288
7337
7339
7343
7350
7378
7422
7473
