# Importing necessary packages

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
#import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfTransformer
from unidecode import unidecode
import re
from sklearn.pipeline import Pipeline
import pickle

### Cleaning Function

In [2]:
def clean_text(x):
    x_ascii = unidecode(x)
    x_clean = special_character_removal.sub('',x_ascii)
    return x_clean

# Loading training data

In [3]:
train_df= pd.read_csv('train.csv')

In [4]:
train_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\r\nWhy the edits made under my use...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\r\nMore\r\nI can't make any real suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Preprocessing and data cleaning 

In [6]:
data=train_df.loc[:,["comment_text","toxic"]]
special_character_removal = re.compile(r'[^A-Za-z\.\-\?\!\,\#\@\% ]',re.IGNORECASE)
data['clean_text'] = data['comment_text'].apply(lambda x: clean_text(str(x)))
data['clean_text']= data['clean_text'].fillna("something")

In [7]:
data.head()

Unnamed: 0,comment_text,toxic,clean_text
0,Explanation\r\nWhy the edits made under my use...,0,ExplanationWhy the edits made under my usernam...
1,D'aww! He matches this background colour I'm s...,0,Daww! He matches this background colour Im see...
2,"Hey man, I'm really not trying to edit war. It...",0,"Hey man, Im really not trying to edit war. Its..."
3,"""\r\nMore\r\nI can't make any real suggestions...",0,MoreI cant make any real suggestions on improv...
4,"You, sir, are my hero. Any chance you remember...",0,"You, sir, are my hero. Any chance you remember..."


# Splitting in training and testing data

In [8]:
x=data.iloc[:,2]
y=data.iloc[:,1]

x_train, x_test, y_train, y_test = train_test_split(x.values, y.values, test_size=0.33, random_state=42)

x_train.shape, y_train.shape, x_test.shape, y_test.shape

((106912,), (106912,), (52659,), (52659,))

# Features extraction and model training

In [None]:
text_clf = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()), ('clf', MultinomialNB()), ])
text_clf.fit(x_train, y_train)

# Saving trained model

In [None]:
# save the model to disk
import pickle
filename = 'nb_tfidf.sav'
pickle.dump(text_clf, open(filename, 'wb'))

# Results on testing data

In [None]:
pred = text_clf.predict(x_test)

# Calculate the accuracy score: score
score = metrics.accuracy_score(y_test, pred)
print ('Score using TFIDF is: ',score)
    # Calculate the confusion matrix: cm
cm = metrics.confusion_matrix(y_test, pred)
print(cm)

# New data

#### This will only be used for new data. This part is not needed for training the model

In [None]:
loaded_model = pickle.load(open('nb_tfidf.sav', 'rb'))

In [None]:
new_data= pd.read_csv('test.csv')
new_data=new_data.iloc[:10000,:]

new_data['clean_text'] = new_data['comment_text'].apply(lambda x: clean_text(str(x)))

new_data['clean_text']= new_data['clean_text'].fillna("something")
new_data=new_data.clean_text.values

predicted_new_data = text_clf.predict(new_data)


In [None]:
h_detected=[]
for i in range (0,len(predicted_new_data)):
    if predicted_new_data[i]==1:
        print (i)
        h_detected.append(new_data[i])