In [11]:
import os
import sys
import numpy as np
import pandas as pd
from os.path import join
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
# from sklearn.naive_bayes import ComplementNB

sys.path.append(os.path.abspath('..'))
## Self Imports
from src.utils import *

In [12]:
import sklearn.naive_bayes

In [13]:
?sklearn.naive_bayes

[1;31mType:[0m        module
[1;31mString form:[0m <module 'sklearn.naive_bayes' from 'c:\\intelpython3\\lib\\site-packages\\sklearn\\naive_bayes.py'>
[1;31mFile:[0m        c:\intelpython3\lib\site-packages\sklearn\naive_bayes.py
[1;31mDocstring:[0m  
The :mod:`sklearn.naive_bayes` module implements Naive Bayes algorithms. These
are supervised learning methods based on applying Bayes' theorem with strong
(naive) feature independence assumptions.


In [153]:
train_path = join('..', 'Data', 'Raw', 'train.csv')
test_path = join('..', 'Data', 'Raw', 'test.csv')
test_labels_path = join('..', 'Data', 'Raw', 'test_labels.csv')

In [15]:
df = pd.read_csv(train_path)

In [16]:
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [17]:
feature = df['comment_text']

In [18]:
feature.head()

0    Explanation\nWhy the edits made under my usern...
1    D'aww! He matches this background colour I'm s...
2    Hey man, I'm really not trying to edit war. It...
3    "\nMore\nI can't make any real suggestions on ...
4    You, sir, are my hero. Any chance you remember...
Name: comment_text, dtype: object

In [118]:
labels = df.drop('comment_text', axis = 1)

In [119]:
labels.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,0,0,0,0,0,0
1,000103f0d9cfb60f,0,0,0,0,0,0
2,000113f07ec002fd,0,0,0,0,0,0
3,0001b41b1c6bb37e,0,0,0,0,0,0
4,0001d958c54c6e35,0,0,0,0,0,0


In [120]:
labels['toxic'].mean()

0.09584448302009764

# Using CountVectorizer

In [61]:
vectorizer1 = CountVectorizer(min_df = 0.001, stop_words='english')
X = vectorizer1.fit_transform(feature)

In [62]:
# print(vectorizer1.get_feature_names())

In [63]:
X.shape

(159571, 3647)

In [46]:
X.data.nbytes/1024/1024

37.866851806640625

In [40]:
type(X)

scipy.sparse.csr.csr_matrix

In [None]:
print(X.toarray())

In [41]:
# vectorizer1.get_feature_names()

# Multinomial NB with Count Vectorizer

In [110]:
xtr, xte, ytr, yte = train_test_split(X, label['toxic'])

In [111]:
clf = MultinomialNB()
clf.fit(xtr, ytr)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [68]:
train_pred = clf.predict(xtr)
train_actual = ytr.values
train_error = np.mean(abs(train_pred - train_actual))
accuracy = 1 - train_error

In [69]:
accuracy

0.9417938133992881

In [70]:
sensitivity(train_actual, train_pred)

0.6932402965547317

In [71]:
specificity(train_actual, train_pred)

0.968127674124181

In [72]:
negative_predictive_value(train_actual, train_pred)

0.9675196941291639

In [73]:
precision(train_actual, train_pred)

0.6973765025884004

In [74]:
f1_score(train_actual, train_pred)

0.6953022482722421

In [75]:
test_pred = clf.predict(xte)
test_actual = yte.values
test_error = np.mean(abs(test_pred - test_actual))
t_accuracy = 1 - test_error

In [76]:
t_accuracy

0.9402902764896097

In [77]:
sensitivity(test_actual, test_pred)

0.6782449725776966

In [78]:
specificity(test_actual, test_pred)

0.9681122448979592

In [79]:
negative_predictive_value(test_actual, test_pred)

0.9659160073037127

In [80]:
precision(test_actual, test_pred)

0.6930878035761943

In [81]:
f1_score(test_actual, test_pred)

0.6855860612460402

# Using TfidfVectorizer

In [82]:
vectorizer2 = TfidfVectorizer(min_df =0.001, stop_words = 'english')
Y = vectorizer2.fit_transform(feature)

In [83]:
Y.shape

(159571, 3647)

In [1]:
# print(vectorizer2.get_feature_names())

In [1]:
# print(Y.toarray())

# Multinomial NB with Tf-idf Vectorizer

In [84]:
y1, y2, z1, z2 = train_test_split(Y, label['toxic']

In [85]:
clf1 = MultinomialNB()
clf1.fit(y1, z1)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [86]:
train_pred1 = clf1.predict(y1)
train_actual1 = z1.values
train_error1 = np.mean(abs(train_pred1 - train_actual1))
accuracy1 = 1 - train_error1

In [87]:
accuracy1

0.9484533498220225

In [99]:
testing_df = pd.DataFrame(['You Suck', 'Hello Friends'])

In [103]:
testing_X = vectorizer2.transform(testing_df[0])

In [178]:
pred_result = clf1.predict_proba(testing_X)

In [179]:
pred_result

array([[0.04762862, 0.95237138],
       [0.8417012 , 0.1582988 ]])

In [181]:
pred_result.max()

0.9523713814835532

In [180]:
for x in range(len(pred_result)):
    m = pred_result.max()
print(m)

0.8417012014436243


In [88]:
test_pred1 = clf.predict(y2)
test_actual1 = z2.values
test_error1 = np.mean(abs(test_pred1 - test_actual1))
t_accuracy1 = 1 - test_error1

In [89]:
t_accuracy1

0.945654626124884

In [90]:
sensitivity(test_actual1, test_pred1)

0.4874397361075869

In [91]:
specificity(test_actual1, test_pred1)

0.9958834000890076

In [92]:
negative_predictive_value(test_actual1, test_pred1)

0.9465947546531303

In [93]:
precision(test_actual1, test_pred1)

0.9284678588690188

In [94]:
f1_score(test_actual1, test_pred1)

0.6392678868552413

In [None]:
def _gen_all_predictions(ft, lbs):
    
    vectorizer2 = TfidfVectorizer(min_df =0.001, stop_words = 'english')
    X = vectorizer2.fit_transform(ft)
    lb_proba = []
    for label in lbs.columns:
        y = lbs[label]
        Xtr, Xte, ytr, yte = train_test_split(X, y)
        clf = MultinomialNB()
        clf.fit(Xtr, ytr)
    
        # Make prediction on test set and return the highest probability
        pred_result = clf1.predict_proba(Xte)
        m = pred_result.max()

    lb_proba.append(m)
    
    return lb_proba

In [None]:
_gen_all_predictions(feature, labels)

In [160]:
labels.columns

Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')

In [None]:
# Creating a path to submission file
save_path = join('..', 'Results', 'simple_subm.csv')

In [None]:
# Write a Kaggle submission function
def write_kaggle_submission(model, test_set, save_path, ):
    prediction = _gen_all_predictions(feature, labels)
    
    submission_df = pd.DataFrame({'id': test_set['id'],  )

### Titanic's Kaggle submission 
save_path = join('..', 'Data', 'Titanic', 'submission_original_tuned.csv')

def write_kaggle_submission(model, test_set, save_path, train_cols):
    
    for i in train_cols: 
        if i not in test_set.columns:
            test_set[i] = 0
            
    prediction = model.predict(test_set.drop("PassengerId", axis = 1))
    
    submission_df = pd.DataFrame({"PassengerId": test_set['PassengerId'], 
                                  "Survived": prediction})
    submission_df.to_csv(save_path, index=False)
    return submission_df
    
write_kaggle_submission(reg, X_test, save_path, X_train.columns)