In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse
%matplotlib inline
seed = 2390

In [90]:
# read data
train = pd.read_csv("Dataset/train.csv")
test = pd.read_csv("Dataset/test.csv")

In [32]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,22256635,"Nonsense? kiss off, geek. what I said is true...",1,0,0,0,0,0
1,27450690,"""\n\n Please do not vandalize pages, as you di...",0,0,0,0,0,0
2,54037174,"""\n\n """"Points of interest"""" \n\nI removed the...",0,0,0,0,0,0
3,77493077,Asking some his nationality is a Racial offenc...,0,0,0,0,0,0
4,79357270,The reader here is not going by my say so for ...,0,0,0,0,0,0


In [33]:
train.isnull().sum()

id               0
comment_text     0
toxic            0
severe_toxic     0
obscene          0
threat           0
insult           0
identity_hate    0
dtype: int64

In [34]:
test.head()

Unnamed: 0,id,comment_text
0,6044863,==Orphaned non-free media (Image:41cD1jboEvL. ...
1,6102620,::Kentuckiana is colloquial. Even though the ...
2,14563293,"Hello fellow Wikipedians,\nI have just modifie..."
3,21086297,"AKC Suspensions \nThe Morning Call - Feb 24, 2..."
4,22982444,== [WIKI_LINK: Talk:Celts] ==


In [36]:
test.isnull().sum()

id              0
comment_text    1
dtype: int64

In [91]:
len(test)

226998

In [92]:
test.fillna("__na__", inplace = True)

In [93]:
len(test)

226998

In [38]:
test.isnull().sum()

id              0
comment_text    0
dtype: int64

### Preprocessing: 
#### #1: remove punctuation + stop words

In [87]:
import string
import re
from progressbar import ProgressBar

In [111]:
def text_preprocessing(sent):
    nopunc = re.sub(r'[^\w\s]','', sent)
    nostop = " ".join([word for word in nopunc.split() if word.lower() not in stopwords.words('english')])
    return nostop

In [107]:
x_train = train["comment_text"]

In [112]:
pbar = ProgressBar()
train_clean = []
for sent in pbar(x_train):
    train_clean.append(text_preprocessing(sent))

100% (95851 of 95851) |###################| Elapsed Time: 0:08:47 Time: 0:08:47


In [113]:
x_test = test["comment_text"]

In [114]:
pbar = ProgressBar()
test_clean = []
for sent in pbar(x_test):
    test_clean.append(text_preprocessing(sent))

100% (226998 of 226998) |#################| Elapsed Time: 0:22:39 Time: 0:22:39


In [115]:
len(test_clean)

226998

### Feature Extraction: Tfidf

In [137]:
vect_word = TfidfVectorizer(max_features=50000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
vect_char = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='char',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)

In [127]:
pbar = ProgressBar()
train_vect = vect_word.fit_transform(train["comment_text"])
#train_vect = vect_word.fit_transform(pbar(train_clean))


In [128]:
pbar = ProgressBar()
test_vect = vect_word.fit_transform(test["comment_text"])
#test_vect = vect_word.fit_transform(pbar(test_clean))

In [129]:
pbar = ProgressBar()
train_char = vect_char.fit_transform(train["comment_text"])
#train_char = vect_char.fit_transform(pbar(train_clean))

In [130]:
pbar = ProgressBar()
test_char = vect_char.fit_transform(test["comment_text"])
#test_char = vect_char.fit_transform(pbar(test_clean))

In [131]:
x_train_feat = sparse.hstack([train_vect, train_char])

In [132]:
x_test_feat = sparse.hstack([test_vect, test_char])

In [133]:
list_classes = ["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]

In [134]:
y_train = train[list_classes]

### LR models

In [135]:
prd = np.zeros((x_test_feat.shape[0],y_train.shape[1]))
cv_score =[]
for i,col in enumerate(list_classes):
    lr = LogisticRegression(C=4,random_state = i)
    print('Building {} model for column:{''}'.format(i,col)) 
    lr.fit(x_train_feat,y_train[col])
    #cv_score.append(lr.score)
    prd[:,i] = lr.predict_proba(x_test_feat)[:,1]

Building 0 model for column:toxic
Building 1 model for column:severe_toxic
Building 2 model for column:obscene
Building 3 model for column:threat
Building 4 model for column:insult
Building 5 model for column:identity_hate


In [136]:
prd_1 = pd.DataFrame(prd,columns=y_train.columns)
submit = pd.concat([test['id'],prd_1],axis=1)
#submit.to_csv('toxic_lr.csv.gz',compression='gzip',index=False)
submit.to_csv('Results/toxic_lr.csv',index=False)
submit.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,6044863,0.027809,0.00423,0.002204,0.000684,0.002258,0.000596
1,6102620,0.003671,0.003226,0.003266,0.000486,0.00333,0.000974
2,14563293,0.006272,0.002756,0.004574,0.000567,0.00515,0.000677
3,21086297,0.014534,0.004562,0.004203,0.000673,0.00481,0.000995
4,22982444,0.024387,0.005798,0.004438,0.000619,0.007344,0.001088
