In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gc

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import log_loss,confusion_matrix,classification_report,roc_curve,auc

import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy import sparse
%matplotlib inline
# Fixed random seed intended for reproducible runs.
# NOTE(review): none of the cells visible here read `seed` (the models below
# pass random_state=i instead) — confirm whether a later cell uses it.
seed = 2390

In [2]:
# Load the train and test CSV files from the local Dataset/ directory
# into two DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("Dataset/train.csv", "Dataset/test.csv")
)

In [3]:
# Replace every missing value in the training frame with the "__na__"
# placeholder so no NaN values remain for the steps that follow.
train = train.fillna(value="__na__")

In [4]:
# Replace every missing value in the test frame with the "__na__"
# placeholder, mirroring the treatment of the training frame.
test = test.fillna(value="__na__")

### Preprocess: Tfidf

In [6]:
# Two TF-IDF views of the comment text:
#   * vect_word — lower-cased word 1- to 3-grams with English stop words
#     removed, limited to 50,000 features;
#   * vect_char — lower-cased character 1- to 3-grams, limited to 20,000
#     features.
# Both emit float32 values.
vect_word = TfidfVectorizer(max_features=50000, lowercase=True, analyzer='word',
                        stop_words= 'english',ngram_range=(1,3),dtype=np.float32)
# stop_words is only honoured by the 'word' analyzer; with analyzer='char'
# scikit-learn ignores it (and warns), so it is not passed here. The
# resulting character features are unchanged.
vect_char = TfidfVectorizer(max_features=20000, lowercase=True, analyzer='char',
                        ngram_range=(1,3),dtype=np.float32)

In [7]:
# Learn the word n-gram vocabulary and IDF weights from the training comments
# only, then project the test comments into that same feature space.
# Calling fit_transform on the test set would re-learn a different vocabulary,
# so column j of the test matrix would no longer be the same term as column j
# of the training matrix and the model's predictions would be meaningless.
train_vect = vect_word.fit_transform(train["comment_text"])
test_vect = vect_word.transform(test["comment_text"])

In [8]:
# Learn the character n-gram vocabulary and IDF weights from the training
# comments only, then reuse the fitted vectorizer on the test comments.
# Re-fitting on the test set would produce columns that do not line up with
# the columns the classifier is trained on.
train_char = vect_char.fit_transform(train["comment_text"])
test_char = vect_char.transform(test["comment_text"])

In [9]:
# Concatenate the word-level and character-level TF-IDF matrices column-wise
# into a single training feature matrix.
# hstack returns COO by default, which cannot be row-indexed and is converted
# again on every estimator call; requesting CSR does the conversion once.
X_train = sparse.hstack([train_vect, train_char], format="csr")

In [10]:
# Concatenate the word-level and character-level TF-IDF matrices column-wise
# into a single test feature matrix, in the same column order as training.
# CSR is requested so the per-label predict_proba calls do not each convert
# the default COO result again.
X_test = sparse.hstack([test_vect, test_char], format="csr")

In [11]:
# Names of the binary label columns; one model is trained per entry, and the
# order here fixes the column order of the prediction matrix.
list_classes = [
    "toxic",
    "severe_toxic",
    "obscene",
    "threat",
    "insult",
    "identity_hate",
]

In [12]:
# Target frame: all training rows, restricted to the label columns named in
# list_classes (kept in that order).
y = train.loc[:, list_classes]

### LR models

In [15]:
# Fit one independent logistic-regression model per label column and store
# its predicted probability for the test rows in the matching column of `prd`
# (rows = test samples, columns = labels in list_classes order).
prd = np.zeros(shape=(X_test.shape[0], y.shape[1]))
cv_score = []  # initialised here but not populated anywhere in this cell
for i, col in enumerate(list_classes):
    lr = LogisticRegression(C=4, random_state=i)
    print('Building {} model for column:{}'.format(i, col))
    lr.fit(X_train, y[col])
    # Column 1 of predict_proba is the probability of the second class in
    # lr.classes_ — presumably the positive label 1; confirm labels are 0/1.
    label_proba = lr.predict_proba(X_test)
    prd[:, i] = label_proba[:, 1]

Building 0 model for column:toxic
Building 1 model for column:severe_toxic
Building 2 model for column:obscene
Building 3 model for column:threat
Building 4 model for column:insult
Building 5 model for column:identity_hate
