In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# Note: this cell only imports libraries and silences UserWarnings; it does not list the input directory.

import os
import warnings
warnings.simplefilter("ignore", UserWarning)


from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, log_loss

In [4]:
# Locations of the competition files, relative to the notebook's working directory.
train_csv = '../input/train.csv'
test_csv = '../input/test.csv'
sample_csv = '../input/sample_submission.csv'

# Only the first 50,000 training rows are read; the test set and the sample
# submission are read in full.
train = pd.read_csv(train_csv, nrows=50000)
test = pd.read_csv(test_csv)
sample = pd.read_csv(sample_csv)

In [None]:
# Apply the same text clean-up to both frames: replace missing comments with
# the placeholder string 'missing', then lower-case everything.
for frame in (train, test):
    frame['comment_text'] = frame['comment_text'].fillna('missing').str.lower()

In [None]:
# Target columns; one binary classifier is trained per label further down.
labels = [
    'toxic',
    'severe_toxic',
    'obscene',
    'threat',
    'insult',
    'identity_hate',
]

# Shuffle and hold out 10% of the loaded training rows for validation.
# The fixed random_state keeps the split reproducible between runs.
text_train, text_valid, y_train, y_valid = train_test_split(
    train['comment_text'].values,
    train[labels],
    test_size=0.1,
    shuffle=True,
    random_state=42,
)

In [None]:
# Building Basic Models
# TF-IDF text vectorizers: word n-grams (1-3) and character n-grams (2-6).
# Boolean options are passed as real booleans (not 1), because recent
# scikit-learn releases validate parameter types.
wordvect = TfidfVectorizer(
    min_df=3,
    max_features=20000,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
    stop_words='english',
)
# `token_pattern` and `stop_words` only apply to analyzer='word'; they were
# ignored for the char analyzer (and only produced warnings), so they are
# omitted here. The extracted character features are unchanged.
charvect = TfidfVectorizer(
    min_df=3,
    max_features=50000,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

# The vocabulary and IDF weights are learned on train + test text together
# (no labels involved). NOTE: this includes the validation rows, so the
# validation features are not strictly out-of-sample.
all_text = pd.concat([train['comment_text'], test['comment_text']])

wordvect.fit(all_text)
charvect.fit(all_text)

word_train = wordvect.transform(text_train)
word_valid = wordvect.transform(text_valid)
word_test  = wordvect.transform(test.comment_text.values)

char_train = charvect.transform(text_train)
char_valid = charvect.transform(text_valid)
char_test  = charvect.transform(test.comment_text.values)

# Stack word and char features side by side. CSR is requested explicitly:
# hstack returns COO by default, which cannot be row-indexed, while
# cross-validation needs to slice rows into folds.
X_train = hstack([word_train, char_train], format='csr')
X_valid = hstack([word_valid, char_valid], format='csr')
X_test  = hstack([word_test, char_test], format='csr')

In [None]:
# Fitting a simple Logistic Regression on TFIDF
# --- Parameter Tuning ---
# --- Added Class Weight : Score Improved ---
# --- Changed Penalty to l1 : No improvement ---
# --- CV : No improvement ---

# One column of predicted positive-class probabilities per label.
pred_train = pd.DataFrame()
pred_valid = pd.DataFrame()
pred_test = pd.DataFrame()

# `DataFrame.ix` was removed in pandas 1.0; plain column assignment is the
# supported way to add a column to an empty frame.
pred_test['id'] = sample['id']

# Search grids are loop-invariant, so they are defined once.
# param_grid1 is an alternative grid that is not currently searched.
param_grid1 = {'class_weight': ['balanced', None],
               'C': [0.1, 0.5, 1, 1.5]}
param_grid2 = {'dual': [False, True]}

for label in labels:
    # solver='liblinear' is pinned because it is the only solver that supports
    # dual=True (searched in param_grid2); the current default solver rejects it.
    # `iid` is no longer accepted by GridSearchCV and has been dropped.
    # n_jobs is set on the grid search, where the candidate/fold fits can
    # actually run in parallel; it has no effect on a binary liblinear model.
    model = GridSearchCV(
        estimator=LogisticRegression(
            class_weight='balanced',
            C=1,
            solver='liblinear',
        ),
        param_grid=param_grid2,
        scoring='roc_auc',
        cv=3,
        n_jobs=-1,
    )

    # Announce the label before the (slow) fit rather than after it.
    print('Fitting ...', label)
    model.fit(X_train, y_train[label])
    print(model.best_params_)

    # GridSearchCV refits the best candidate on all of X_train by default,
    # so predict_proba uses that refitted estimator. Column 1 is P(label == 1).
    pred_train[label] = model.predict_proba(X_train)[:, 1]
    pred_valid[label] = model.predict_proba(X_valid)[:, 1]
    pred_test[label] = model.predict_proba(X_test)[:, 1]


# Per-label metrics, in the same order as `labels`:
#   - log loss on the training split,
#   - log loss on the validation split,
#   - ROC AUC on the validation split.
train_logloss = [log_loss(y_train[name], pred_train[name]) for name in labels]
valid_logloss = [log_loss(y_valid[name], pred_valid[name]) for name in labels]
valid_roc_auc = [roc_auc_score(y_valid[name], pred_valid[name]) for name in labels]

# Report the unweighted mean of each metric across all labels.
mean_train_loss = np.mean(train_logloss)
mean_valid_loss = np.mean(valid_logloss)
mean_valid_auc = np.mean(valid_roc_auc)

print('Training loss: ', mean_train_loss)
print('Logistic loss :', mean_valid_loss)
print('Logistic AUC :', mean_valid_auc)

In [None]:
# Write the test-set predictions to disk, without the DataFrame index column.
pred_test.to_csv(path_or_buf='submission.csv', index=False)