Code by Xavier Conort - June 15

# Extract Value from text

We use here the dataset from the Kaggle Competition "Detecting Insults in Social Commentary" 
https://www.kaggle.com/c/detecting-insults-in-social-commentary

The goal is to learn how to produce document term matrices with Sklearn

# Read data

In [1]:
import pandas as pd
insults = pd.read_csv('insults_train.csv')
test_data = pd.read_csv("test_with_solutions.csv")

In [2]:
import numpy as np
y_train = np.array(insults.Insult)
comments_train = np.array(insults.Comment)

In [3]:
print(comments_train[0])
print("Insult: %d" % y_train[0])

"You fuck your dad."
Insult: 1


In [4]:
print(comments_train[5])
print("Insult: %d" % y_train[5])

"@SDL OK, but I would hope they'd sign him to a one-year contract to start with. Give him the chance to be reliable and productive, but give themselves the out if all his time off has hurt his playing skills or if he falls back into old habits."
Insult: 0


# Vectorizing the Data with bag of words

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

Use bag of words model as implemented in CountVectorizer.
Extracts a dictionary, then counts word occurences.

In [6]:
cv = CountVectorizer()
cv.fit(comments_train)
print(cv.get_feature_names()[:15])

['00', '000', '01', '014', '01k4wu4w', '02', '034', '05', '06', '0612', '07', '075', '08', '09', '0bama']


In [7]:
cv

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [8]:
print(cv.get_feature_names()[1000:1015])

['argue', 'argued', 'arguement', 'arguements', 'arguing', 'argument', 'arguments', 'aries', 'aristocracy', 'aritculett', 'arizona', 'arkan', 'arlington', 'arm', 'armando']


In [9]:
X_train = cv.transform(comments_train)
print("X_train.shape: %s" % str(X_train.shape))
print(X_train[0, :])

X_train.shape: (3947, 16469)
  (0, 3409)	1
  (0, 5434)	1
  (0, 16397)	1
  (0, 16405)	1


In [10]:
cv.transform(comments_train)

<3947x16469 sparse matrix of type '<class 'numpy.int64'>'
	with 100269 stored elements in Compressed Sparse Row format>

In [11]:
comments_test = np.array(test_data.Comment)
y_test = np.array(test_data.Insult)
X_test = cv.transform(comments_test)

Fit Regularized Logistic regression

In [12]:
param_grid = {'C': [0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100] }

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV



In [14]:
logreg = GridSearchCV(LogisticRegression(penalty='l2'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg.fit(X_train, y_train)
logreg.grid_scores_

[mean: 0.82118, std: 0.00985, params: {'C': 0.01},
 mean: 0.84667, std: 0.00818, params: {'C': 0.03},
 mean: 0.86380, std: 0.00974, params: {'C': 0.1},
 mean: 0.87041, std: 0.01038, params: {'C': 0.3},
 mean: 0.87069, std: 0.01161, params: {'C': 1},
 mean: 0.86724, std: 0.01338, params: {'C': 3},
 mean: 0.86142, std: 0.01475, params: {'C': 10},
 mean: 0.85623, std: 0.01591, params: {'C': 30},
 mean: 0.85150, std: 0.01652, params: {'C': 100}]

In [15]:
from sklearn import metrics
logreg_pred = logreg.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, logreg_pred)

0.87100091424554071

# Try binary matrix

In [16]:
binary = CountVectorizer(binary=True)
binary.fit(comments_train)

CountVectorizer(analyzer='word', binary=True, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [17]:
Xb_train = binary.transform(comments_train).tocsr()

In [18]:
logreg_bin = GridSearchCV(LogisticRegression(penalty='l2'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg_bin.fit(Xb_train, y_train)
logreg_bin.grid_scores_

[mean: 0.81432, std: 0.01566, params: {'C': 0.01},
 mean: 0.84606, std: 0.01163, params: {'C': 0.03},
 mean: 0.86499, std: 0.00918, params: {'C': 0.1},
 mean: 0.87296, std: 0.00961, params: {'C': 0.3},
 mean: 0.87389, std: 0.01195, params: {'C': 1},
 mean: 0.87055, std: 0.01355, params: {'C': 3},
 mean: 0.86492, std: 0.01524, params: {'C': 10},
 mean: 0.85929, std: 0.01682, params: {'C': 30},
 mean: 0.85389, std: 0.01807, params: {'C': 100}]

In [19]:
Xb_test = binary.transform(comments_test).tocsr()
logreg_bin_pred = logreg_bin.predict_proba(X_test)[:,1]
metrics.roc_auc_score(y_test, logreg_bin_pred)

0.87771522802229041

# Try tfidf

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [21]:
tfidf = TfidfVectorizer(binary=False)
tfidf.fit(comments_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [22]:
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [23]:
Xtfidf_train = tfidf.transform(comments_train).tocsr()

In [24]:
logreg_tfidf = GridSearchCV(LogisticRegression(penalty='l2'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg_tfidf.fit(Xtfidf_train, y_train)
logreg_tfidf.grid_scores_

[mean: 0.82237, std: 0.01246, params: {'C': 0.01},
 mean: 0.83651, std: 0.01128, params: {'C': 0.03},
 mean: 0.84850, std: 0.01064, params: {'C': 0.1},
 mean: 0.86365, std: 0.00956, params: {'C': 0.3},
 mean: 0.87894, std: 0.00833, params: {'C': 1},
 mean: 0.88496, std: 0.00781, params: {'C': 3},
 mean: 0.88301, std: 0.00767, params: {'C': 10},
 mean: 0.87764, std: 0.00789, params: {'C': 30},
 mean: 0.87129, std: 0.00861, params: {'C': 100}]

In [25]:
Xtfidf_test = tfidf.transform(comments_test).tocsr()
logreg_tfidf_pred = logreg_tfidf.predict_proba(Xtfidf_test)[:, 1]
metrics.roc_auc_score(y_test, logreg_tfidf_pred)

0.89252556268932937

# Try 2-grams

In [26]:
tg = TfidfVectorizer(binary=False, ngram_range=(1, 2))
tg.fit(comments_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [27]:
print(tg.get_feature_names()[10000:10015])

['been put', 'been quite', 'been reading', 'been recalled', 'been reported', 'been rough', 'been saying', 'been shipments', 'been shown', 'been skateboarding', 'been spent', 'been spewing', 'been spoken', 'been spot', 'been subjected']


In [28]:
Xtg_train = tg.transform(comments_train).tocsr()

In [29]:
Xtg_train.shape

(3947, 94946)

In [30]:
logreg_tg = GridSearchCV(LogisticRegression(penalty='l2'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg_tg.fit(Xtg_train, y_train)
logreg_tg.grid_scores_

[mean: 0.82298, std: 0.01264, params: {'C': 0.01},
 mean: 0.83473, std: 0.01177, params: {'C': 0.03},
 mean: 0.84259, std: 0.01177, params: {'C': 0.1},
 mean: 0.85396, std: 0.01108, params: {'C': 0.3},
 mean: 0.86976, std: 0.01028, params: {'C': 1},
 mean: 0.88002, std: 0.00974, params: {'C': 3},
 mean: 0.88379, std: 0.00988, params: {'C': 10},
 mean: 0.88356, std: 0.01037, params: {'C': 30},
 mean: 0.88250, std: 0.01086, params: {'C': 100}]

In [31]:
Xtg_test = tg.transform(comments_test).tocsr()
logreg_tg_pred = logreg_tg.predict_proba(Xtg_test)[:, 1]
metrics.roc_auc_score(y_test, logreg_tg_pred)

0.8923630957919596

In [32]:
metrics.roc_auc_score(y_test, logreg_tg_pred + logreg_tfidf_pred)

0.89479271439353347

# What are the most predictive words!

In [33]:
word_effect = pd.DataFrame({'token': tg.get_feature_names(), 'coef': logreg_tg.best_estimator_.coef_[0, :]})

In [34]:
word_effect.sort(['coef'], ascending=[0], inplace=True)

  if __name__ == '__main__':


In [35]:
word_effect.values[:20,:]

array([[12.416189971105046, 'you'],
       [10.256220545945574, 'idiot'],
       [8.81873330710303, 'dumb'],
       [7.955844294849619, 'stupid'],
       [7.919482983452711, 'your'],
       [7.810630407119215, 'moron'],
       [7.166410636070209, 'bitch'],
       [6.900402729900346, 'loser'],
       [5.482176660601994, 'faggot'],
       [4.97877557475419, 'ass'],
       [4.714429044580859, 'ignorant'],
       [4.641507722190839, 'shut'],
       [4.551664890521922, 'retard'],
       [4.454888508853563, 'retarded'],
       [4.363968687601755, 'dick'],
       [4.2614515280532235, 'racist'],
       [4.229291699585234, 'mouth'],
       [4.13241146402046, 'life'],
       [3.8930077775804417, 'yourself'],
       [3.7088744890434016, 'are an']], dtype=object)

In [36]:
word_effect.sort(['coef'], ascending=[1], inplace=True)

  if __name__ == '__main__':


In [37]:
word_effect.values[:20,:]

array([[-3.776566321504673, 'they'],
       [-3.5478999966985287, 'the'],
       [-3.39058037294392, 'right'],
       [-3.2085986064882706, 'are you'],
       [-2.8867911280865264, 'are right'],
       [-2.8287816441317446, 'better'],
       [-2.7242608948756035, 'game'],
       [-2.662096421042112, 'he'],
       [-2.589115095555307, 'if'],
       [-2.5090643226004494, 'love'],
       [-2.480269021678729, 'yes'],
       [-2.4355628104796088, 'yes you'],
       [-2.2504250439290803, 'xc2'],
       [-2.1630817192475438, 'me'],
       [-2.1161505968234375, 'two'],
       [-2.115004391128331, 'xc2 xa0'],
       [-2.113578460512195, 'if you'],
       [-1.9482827450863072, 'fans'],
       [-1.9325152885564196, 'obama'],
       [-1.9241728020907845, 'by']], dtype=object)

In [38]:
tc = TfidfVectorizer(binary=True, analyzer='char', ngram_range=(1, 5))
tc.fit(comments_train)
Xtc_train = tc.transform(comments_train).tocsr()
logreg_tc = GridSearchCV(LogisticRegression(penalty='l2'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg_tc.fit(Xtc_train, y_train)
logreg_tc.grid_scores_

[mean: 0.81765, std: 0.02240, params: {'C': 0.01},
 mean: 0.84931, std: 0.01668, params: {'C': 0.03},
 mean: 0.86701, std: 0.01248, params: {'C': 0.1},
 mean: 0.88345, std: 0.00943, params: {'C': 0.3},
 mean: 0.89961, std: 0.00748, params: {'C': 1},
 mean: 0.90690, std: 0.00653, params: {'C': 3},
 mean: 0.90815, std: 0.00618, params: {'C': 10},
 mean: 0.90722, std: 0.00599, params: {'C': 30},
 mean: 0.90598, std: 0.00590, params: {'C': 100}]

In [39]:
Xtc_test = tc.transform(comments_test).tocsr()
logreg_tc_pred = logreg_tc.predict_proba(Xtc_test)[:, 1]
metrics.roc_auc_score(y_test, logreg_tc_pred)

0.91387038981716573

In [40]:
Xtc_train.shape

(3947, 222886)

# Try lasso

In [41]:
logreg1_tc = GridSearchCV(LogisticRegression(penalty='l1'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg1_tc.fit(Xtc_train, y_train)
logreg1_tc.grid_scores_

[mean: 0.50000, std: 0.00000, params: {'C': 0.01},
 mean: 0.50000, std: 0.00000, params: {'C': 0.03},
 mean: 0.50000, std: 0.00000, params: {'C': 0.1},
 mean: 0.73412, std: 0.01197, params: {'C': 0.3},
 mean: 0.85026, std: 0.01456, params: {'C': 1},
 mean: 0.88673, std: 0.00937, params: {'C': 3},
 mean: 0.88685, std: 0.00811, params: {'C': 10},
 mean: 0.88039, std: 0.01171, params: {'C': 30},
 mean: 0.87769, std: 0.01262, params: {'C': 100}]

In [42]:
logreg1_tc_pred = logreg1_tc.predict_proba(Xtc_test)[:, 1]
metrics.roc_auc_score(y_test, logreg1_tc_pred)

0.89020560924348024

# Get vars with non null coeffs

In [48]:
coef_l1 = logreg1_tc.best_estimator_.coef_
non_null_cols = [i for i, j in enumerate(list(coef_l1.ravel())) if j!=0]

# Reduce Xtc_train

In [61]:
reducted_Xtc_train = Xtc_train.tocsc()[:, non_null_cols]
reducted_Xtc_train.shape

In [66]:
param_grid = {'C': [0.01, 0.03, 0.1] }
logreg2_rtc = GridSearchCV(LogisticRegression(penalty='l2'), param_grid,
                      scoring='roc_auc', n_jobs=-1, cv=5)
logreg2_rtc.fit(reducted_Xtc_train, y_train)
logreg2_rtc.grid_scores_

[mean: 0.89191, std: 0.01181, params: {'C': 0.01},
 mean: 0.89668, std: 0.00986, params: {'C': 0.03},
 mean: 0.89899, std: 0.00950, params: {'C': 0.1}]

In [67]:
logreg2_rtc_pred = logreg2_rtc.predict_proba(Xtc_test.tocsc()[:, non_null_cols])[:, 1]
metrics.roc_auc_score(y_test, logreg2_rtc_pred)

0.87745121931406478

# Try Xgboost

In [72]:
import xgboost as xgb
from sklearn.cross_validation import train_test_split

clf = xgb.XGBClassifier(n_estimators=1000,learning_rate=0.02, gamma=0, max_depth=2,
                        colsample_bytree=0.3)
Xtr, Xte, ytr, yte = train_test_split(reducted_Xtc_train, y_train, random_state=0)
clf.fit(Xtr, ytr, early_stopping_rounds=10, eval_metric="auc",
        eval_set=[(Xte, yte)])

[0]	validation_0-auc:0.703775
Will train until validation_0-auc hasn't improved in 10 rounds.
[1]	validation_0-auc:0.733647
[2]	validation_0-auc:0.734654
[3]	validation_0-auc:0.743753
[4]	validation_0-auc:0.762159
[5]	validation_0-auc:0.767472
[6]	validation_0-auc:0.769677
[7]	validation_0-auc:0.776548
[8]	validation_0-auc:0.774987
[9]	validation_0-auc:0.775049
[10]	validation_0-auc:0.774743
[11]	validation_0-auc:0.775163
[12]	validation_0-auc:0.780871
[13]	validation_0-auc:0.779082
[14]	validation_0-auc:0.777765
[15]	validation_0-auc:0.779028
[16]	validation_0-auc:0.780607
[17]	validation_0-auc:0.781819
[18]	validation_0-auc:0.782414
[19]	validation_0-auc:0.781813
[20]	validation_0-auc:0.78397
[21]	validation_0-auc:0.784042
[22]	validation_0-auc:0.783131
[23]	validation_0-auc:0.784079
[24]	validation_0-auc:0.784995
[25]	validation_0-auc:0.785287
[26]	validation_0-auc:0.78623
[27]	validation_0-auc:0.785484
[28]	validation_0-auc:0.785386
[29]	validation_0-auc:0.786916
[30]	validation_0-

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=1000, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [73]:
clf = xgb.XGBClassifier(n_estimators=732,learning_rate=0.02, gamma=0, max_depth=2,
                        colsample_bytree=0.3)

clf.fit(reducted_Xtc_train, y_train)

XGBClassifier(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.3,
       gamma=0, learning_rate=0.02, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=732, nthread=-1,
       objective='binary:logistic', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1)

In [75]:
clf_pred = clf.predict_proba(Xtc_test.tocsc()[:, non_null_cols])[:, 1]
metrics.roc_auc_score(y_test, clf_pred)

0.89559877175025604

In [76]:
metrics.roc_auc_score(y_test, clf_pred+logreg1_tc_pred)

0.90543983481547463