In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as ss
import re
import gc

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc, roc_auc_score

%matplotlib inline

In [2]:
DATA_PATH = '../data/'
SUBMISSION_PATH = '../sub/'

In [3]:
df_train = pd.read_csv(DATA_PATH+"train.csv", sep = ",", header=0)
df_test = pd.read_csv(DATA_PATH+"test.csv", sep = ",", header=0)

In [4]:
# load different spellings of bad words in a dictionary
# key: different spelling form, val: root form if any
def bad_word_normalize(file_path):
    bad_word = {}
    with open(file_path, "r") as f:
        for line in f:
            line = line.strip().lower().split(", ")
            if len(line) == 1:
                bad_word[line[0].strip()] = line[0].strip()
            elif len(line) == 2: # different spelling of bad words return to dict
                bad_word[line[0].strip()] = line[1].strip()
            else:
                print("badwords.txt contains error line at:\n {}".format(line))
    return(bad_word)

BAD_WORD_DICT = bad_word_normalize(DATA_PATH+"badwords.txt")

## Feature Engineering
1. **Number of (Unique) Words**
2. **Number of Puncuations**
3. **Number of Sentences**
4. **Number of Cap Words**
5. **Mis-spelling**
6. **Number of Bad Words**

In [22]:
def feature_engineer(df):
    
    # 1. number of words && number of unique words
    df['word_count'] = df['comment_text'].apply(lambda x: len(x.lower().split(' ')))
    df['uni_word_count'] = df['comment_text'].apply(lambda x: len(set(x.lower().split(' '))))

    # 2. number of puncuations: ! or ?
    punc_re = re.compile(r'[?!]+')
    df['punc_counts'] = df['comment_text'].apply(lambda x: len(punc_re.findall(x)))

    # 3. number of sentences
    sentence_re = re.compile(r'[.!?]+[ ]')
    df['sentence_count'] = df['comment_text'].apply(lambda x: len(sentence_re.split(x)))

    # 4. number of Caps
    cap_re = re.compile(r'\b[A-Z]{2,}\b') # all cap words with 2+ chars, (get rid of 'I', 'A')
    df['caps_count'] = df['comment_text'].apply(lambda x: len(cap_re.findall(x)))

    # 5. mis-spelling ?????
    
    # 6. number of bad words
    def count_bad_word(string):
        bad_word_set = BAD_WORD_SET
        string_ls = string.lower().split(' ')
        return sum([1 if word in bad_word_set else 0 for word in string_ls])
    
    BAD_WORD_SET = set(BAD_WORD_DICT.keys())
    df['badword_count'] = df['comment_text'].apply(count_bad_word)
    
    return(df)

In [23]:
df_train_test = pd.concat([df_train.iloc[:,:2], df_test.iloc[:,:2]], axis=0)
df_train_test = feature_engineer(df_train_test)

df_train_test.head(10)

Unnamed: 0,id,comment_text,word_count,uni_word_count,punc_counts,sentence_count,caps_count,badword_count
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,42,40,1,3,1,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,18,18,1,4,1,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",42,39,0,3,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",112,82,0,3,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",13,13,1,2,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",12,12,0,2,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,8,8,0,1,8,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,21,21,0,2,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,83,69,1,6,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,12,12,0,1,0,0


In [27]:
df_train_fe = pd.concat([df_train, df_train_test.iloc[:df_train.shape[0], 2:]], axis=1)
df_test_fe  = pd.concat([df_test, df_train_test.iloc[df_train.shape[0]:, 2:]], axis=1)

In [37]:
df_train_fe.head(20)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,word_count,uni_word_count,punc_counts,sentence_count,caps_count,badword_count
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0,42,40,1,3,1,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0,18,18,1,4,1,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0,42,39,0,3,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0,112,82,0,3,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0,13,13,1,2,0,0
5,00025465d4725e87,"""\n\nCongratulations from me as well, use the ...",0,0,0,0,0,0,12,12,0,2,0,0
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0,8,8,0,1,8,1
7,00031b1e95af7921,Your vandalism to the Matt Shirvington article...,0,0,0,0,0,0,21,21,0,2,0,0
8,00037261f536c51d,Sorry if the word 'nonsense' was offensive to ...,0,0,0,0,0,0,83,69,1,6,0,0
9,00040093b2687caa,alignment on this subject and which are contra...,0,0,0,0,0,0,12,12,0,1,0,0


In [39]:
y_lookup = {
            'toxic'        :2,
            'severe_toxic' :3,
            'obscene'      :4,
            'threat'       :5,
            'insult'       :6,
            'identity_hate':7
           }

In [63]:
for max_d in range(3,10):
    overall_auc_scores = []
    for tox_type, y_col in y_lookup.items():
        #print("================= Begin {} =================".format(tox_type))

        X = df_train_fe.iloc[:, 8:]
        y = df_train_fe.iloc[:,y_col]

        skf = StratifiedKFold(n_splits=5)
        auc_scores = []
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X.iloc[train_index,:], X.iloc[test_index,:]
            y_train, y_test = y[train_index], y[test_index]

            clf = RandomForestClassifier(max_depth=max_d, random_state=42)
            clf.fit(X_train, y_train)
            #print(clf.feature_importances_)
            y_pred = clf.predict_proba(X_test)
            auc_scores.append(roc_auc_score(y_test, y_pred[:,1]))
        print("AUC score: {}".format(np.mean(auc_scores)))
        overall_auc_scores.append(auc_scores)

        #print("================= Finish {} =================\n\n".format(tox_type))
    print("{} max depth overall auc score: {}".format(max_d, np.mean(overall_auc_scores)))


AUC score: 0.803624662787662
AUC score: 0.9448952450953394
AUC score: 0.8800860031661799
AUC score: 0.8258212554959684
AUC score: 0.846133919442677
AUC score: 0.865236243152389
3 max depth overall auc score: 0.8609662215233693
AUC score: 0.8136485466686134
AUC score: 0.9547079063832454
AUC score: 0.8878345181135128
AUC score: 0.8370271291283448
AUC score: 0.8528146560111708
AUC score: 0.872847484555343
4 max depth overall auc score: 0.869813373476705
AUC score: 0.8217965665435752
AUC score: 0.9559815461365835
AUC score: 0.8917592956888981
AUC score: 0.837896301813109
AUC score: 0.8569782247825399
AUC score: 0.8740969380601975
5 max depth overall auc score: 0.8730848121708171
AUC score: 0.8235644421207302
AUC score: 0.9584343470836874
AUC score: 0.8936033571813855
AUC score: 0.8441463181288074
AUC score: 0.858959659988242
AUC score: 0.8741927483038093
6 max depth overall auc score: 0.8754834788011102
AUC score: 0.8257543401065692
AUC score: 0.958667914760359
AUC score: 0.895531269034251

In [68]:
y_pred = df_test.iloc[:, 0]
for tox_type, y_col in y_lookup.items():
    X = df_train_fe.iloc[:, 8:]
    X_test = df_test_fe.iloc[:,2:]
    y = df_train_fe.iloc[:,y_col]
    
    clf = RandomForestClassifier(max_depth=7, random_state=42)
    clf.fit(X,y)
    
    y_prob = clf.predict_proba(X_test)
    y_pred = pd.concat([y_pred, pd.DataFrame(y_prob[:,1])], axis=1)
    
y_pred.columns = ['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
y_pred.to_csv(SUBMISSION_PATH+"rf_sub.csv", index=False, header= True)
y_pred.head()

Unnamed: 0,id,0,0.1,0.2,0.3,0.4,0.5
0,00001cee341fdb12,0.608227,0.145654,0.67818,0.091788,0.628984,0.153081
1,0000247867823ef7,0.069255,0.004033,0.026516,0.002744,0.0332,0.003915
2,00013b17ad220c46,0.077278,0.002753,0.026602,0.003213,0.032364,0.00473
3,00017563c3f7919a,0.041328,0.00083,0.009965,0.00082,0.017473,0.001733
4,00017695ad8997eb,0.089616,0.004682,0.047322,0.002972,0.040954,0.008928


In [82]:
nb_svm_sub = pd.read_csv(SUBMISSION_PATH+"nb_svm.csv",header=0)
nb_svm_sub.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999988,0.106264,0.999987,0.002369,0.962578,0.094955
1,0000247867823ef7,0.002873,0.000604,0.001893,0.0001,0.002227,0.000342
2,00013b17ad220c46,0.011755,0.000864,0.005588,0.000102,0.00321,0.000297
3,00017563c3f7919a,0.00096,0.000224,0.001141,0.000171,0.001057,0.000297
4,00017695ad8997eb,0.009957,0.000485,0.002009,0.000131,0.002395,0.000351


In [89]:
comb_sub = pd.concat([nb_svm_sub.iloc[:,0], nb_svm_sub.iloc[:,1:].apply(lambda x: x*0.9772) + y_pred.iloc[:,1:].apply(lambda x: x*0.8750)], axis=1)
comb_sub.iloc[:,1:] = comb_sub.iloc[:,1:].apply(lambda x: x/(0.9772+0.8750))
comb_sub.head()
comb_sub.to_csv(SUBMISSION_PATH+"comb_sub.csv", index=False, header = True)