In [1]:
# Reload modules before executing user code
%reload_ext autoreload
# Reload all modules (except those excluded by %aimport)
%autoreload 2
# Show plots within this notebook
%matplotlib inline

# 1. Load training and test data into pandas dataframes

In [2]:
PATH='download/'
test_csv = f'{PATH}test.csv'
train_csv = f'{PATH}train.csv'
sample_submission_csv = f'{PATH}sample_submission.csv'

In [3]:
import pandas as pd

train_df = pd.read_csv(train_csv, na_filter=False)
test_df = pd.read_csv(test_csv, na_filter=False)
submission_df = pd.read_csv(sample_submission_csv, nrows=0) # copy column headers

# 2. Explore the data

The labels are all in the same scale and won't need to be standardized. Notice how a comment can have multiple labels, e.g. the comment below is both toxic and a threat. This looks like a multilabel text classification problem, which can be solved in a variety of ways.

**(1) Problem transformation methods**

Problem transformation transforms the multilabel input into a representation suitable for single-label classification methods.

* **Binary Relevance** - Independently train one binary classifier for each label. The drawback of this method is that it does not take into account label correlation.

* **Label Powerset** - Generate a new class for every combination of labels and then use multiclass classification. Unlike binary relevance, this method takes into account label correlation, but it leads to a large number of classes and fewer examples per class. 

* **Classifier Chains** - Based on Binary Relevance but predictions of binary classifiers are cascaded along a chain as additional features. This method takes into account label correlation but the order of classifiers in the chain changes results.

**(2) Algorithm adaptation methods**

Algorithm adaption extends existing single-label classifier algorithms to handle multilabel data directly.

In [4]:
train_df.loc[train_df['threat'] == 1].head(1)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
79,003217c3eb469ba9,Hi! I am back again!\nLast warning!\nStop undo...,1,0,0,1,0,0


In [5]:
train_df.shape

(159571, 8)

In [6]:
train_df.describe()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
count,159571.0,159571.0,159571.0,159571.0,159571.0,159571.0
mean,0.095844,0.009996,0.052948,0.002996,0.049364,0.008805
std,0.294379,0.099477,0.223931,0.05465,0.216627,0.09342
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0,0.0,0.0
75%,0.0,0.0,0.0,0.0,0.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0


In [7]:
train_df.info() # verify that are no missing values in our dataset

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
id               159571 non-null object
comment_text     159571 non-null object
toxic            159571 non-null int64
severe_toxic     159571 non-null int64
obscene          159571 non-null int64
threat           159571 non-null int64
insult           159571 non-null int64
identity_hate    159571 non-null int64
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


# 4. Separate target features (y) from input features (X) 


Use sklearn.model_selection.train_test_split to split training data into validation and train. 

In [8]:
from sklearn.model_selection import train_test_split 

X = train_df['comment_text']
y = train_df[['obscene','insult','toxic','severe_toxic','identity_hate','threat']]

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=1)
X_test = test_df['comment_text']

# 5. Create a TF-IDF matrix

Count how many times each word appears in the comments (term frequency) and multiply it by the context-adjusted weight of each word (inverse document frequency). Better explained here: https://www.quora.com/How-does-TfidfVectorizer-work-in-laymans-terms

In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

# create the transform
vectorizer = TfidfVectorizer()

# tokenize and build vocab with training data
X_train_tokenized = vectorizer.fit_transform(X_train)

# transform validation and test data to have the same shape
X_valid_tokenized = vectorizer.transform(X_valid)
X_test_tokenized = vectorizer.transform(X_test)

In [10]:
# examine the vocabulary and document-term matrix together
dt_matrix = pd.DataFrame(X_train_tokenized.toarray(), columns=vectorizer.get_feature_names())

In [11]:
dt_matrix.head(1)

Unnamed: 0,00,000,0000,00000,000000,0000000,0000000027,00000001,00000003,00000050,...,천리마군,칠지도,ﬂute,ａｎｏｎｔａｌｋ,ｃｏｍ,ｗｗｗ,ｳｨｷﾍﾟﾃﾞｨｱ,𐌰𐌹,𐌰𐌿,𐌴𐌹
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
dt_matrix.head(1).loc[:, (dt_matrix.head(1) != 0).any(axis=0)]

Unnamed: 0,allowed,attack,be,blocked,but,comments,definitely,editor,editors,if,...,that,the,their,they,this,to,ve,while,will,won
0,0.200271,0.184008,0.154242,0.299486,0.088722,0.159946,0.21692,0.165467,0.156545,0.171023,...,0.066082,0.049339,0.266064,0.223291,0.07312,0.161087,0.133756,0.15515,0.105123,0.18268


# 6. Problem transformation

Train one binary classifier for each label. This is called binary relevance. 

In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

logreg = LogisticRegression(C=6.0)

In [14]:
for label in y_train:
    y = y_train[label]
    logreg.fit(X_train_tokenized, y)
    y_pred = logreg.predict(X_train_tokenized)
    print("Training accuracy for {} comments is {}".format(label, accuracy_score(y, y_pred)))  
    y_prob_test = logreg.predict_proba(X_test_tokenized)[:, 1]
    submission_df[label] = y_prob_test 

Training accuracy for obscene comments is 0.9882261703327693
Training accuracy for insult comments is 0.983909882810052
Training accuracy for toxic comments is 0.9779250485680265
Training accuracy for severe_toxic comments is 0.9933963150968227
Training accuracy for identity_hate comments is 0.994814188130601
Training accuracy for threat comments is 0.9982687848593094


In [15]:
for label in y_valid:
    y = y_valid[label]
    y_pred = logreg.predict(X_valid_tokenized)
    print("Validation accuracy for {} comments is {}".format(label, accuracy_score(y, y_pred)))  

Validation accuracy for obscene comments is 0.9473288422371925
Validation accuracy for insult comments is 0.9510574964750117
Validation accuracy for toxic comments is 0.9039009869967101
Validation accuracy for severe_toxic comments is 0.9898167006109979
Validation accuracy for identity_hate comments is 0.990224032586558
Validation accuracy for threat comments is 0.9972113426288579


# 7. View results

In [16]:
# Prepare submission
submission_df['id'] = test_df['id'].tolist()
submission_df.head(1)

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,00001cee341fdb12,0.999915,0.178272,0.999215,0.09397,0.977419,0.252907


In [17]:
print(test_df.loc[submission_df['toxic'] > 0.5].head(10))

                  id                                       comment_text
0   00001cee341fdb12  Yo bitch Ja Rule is more succesful then you'll...
7   000247e83dcc1211                   :Dear god this site is horrible.
38  001068b809feee6b  " \n\n ==balance== \n This page has one senten...
48  0013fed3aeae76b7  DJ Robinson is gay as hell! he sucks his dick ...
50  001421530a1aa622  I have been perfectly civil in what quite clea...
56  0016b94c8b20ffa6  I WILL BURN YOU TO HELL IF YOU REVOKE MY TALK ...
59  0017d4d47894af05               :Fuck off, you anti-semitic cunt.  |
63  00199e012d99a8b9  Her body is perfect. Face, boobs, hips, all of...
70  001c86f5bceccb32  == Hello == \n\n Fuck off my Pagan you barebac...
74  001d2f65ea6f4163  " August 2006 (UTC) \n\n :::::A simple ""you'r...


In [18]:
print(test_df.loc[submission_df['id'] == '0016b94c8b20ffa6'].comment_text.values)

['I WILL BURN YOU TO HELL IF YOU REVOKE MY TALK PAGE ACCESS!!!!!!!!!!!!!']


In [19]:
print(submission_df.loc[submission_df['id'] == '0016b94c8b20ffa6'])

                  id     toxic  severe_toxic   obscene    threat    insult  \
56  0016b94c8b20ffa6  0.924701       0.07637  0.081055  0.357827  0.092477   

    identity_hate  
56       0.008958  


^ That looks about right

# 8. Save results to CSV for submission

In [20]:
submission_df.to_csv('submission.csv', index=False)