# Jigsaw Rate Severity of Toxic Comments

In [1]:
#https://www.kaggle.com/c/jigsaw-toxic-severity-rating
import pandas as pd 
from sklearn.model_selection import train_test_split
train = pd.read_csv('validation_data.csv') 

In [2]:
#Put sentences classified more or less toxic in separate dataframes and then combine them into one
train1 = pd.DataFrame(train['less_toxic'])
train1.columns = ['sentence']
train1['is_toxic'] = 0
train2 = pd.DataFrame(train['more_toxic'])
train2.columns = ['sentence']
train2['is_toxic'] = 1
train3 = pd.concat([train1,train2],ignore_index=True)

In [3]:
train3

Unnamed: 0,sentence,is_toxic
0,This article sucks \n\nwoo woo wooooooo,0
1,"""And yes, people should recognize that but the...",0
2,"Western Media?\n\nYup, because every crime in...",0
3,And you removed it! You numbskull! I don't car...,0
4,smelly vagina \n\nBluerasberry why don't you ...,0
...,...,...
60211,get out my large penis,1
60212,get out my large penis,1
60213,Piss off you slant eyed-gook,1
60214,Piss off you slant eyed-gook,1


In [4]:
#Make the actual training df, where there's every different sentence and the value count of it 
final_train = train3['sentence'].value_counts().rename_axis('sentence').reset_index(name='counts')

In [5]:
def getMean(sentence):
    this = train3.loc[train3['sentence'] == sentence]
    return sum(this.is_toxic)
sumof = final_train.apply(lambda x:getMean(x['sentence']), axis=1, result_type='expand')

In [6]:
#Add the percentage of the sentence being classified more toxic
final_train['mean'] = sumof
final_train['mean'] = final_train['mean'] / final_train['counts']

In [7]:
final_train.counts.value_counts()

3     9437
6     3581
9      840
12     149
1      109
4       77
7       26
15      25
10       3
18       3
13       1
Name: counts, dtype: int64

In [8]:
#keep only sentences that occur 3+ times
final_train = final_train[final_train.counts > 3]

In [9]:
final_train.rename(columns={'sentence':'the_original_sentence','counts':'the_counts_of_sentences','mean':'the_percentage_for_the_sentence_being_more_toxic'},inplace=True)

In [10]:
#get a column for every word appearing
def toWords(sentence,words,count,mean):
    sent = set(sentence.split(" "))
    for word in sent:
        word = word.lower()
        word = word.strip("/n")
        word = ''.join(e for e in word if e.isalnum())
        if word not in words:
            words[word] = [1,count,count*mean]
        else:
            words[word][0] += 1
            words[word][1] += count
            words[word][2] += count*mean
words = {}
final_train.apply(lambda x:toWords(x['the_original_sentence'],words,x['the_counts_of_sentences'],x['the_percentage_for_the_sentence_being_more_toxic']), axis=1, result_type='expand')

0       None
1       None
2       None
3       None
4       None
        ... 
4700    None
4701    None
4702    None
4703    None
4704    None
Length: 4705, dtype: object

In [11]:
#use only words which appear plenty and seem to be in almost only least or most toxic comments
for word in words:
    if words[word][0] >= 20 and (words[word][1] / words[word][2] < 0.34 or words[word][1] / words[word][2] > 0.67):
        final_train[word] = 0

In [12]:
final_train

Unnamed: 0,the_original_sentence,the_counts_of_sentences,the_percentage_for_the_sentence_being_more_toxic,Unnamed: 4,was,be,to,you,wrong,it,...,shall,manner,uk,azis,asking,league,career,vandalising,arbcom,july
0,sorry i jumped to conclusions \n\non christia...,18,0.277778,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,this irishtom guy is turning every article int...,18,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,You are not sorry one damned bit. You have ye...,18,0.388889,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,You just want to keep it because you are a pae...,15,0.466667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,YOU SUCK \n\nI HATE YOU> YOU CAN FUCK MY ASS ...,15,0.933333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,Thank you for the explanation. Followup quest...,4,0.500000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4701,free me \n\ni have been banned free me you ar...,4,0.000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4702,"""\nYou've been overtly harsh with it. No need....",4,0.250000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4703,"""""""But you are not entitled bring your sour gr...",4,0.250000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [13]:
final_train.dropna(inplace=True)
final_train

Unnamed: 0,the_original_sentence,the_counts_of_sentences,the_percentage_for_the_sentence_being_more_toxic,Unnamed: 4,was,be,to,you,wrong,it,...,shall,manner,uk,azis,asking,league,career,vandalising,arbcom,july
0,sorry i jumped to conclusions \n\non christia...,18,0.277778,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,this irishtom guy is turning every article int...,18,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,You are not sorry one damned bit. You have ye...,18,0.388889,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,You just want to keep it because you are a pae...,15,0.466667,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,YOU SUCK \n\nI HATE YOU> YOU CAN FUCK MY ASS ...,15,0.933333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,Thank you for the explanation. Followup quest...,4,0.500000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4701,free me \n\ni have been banned free me you ar...,4,0.000000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4702,"""\nYou've been overtly harsh with it. No need....",4,0.250000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4703,"""""""But you are not entitled bring your sour gr...",4,0.250000,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [14]:
#fill columns about words appearing in the sentence with values of 1
def getCounts(sentence):
    sent = set(sentence.split(" "))
    for word in sent:
        word = word.lower()
        word = word.strip("/n")
        word = ''.join(e for e in word if e.isalnum())
        if word in final_train.columns:
            final_train.loc[final_train.the_original_sentence == sentence,word] = 1
final_train.apply(lambda x:getCounts(x['the_original_sentence']), axis=1, result_type='expand')

0       None
1       None
2       None
3       None
4       None
        ... 
4700    None
4701    None
4702    None
4703    None
4704    None
Length: 4705, dtype: object

In [15]:
final_train

Unnamed: 0,the_original_sentence,the_counts_of_sentences,the_percentage_for_the_sentence_being_more_toxic,Unnamed: 4,was,be,to,you,wrong,it,...,shall,manner,uk,azis,asking,league,career,vandalising,arbcom,july
0,sorry i jumped to conclusions \n\non christia...,18,0.277778,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,this irishtom guy is turning every article int...,18,0.333333,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,You are not sorry one damned bit. You have ye...,18,0.388889,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
3,You just want to keep it because you are a pae...,15,0.466667,1,0,0,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4,YOU SUCK \n\nI HATE YOU> YOU CAN FUCK MY ASS ...,15,0.933333,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4700,Thank you for the explanation. Followup quest...,4,0.500000,1,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4701,free me \n\ni have been banned free me you ar...,4,0.000000,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4702,"""\nYou've been overtly harsh with it. No need....",4,0.250000,1,0,1,1,1,0,1,...,0,0,0,0,0,0,0,0,0,0
4703,"""""""But you are not entitled bring your sour gr...",4,0.250000,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
X_train = final_train.drop(columns=['the_original_sentence','the_counts_of_sentences','the_percentage_for_the_sentence_being_more_toxic'])
for i in X_train.columns:
    if sum(X_train.i) <= 2 or sum(X_train.i) == len(X_train):
        X_train = X_train.drop(columns=[i])
y_train = final_train['the_percentage_for_the_sentence_being_more_toxic']

In [17]:
#RandomForestRegressor, found optimized parameters by trying out different values
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor(n_estimators=10000, max_depth=1000,max_features="sqrt", n_jobs=4, random_state=24)
model.fit(X_train, y_train)

RandomForestRegressor(max_depth=1000, max_features='sqrt', n_estimators=10000,
                      n_jobs=4, random_state=24)

In [20]:
#same with Ridge and Lasso models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
ridge_alpha = 100
model = Ridge(alpha=ridge_alpha).fit(X_train, y_train)

In [21]:
lasso_alpha = 0.001
model = Lasso(alpha=lasso_alpha).fit(X_train, y_train)

In [22]:
subm = pd.read_csv('comments_to_score.csv') 

In [23]:
subm.rename(columns={'text':'the_original_sentence'},inplace=True)
for i in X_train.columns:
    subm[i] = 0

In [24]:
#fill the test set similarly as the training one
def getCounts2(sentence):
    sent = set(sentence.split(" "))
    for word in sent:
        word = word.lower()
        word = word.strip("/n")
        word = ''.join(e for e in word if e.isalnum())
        if word in final_train.columns:
            subm.loc[subm.the_original_sentence == sentence,word] = 1
subm.apply(lambda x:getCounts2(x['the_original_sentence']), axis=1, result_type='expand')

0       None
1       None
2       None
3       None
4       None
        ... 
7532    None
7533    None
7534    None
7535    None
7536    None
Length: 7537, dtype: object

In [25]:
X_test = subm.drop(columns=['the_original_sentence','comment_id'])

In [26]:
#RFR performed the best on the validation set, so it predicts values
#score on the private set was 0.726
model = RandomForestRegressor(n_estimators=10000, max_depth=1000,max_features="sqrt", n_jobs=4, random_state=24)
model.fit(X_train, y_train)
y_prob = model.predict(X_test)
y_prob = y_prob - min(y_prob) / max(y_prob) - min(y_prob)
y_prob = abs(y_prob) + 0.001
df = pd.DataFrame({'comment_id':subm['comment_id'],'score':y_prob.squeeze()})
df.to_csv('submission.csv',index=False)