# Jigsaw Rate Severity of Toxic Comments

In [1]:
#https://www.kaggle.com/c/jigsaw-toxic-severity-rating
import pandas as pd 
from sklearn.model_selection import train_test_split
# Load the CSV whose 'less_toxic' and 'more_toxic' text columns are used below.
# NOTE(review): this path is relative to the working directory, whereas
# comments_to_score.csv is later read from '../input/jigsaw-toxic-severity-rating/'
# -- confirm where validation_data.csv is expected to live.
train = pd.read_csv('validation_data.csv') 

In [2]:
# Build one labelled frame per side of each comparison, then stack them:
# texts from 'less_toxic' get is_toxic = 0, texts from 'more_toxic' get is_toxic = 1.
train1 = (
    train[['less_toxic']]
    .rename(columns={'less_toxic': 'sentence'})
    .assign(is_toxic=0)
)
train2 = (
    train[['more_toxic']]
    .rename(columns={'more_toxic': 'sentence'})
    .assign(is_toxic=1)
)
# ignore_index=True gives the stacked frame a fresh 0..n-1 index.
train3 = pd.concat([train1, train2], ignore_index=True)

In [3]:
# Preview the stacked frame; the rendered output shows only its first and last rows.
train3

Unnamed: 0,sentence,is_toxic
0,This article sucks \n\nwoo woo wooooooo,0
1,"""And yes, people should recognize that but the...",0
2,"Western Media?\n\nYup, because every crime in...",0
3,And you removed it! You numbskull! I don't car...,0
4,smelly vagina \n\nBluerasberry why don't you ...,0
...,...,...
60211,get out my large penis,1
60212,get out my large penis,1
60213,Piss off you slant eyed-gook,1
60214,Piss off you slant eyed-gook,1


In [4]:
# Training frame: one row per distinct sentence plus how often it occurs in train3
# (value_counts lists the most frequent sentences first).
sentence_counts = train3['sentence'].value_counts()
final_train = sentence_counts.rename_axis('sentence').reset_index(name='counts')

In [None]:
def getMean(sentence):
    """Return how many rows of `train3` hold `sentence` with is_toxic == 1.

    Despite its name this returns the *sum* of the 0/1 labels, not their mean;
    the division by the occurrence count happens in a later cell.
    Kept for interactive use -- the bulk computation below no longer calls it.
    """
    this = train3.loc[train3['sentence'] == sentence]
    return sum(this.is_toxic)

# Compute every per-sentence sum in a single groupby pass instead of rescanning
# all of train3 once per distinct sentence. final_train's sentences all come from
# train3, so every lookup succeeds and the result keeps final_train's index.
toxic_votes = train3.groupby('sentence')['is_toxic'].sum()
sumof = final_train['sentence'].map(toxic_votes)

In [None]:
# Share of a sentence's occurrences that were on the "more toxic" side:
# (rows with is_toxic == 1) / (total occurrences of the sentence).
final_train['mean'] = sumof / final_train['counts']

In [None]:
# Distribution of occurrence counts: how many sentences appear once, twice, ...
final_train['counts'].value_counts()

In [None]:
# Keep only sentences that occur more than 3 times (i.e. at least 4).
# NOTE(review): the comment here used to say "3+ times" while the comparison is
# strictly greater than 3; switch to >= 3 if three occurrences should qualify.
# .copy() detaches the result from the unfiltered frame, so the in-place edits
# made in later cells do not operate on a slice (SettingWithCopyWarning).
final_train = final_train[final_train.counts > 3].copy()

In [None]:
# Give the bookkeeping columns long, underscore-separated names. The per-word
# columns added later are purely alphanumeric, so they cannot collide with these.
verbose_names = {
    'sentence': 'the_original_sentence',
    'counts': 'the_counts_of_sentences',
    'mean': 'the_percentage_for_the_sentence_being_more_toxic',
}
final_train = final_train.rename(columns=verbose_names)

In [None]:
#get a column for every word appearing
def toWords(sentence,words,count,mean):
    """Accumulate per-word statistics for one sentence into `words`.

    Args:
        sentence: raw text; it is split on single spaces.
        words: dict mutated in place. Maps a normalised word to
            [number of sentences containing it, summed `count`, summed `count * mean`].
        count: weight of this sentence (its number of occurrences).
        mean: value multiplied by `count` for the third accumulator.

    Returns:
        None. The result is the mutation of `words`.

    Each distinct normalised word contributes once per sentence: tokens are
    de-duplicated *after* lower-casing and cleaning, so "Woo", "woo!" and "woo"
    count as a single occurrence. Tokens that clean down to an empty string are skipped.
    """
    normalised = set()
    for token in sentence.split(" "):
        token = token.lower()
        # NOTE(review): strip("/n") removes leading/trailing '/' and 'n' characters
        # (e.g. "on" -> "o"); "\n" was probably intended. Left unchanged on purpose so
        # that word keys stay identical to the ones getCounts/getCounts2 look up.
        token = token.strip("/n")
        token = ''.join(ch for ch in token if ch.isalnum())
        if token:
            normalised.add(token)
    for word in normalised:
        stats = words.get(word)
        if stats is None:
            words[word] = [1, count, count*mean]
        else:
            stats[0] += 1
            stats[1] += count
            stats[2] += count*mean
# word -> [sentences containing it, summed occurrence counts, summed count * toxic share]
words = {}
# toWords is called only for its side effect on `words`, so iterate explicitly
# instead of using DataFrame.apply, which would also build and display a
# throwaway result full of None values.
for sentence, occurrences, toxic_share in zip(
    final_train['the_original_sentence'],
    final_train['the_counts_of_sentences'],
    final_train['the_percentage_for_the_sentence_being_more_toxic'],
):
    toWords(sentence, words, occurrences, toxic_share)

In [None]:
# Use only words that appear in at least 20 sentences and whose occurrence-weighted
# "more toxic" share is below 0.34 or above 0.67.
# words[w] = [n_sentences, total_count, toxic_weighted_count], so the share is
# toxic_weighted_count / total_count. (The inverse ratio is always >= 1, which made
# the "> 0.67" test pass for every word and could divide by zero.)
selected_words = []
for word, (n_sentences, total_count, toxic_count) in words.items():
    if n_sentences < 20:
        continue
    toxic_share = toxic_count / total_count
    if toxic_share < 0.34 or toxic_share > 0.67:
        selected_words.append(word)

# Add all zero-filled indicator columns in one concat rather than one insert per word
# (avoids a fragmented frame). Dropping first keeps the cell safe to re-run.
word_columns = pd.DataFrame(0, index=final_train.index, columns=selected_words)
final_train = pd.concat(
    [final_train.drop(columns=selected_words, errors='ignore'), word_columns],
    axis=1,
)

In [None]:
# Inspect the frame after the zero-initialised word columns were added.
final_train

In [None]:
# Discard every row that contains a missing value, then display what is left.
final_train = final_train.dropna()
final_train

In [None]:
#fill columns about words appearing in the sentence with values of 1
def getCounts(sentence):
    """Mark, in `final_train`, which word columns occur in `sentence`.

    The sentence is split on single spaces and each token is normalised
    (lower-cased, stripped, reduced to alphanumeric characters). For every
    normalised token that is a column of `final_train`, that column is set to 1
    on all rows whose 'the_original_sentence' equals `sentence`.

    Returns:
        None. `final_train` is modified in place.
    """
    present = set()
    for token in sentence.split(" "):
        token = token.lower()
        # NOTE(review): strip("/n") removes leading/trailing '/' and 'n' characters,
        # not newlines. Kept as-is so tokens match the column names built by toWords.
        token = token.strip("/n")
        token = ''.join(ch for ch in token if ch.isalnum())
        if token in final_train.columns:
            present.add(token)
    if present:
        # One row mask and one assignment per sentence instead of one per word.
        rows = final_train.the_original_sentence == sentence
        final_train.loc[rows, list(present)] = 1

# Iterate over a snapshot of the sentences: getCounts mutates final_train, so it
# should not be driven by an apply() over that same frame.
for sentence in final_train['the_original_sentence'].unique():
    getCounts(sentence)

In [None]:
# Inspect the frame after the word indicator columns have been filled in.
final_train

In [None]:
# Feature matrix: the per-word indicator columns only (bookkeeping columns removed).
X_train = final_train.drop(columns=['the_original_sentence','the_counts_of_sentences','the_percentage_for_the_sentence_being_more_toxic'])
# Drop uninformative indicators: words present in at most 2 rows, or in every row.
# The totals are computed per column by label; `X_train.i` would read a column
# literally named "i" instead of the loop variable.
column_totals = X_train.sum(axis=0)
informative = (column_totals > 2) & (column_totals != len(X_train))
X_train = X_train.loc[:, informative.to_numpy()]
# Regression target: the per-sentence share of "more toxic" labels.
y_train = final_train['the_percentage_for_the_sentence_being_more_toxic']

In [None]:
#RandomForestRegressor, found optimized parameters by trying out different values
from sklearn.ensemble import RandomForestRegressor
# Hyper-parameters of the random forest, gathered in one place.
rf_params = dict(
    n_estimators=10000,
    max_depth=1000,
    max_features="sqrt",
    n_jobs=4,
    random_state=24,
)
model = RandomForestRegressor(**rf_params)
model.fit(X_train, y_train)

In [None]:
#deep neural network made with Tensorflow, used the best performing parameters
# NOTE(review): `build_and_compile_model` and `normalizer` are not defined or imported
# in any cell visible in this notebook, so this cell presumably depends on code that
# is missing here -- restore it before running on a fresh kernel.
# This rebinds `model`, replacing the estimator fitted in the previous cell.
model = build_and_compile_model(normalizer,'relu')
# Fits for 200 epochs with verbose=0 and validation_split=0.2; the return value is kept in `history`.
history = model.fit(X_train,y_train,validation_split=0.2,verbose=0, epochs=200)

In [None]:
#same with Ridge and Lasso models
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
# Ridge regression with a fixed regularisation strength.
ridge_alpha = 100
ridge_regressor = Ridge(alpha=ridge_alpha)
# fit() returns the fitted estimator itself, so `model` is the same object.
model = ridge_regressor.fit(X_train, y_train)

In [None]:
# Lasso regression with a fixed regularisation strength.
lasso_alpha = 0.001
lasso_regressor = Lasso(alpha=lasso_alpha)
# fit() returns the fitted estimator itself, so `model` is the same object.
model = lasso_regressor.fit(X_train, y_train)

In [None]:
# Load the comments to be scored; later cells use its 'text' and 'comment_id' columns.
# NOTE(review): this is a Kaggle-style '../input/...' path, while validation_data.csv
# is read from the working directory -- confirm both locations.
subm = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv') 

In [None]:
# Name the text column like in the training frame, then add one zero-filled
# indicator column for every feature the model was trained on.
subm = subm.rename(columns={'text': 'the_original_sentence'})
subm = subm.assign(**{feature: 0 for feature in X_train.columns})

In [None]:
#fill the test set similarly as the training one
def getCounts2(sentence):
    """Mark, in `subm`, which model feature words occur in `sentence`.

    The sentence is split on single spaces and each token is normalised
    (lower-cased, stripped, reduced to alphanumeric characters). For every
    normalised token that is a column of `X_train`, that column is set to 1 on
    all rows of `subm` whose 'the_original_sentence' equals `sentence`.

    Membership is tested against `X_train.columns` (the features the model uses)
    rather than `final_train.columns`, so the assignment can never introduce a
    column into `subm` that the model was not trained on.

    Returns:
        None. `subm` is modified in place.
    """
    present = set()
    for token in sentence.split(" "):
        token = token.lower()
        # NOTE(review): strip("/n") removes leading/trailing '/' and 'n' characters,
        # not newlines. Kept as-is so tokens match the training feature names.
        token = token.strip("/n")
        token = ''.join(ch for ch in token if ch.isalnum())
        if token in X_train.columns:
            present.add(token)
    if present:
        # One row mask and one assignment per sentence instead of one per word.
        rows = subm.the_original_sentence == sentence
        subm.loc[rows, list(present)] = 1

# Each distinct text is processed once; the row mask covers any duplicates.
for sentence in subm['the_original_sentence'].unique():
    getCounts2(sentence)

In [None]:
# Select exactly the training features, in training order, so the matrix handed to
# predict() always lines up with X_train even if `subm` carries extra columns.
X_test = subm[list(X_train.columns)]

In [None]:
#RFR performed the best on the validation set, so it predicts values
#score on the private set was 0.726
# NOTE(review): that score was reported before the rescaling below was corrected; re-validate.
model = RandomForestRegressor(n_estimators=10000, max_depth=1000,max_features="sqrt", n_jobs=4, random_state=24)
model.fit(X_train, y_train)
y_prob = model.predict(X_test)
# Min-max scale the predictions to [0, 1]: (y - min) / (max - min).
# Without parentheses the expression parsed as y - (min / max) - min, which can go
# negative; taking abs() of that afterwards could reorder the predictions.
lowest = y_prob.min()
spread = y_prob.max() - lowest
if spread > 0:
    y_prob = (y_prob - lowest) / spread
else:
    # All predictions are equal: avoid dividing by zero, every score becomes 0.
    y_prob = y_prob - lowest
# Small offset so every submitted score is strictly positive.
y_prob = y_prob + 0.001
df = pd.DataFrame({'comment_id':subm['comment_id'],'score':y_prob.squeeze()})
df.to_csv('submission.csv',index=False)