In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
from nltk.corpus import stopwords
stopwords = nltk.corpus.stopwords.words('english')
from nltk.stem.porter import PorterStemmer
wn = nltk.WordNetLemmatizer()
ps = nltk.PorterStemmer()

import re 
import scipy
from scipy import sparse
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
warnings.filterwarnings("ignore")
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge, Lasso, BayesianRidge
import xgboost

In [2]:
train = pd.read_csv("train.csv")
comm_score = pd.read_csv("comments_to_score.csv")

In [3]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0,0,0,0,0,0


In [4]:
train.shape

(159571, 8)

In [5]:
comm_score.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether there is an """"anti-editorializing"""" policy here. There is, and it's called wikipedia:neutral point of view. It discusses at some length the case of what we should do when writing about a subject which most of us find repugnant. Whilst you're not like..."
1,732895,"Looks like be have an abuser , can you please look into this? thanks."
2,1139051,"I confess to having complete (and apparently blissful) ignorance of Jordan, but I've glanced at the article. Is this a woman or a soap opera!?. I don't think there was much to change in terms of the description of the various diseases. It is mentioned that she is famous for the size of her bre..."
3,1434512,"""\n\nFreud's ideas are certainly much discussed today, and I would be the first to agree that they must be grappled with and dealt with seriously, if only because of their currency. So Freud deserves a long and thorough article in Wikipedia. I believe that a balanced article would include A) e..."
4,2084821,It is not just you. This is a laundry list of stupid allegations scooped up from god-knows-where. Probably two-thirds of it has little basis in fact.


In [6]:
comm_score.shape

(7537, 2)

In [7]:

label_score = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5, 
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

for category in label_score:
    train[category] = train[category] * label_score[category]

train['score'] = train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

train['y'] = train['score']

min_len = (train['y'] > 0).sum()  # len of toxic comments
df_non_toxic = train[train['y'] == 0].sample(n=min_len, random_state=201)  # take non toxic comments
train_new = pd.concat([train[train['y'] > 0], df_non_toxic])  # make new df
train_new.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.32,1.5,0.16,0.0,0.64,0.0,2.62,2.62
12,0005c987bdfc9d4b,"Hey... what is it..\n@ | talk .\nWhat is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?\n\nAsk Sityush to clean up his behavior than issue...",0.32,0.0,0.0,0.0,0.0,0.0,0.32,0.32


In [8]:
train_new.shape

(32450, 10)

In [9]:
train_new['y'].value_counts()

0.00    16225
0.32     5666
1.12     3800
0.48     1758
2.62     1738
0.96     1215
4.12      385
0.16      317
0.64      301
1.82      290
1.98      204
0.80      181
2.46      164
1.50       76
2.14       31
5.62       31
3.32       21
2.30       20
3.48       10
3.96       10
1.66        5
4.82        1
3.96        1
Name: y, dtype: int64

In [10]:
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It's just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page. He seems to care more about the formatting than the actual info.",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of """"types of accidents"""" -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no...",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember what page that's on?",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
train = train.rename(columns={'comment_text':'comment'})

In [12]:
def text_cleaning(text):
    
    template = re.compile(r'https?://\S+|www\.\S+') #Removes website links
    text = template.sub(r'', text)
    
    soup = BeautifulSoup(text, 'lxml') #Removes HTML tags
    only_text = soup.get_text()
    text = only_text
    
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # emoticons
                               u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                               u"\U0001F680-\U0001F6FF"  # transport & map symbols
                               u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                               u"\U00002702-\U000027B0"
                               u"\U000024C2-\U0001F251"
                               "]+", flags=re.UNICODE)
    text = emoji_pattern.sub(r'', text)
    
    text = re.sub(r"[^a-zA-Z\d]", " ", text) #Remove special Charecters
    text = re.sub(' +', ' ', text) #Remove Extra Spaces
    text = text.strip() # remove spaces at the beginning and at the end of string

    return text

In [13]:
tqdm.pandas()
train['comment'] = train['comment'].progress_apply(text_cleaning)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=159571.0), HTML(value='')))




In [14]:
df = train.copy()

In [15]:
df.head()

Unnamed: 0,id,comment,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
0,0000997932d777bf,Explanation Why the edits made under my username Hardcore Metallica Fan were reverted They weren t vandalisms just closure on some GAs after I voted at New York Dolls FAC And please don t remove the template from the talk page since I m retired now 89 205 38 27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,D aww He matches this background colour I m seemingly stuck with Thanks talk 21 51 January 11 2016 UTC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,Hey man I m really not trying to edit war It s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page He seems to care more about the formatting than the actual info,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,More I can t make any real suggestions on improvement I wondered if the section statistics should be later on or a subsection of types of accidents I think the references may need tidying so that they are all in the exact same format ie date format etc I can do that later on if no one else does ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,You sir are my hero Any chance you remember what page that s on,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [16]:
df.head()

Unnamed: 0,id,comment,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
0,0000997932d777bf,Explanation Why the edits made under my username Hardcore Metallica Fan were reverted They weren t vandalisms just closure on some GAs after I voted at New York Dolls FAC And please don t remove the template from the talk page since I m retired now 89 205 38 27,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,000103f0d9cfb60f,D aww He matches this background colour I m seemingly stuck with Thanks talk 21 51 January 11 2016 UTC,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,000113f07ec002fd,Hey man I m really not trying to edit war It s just that this guy is constantly removing relevant information and talking to me through edits instead of my talk page He seems to care more about the formatting than the actual info,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0001b41b1c6bb37e,More I can t make any real suggestions on improvement I wondered if the section statistics should be later on or a subsection of types of accidents I think the references may need tidying so that they are all in the exact same format ie date format etc I can do that later on if no one else does ...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0001d958c54c6e35,You sir are my hero Any chance you remember what page that s on,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
df['y'].value_counts()

0.00    143346
0.32      5666
1.12      3800
0.48      1758
2.62      1738
0.96      1215
4.12       385
0.16       317
0.64       301
1.82       290
1.98       204
0.80       181
2.46       164
1.50        76
5.62        31
2.14        31
3.32        21
2.30        20
3.96        10
3.48        10
1.66         5
3.96         1
4.82         1
Name: y, dtype: int64

In [18]:
df['y'].value_counts(normalize=True)

0.00    0.898321
0.32    0.035508
1.12    0.023814
0.48    0.011017
2.62    0.010892
0.96    0.007614
4.12    0.002413
0.16    0.001987
0.64    0.001886
1.82    0.001817
1.98    0.001278
0.80    0.001134
2.46    0.001028
1.50    0.000476
5.62    0.000194
2.14    0.000194
3.32    0.000132
2.30    0.000125
3.96    0.000063
3.48    0.000063
1.66    0.000031
3.96    0.000006
4.82    0.000006
Name: y, dtype: float64

In [19]:
min_len = (df['y'] >= 0.1).sum()
df_non_toxic = df[df['y'] == 0].sample(n=min_len * 2, random_state=42)
df = pd.concat([df[df['y'] >= 0.1], df_non_toxic])
df['y'].value_counts()

0.00    32450
0.32     5666
1.12     3800
0.48     1758
2.62     1738
0.96     1215
4.12      385
0.16      317
0.64      301
1.82      290
1.98      204
0.80      181
2.46      164
1.50       76
5.62       31
2.14       31
3.32       21
2.30       20
3.48       10
3.96       10
1.66        5
4.82        1
3.96        1
Name: y, dtype: int64

In [41]:
df.head()

Unnamed: 0,id,comment,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.32,1.5,0.16,0.0,0.64,0.0,2.62,2.62
12,0005c987bdfc9d4b,Hey what is it talk What is it an exclusive group of some WP TALIBANS who are good at destroying self appointed purist who GANG UP any one who asks them questions abt their ANTI SOCIAL and DESTRUCTIVE non contribution at WP Ask Sityush to clean up his behavior than issue me nonsensical warnings,0.32,0.0,0.0,0.0,0.0,0.0,0.32,0.32
16,0007e25b2121310b,Bye Don t look come or think of comming back Tosser,0.32,0.0,0.0,0.0,0.0,0.0,0.32,0.32
42,001810bf8c45bf5f,You are gay or antisemmitian Archangel WHite Tiger Meow Greetingshhh Uh there are two ways why you do erased my comment about WW2 that holocaust was brutally slaying of Jews and not gays Gypsys Slavs anyone 1 If you are anti semitian than shave your head bald and go to the skinhead meetings 2 If...,0.32,0.0,0.16,0.0,0.64,1.5,2.62,2.62
43,00190820581d90ce,FUCK YOUR FILTHY MOTHER IN THE ASS DRY,0.32,0.0,0.16,0.0,0.64,0.0,1.12,1.12


In [20]:
vec = TfidfVectorizer(sublinear_tf=True, min_df= 3, max_df=0.5,max_features=50000)
X = vec.fit_transform(df['comment'])
X

<48675x25206 sparse matrix of type '<class 'numpy.float64'>'
	with 1774436 stored elements in Compressed Sparse Row format>

In [21]:
model = Ridge(alpha=0.5)
model.fit(X, df['y'])

Ridge(alpha=0.5)

In [31]:
#model = xgboost.XGBRegressor()
model = xgboost.XGBRegressor(n_estimators=1000, max_depth=7, eta=0.1, subsample=0.7, colsample_bytree=0.8)
model.fit(X, df['y'])

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.8, eta=0.1, gamma=0,
             gpu_id=-1, importance_type='gain', interaction_constraints='',
             learning_rate=0.100000001, max_delta_step=0, max_depth=7,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=1000, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [24]:
val_data = pd.read_csv("validation_data.csv")
val_data.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!!!!!!!!??????????????????????????????????????????????????????!!!!!!=\nWHER IS YOUR SEXY PIC GONE FROM YOUR MAIN PAGE???????? put it back
1,188,"""And yes, people should recognize that but they usually don't. One of the first objections you hear directed at someone who says he favors free markets is invariably """"There's no such thing as a free market."""" Ridiculously trivial. ""","Daphne Guinness \n\nTop of the mornin' my favourite Alien Life Form. I'm sorry, I can't marry you because you are a poor commoner so, therefore, unimportant. Can I have my dresses back. I hope your fat Oirish arse hasn't stretched them."
2,82,"Western Media?\n\nYup, because every crime in the entire world is caused by the west. Racists.","""Atom you don't believe actual photos of masturbation is sexually explicit photos. I am sorry to say they are sexually explicit and so you said yourself that the disclaimer is used for that purpose. Also, Wikipedia itself is targeted for minors. In addition, you say that Wikipedia does not have ..."
3,347,"And you removed it! You numbskull! I don't care what you say anymore, this is my life! Go ahead with your own life, leave me alone! —","You seem to have sand in your vagina.\n\nMight want to wash that stinking hole out, bitch."
4,539,smelly vagina \n\nBluerasberry why don't you be a model for a cheesy blue vagina syndrome. A lot are experiencing this vaginal disease.,"hey \n\nway to support nazis, you racist"


In [25]:
tqdm.pandas()
val_data['less_toxic'] = val_data['less_toxic'].progress_apply(text_cleaning)
val_data['more_toxic'] = val_data['more_toxic'].progress_apply(text_cleaning)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30108.0), HTML(value='')))




HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=30108.0), HTML(value='')))




In [26]:
X_less_toxic = vec.transform(val_data['less_toxic'])
X_more_toxic = vec.transform(val_data['more_toxic'])

In [32]:
p_less = model.predict(X_less_toxic)
p_more = model.predict(X_more_toxic)

In [33]:
tqdm.pandas()
comm_score['text'] = comm_score['text'].progress_apply(text_cleaning)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=7537.0), HTML(value='')))




In [36]:
X_test = vec.transform(comm_score['text'])
p_test = model.predict(X_test)

In [37]:
comm_score['score'] = p_test

In [38]:
comm_score['score'].count()

7537

In [39]:
comm_score[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [40]:
comm_score.head()

Unnamed: 0,comment_id,text,score
0,114890,Gjalexei you asked about whether there is an anti editorializing policy here There is and it s called wikipedia neutral point of view It discusses at some length the case of what we should do when writing about a subject which most of us find repugnant Whilst you re not likely to get too many de...,-0.020737
1,732895,Looks like be have an abuser can you please look into this thanks,0.011748
2,1139051,I confess to having complete and apparently blissful ignorance of Jordan but I ve glanced at the article Is this a woman or a soap opera I don t think there was much to change in terms of the description of the various diseases It is mentioned that she is famous for the size of her breasts am I ...,0.002676
3,1434512,Freud s ideas are certainly much discussed today and I would be the first to agree that they must be grappled with and dealt with seriously if only because of their currency So Freud deserves a long and thorough article in Wikipedia I believe that a balanced article would include A explanations ...,0.025596
4,2084821,It is not just you This is a laundry list of stupid allegations scooped up from god knows where Probably two thirds of it has little basis in fact,0.4376
