In [1]:
import re
from collections import defaultdict

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Ridge

In [2]:
# Load the comments that receive a predicted score for the submission later in
# the notebook, then preview the first rows.
comment_to_score = pd.read_csv('../input/jigsaw-toxic-severity-rating/comments_to_score.csv')
comment_to_score.head()

Unnamed: 0,comment_id,text
0,114890,"""\n \n\nGjalexei, you asked about whether ther..."
1,732895,"Looks like be have an abuser , can you please ..."
2,1139051,I confess to having complete (and apparently b...
3,1434512,"""\n\nFreud's ideas are certainly much discusse..."
4,2084821,It is not just you. This is a laundry list of ...


In [3]:
# (rows, columns) of the frame holding the comments to score.
comment_to_score.shape

(7537, 2)

In [4]:
# Load the validation pairs: each row has a `worker` value plus a `less_toxic`
# and a `more_toxic` comment. Used further down to measure how often the model
# ranks the pair in the labelled order.
val_data = pd.read_csv('../input/jigsaw-toxic-severity-rating/validation_data.csv')
val_data.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!...
1,188,"""And yes, people should recognize that but the...",Daphne Guinness \n\nTop of the mornin' my fav...
2,82,"Western Media?\n\nYup, because every crime in...","""Atom you don't believe actual photos of mastu..."
3,347,And you removed it! You numbskull! I don't car...,You seem to have sand in your vagina.\n\nMight...
4,539,smelly vagina \n\nBluerasberry why don't you ...,"hey \n\nway to support nazis, you racist"


In [5]:
# (rows, columns) of the validation pairs.
val_data.shape

(30108, 3)

In [6]:
# Number of distinct values in the `worker` column.
val_data['worker'].nunique()

753

In [7]:
# Load the labelled comments from the toxic-comment-classification dataset;
# the regression target `y` is derived from its six label columns below.
train = pd.read_csv('../input/jigsaw-toxic-comment-classification-challenge/train.csv')
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [8]:
# Mean of the `toxic` column. If the column is a 0/1 flag (the preview above
# suggests so), this is the share of rows flagged.
train['toxic'].mean()

0.09584448302009764

In [9]:
# Mean of the `obscene` column (share of flagged rows if the column is 0/1).
train['obscene'].mean()

0.052948217407925

In [10]:
# Mean of the `insult` column (share of flagged rows if the column is 0/1).
train['insult'].mean()

0.04936360616904074

In [11]:
# Keep only the rows in which none of the six label columns equals 1.
label_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
unlabelled_in_all = train[(train[label_columns] != 1).all(axis=1)]

In [12]:
# Sample one comment that has none of the six labels set. The `[3]` lookup is
# by index label (the original row label from `train`), not by position.
unlabelled_in_all['comment_text'][3]

'"\nMore\nI can\'t make any real suggestions on improvement - I wondered if the section statistics should be later on, or a subsection of ""types of accidents""  -I think the references may need tidying so that they are all in the exact same format ie date format etc. I can do that later on, if no-one else does first - if you have any preferences for formatting style on references or want to do it yourself please let me know.\n\nThere appears to be a backlog on articles for review so I guess there may be a delay until a reviewer turns up. It\'s listed in the relevant form eg Wikipedia:Good_article_nominations#Transport  "'

In [13]:
# Look up a single row by its `id` to see a comment with several labels set.
train[train['id'] == '0002bcb3da6cb337']

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,1,1,1,0,1,0


In [14]:
# Multiplier applied to each label column before the columns are summed into
# the single regression target `y`.
cat_mtpl = {
    'obscene': 0.64,
    'toxic': 0.16,
    'threat': 1.5,
    'insult': 1.5,
    'severe_toxic': 0.32,
    'identity_hate': 1.5,
}

# NOTE: the label columns of `train` are overwritten with their weighted
# values, so running this cell a second time applies the weights again.
# Re-load `train` before re-running.
for category, weight in cat_mtpl.items():
    train[category] = train[category].mul(weight)

# Target: row-wise sum over the column slice 'toxic' .. 'identity_hate'.
train['y'] = train.loc[:, 'toxic':'identity_hate'].sum(axis=1)

In [15]:
# Balance the training set: keep every row with a positive target and draw the
# same number of zero-target rows (fixed seed so the draw is repeatable).
has_positive_target = train['y'] > 0
min_len = has_positive_target.sum()
train_undersample = train[train['y'] == 0].sample(n=min_len, random_state=2020)
train_new = pd.concat([train[has_positive_target], train_undersample])

In [16]:
# Preview the balanced frame; the label columns now hold weighted values and
# `y` holds their row sum.
train_new.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,y
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.16,0.32,0.64,0.0,1.5,0.0,2.62
12,0005c987bdfc9d4b,Hey... what is it..\n@ | talk .\nWhat is it......,0.16,0.0,0.0,0.0,0.0,0.0,0.16
16,0007e25b2121310b,"Bye! \n\nDon't look, come or think of comming ...",0.16,0.0,0.0,0.0,0.0,0.0,0.16
42,001810bf8c45bf5f,You are gay or antisemmitian? \n\nArchangel WH...,0.16,0.0,0.64,0.0,1.5,1.5,3.8
43,00190820581d90ce,"FUCK YOUR FILTHY MOTHER IN THE ASS, DRY!",0.16,0.0,0.64,0.0,1.5,0.0,2.3


In [17]:
# The patterns are compiled once here instead of inside the function, because
# text_cleaning is applied row by row to several large text columns.
_URL_PATTERN = re.compile(r'https?://\S+|www\.\S+')
# The final range (U+24C2..U+1F251) is very wide and also covers many
# non-emoji code points (e.g. CJK ideographs); characters matched here are
# deleted outright rather than replaced by a space.
_EMOJI_PATTERN = re.compile("["
                            u"\U0001F600-\U0001F64F"  # emoticons
                            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                            u"\U0001F680-\U0001F6FF"  # transport & map symbols
                            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            u"\U00002702-\U000027B0"
                            u"\U000024C2-\U0001F251"
                            "]+", flags=re.UNICODE)
# In a str pattern `\d` also matches non-ASCII decimal digits, so those are kept.
_NON_ALNUM_PATTERN = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_PATTERN = re.compile(' +')


def text_cleaning(text):
    '''
    Clean a piece of text into a basic form for NLP.

    Steps, in the order they are applied:
    1. Remove URLs (``http://``, ``https://`` and ``www.`` links).
    2. Strip HTML markup, keeping only the text content.
    3. Delete characters in the emoji / pictograph ranges.
    4. Replace every character that is not a letter a-z/A-Z or a digit
       with a space (this also turns newlines and tabs into spaces).
    5. Collapse runs of spaces and trim both ends.

    Letter case is preserved.

    text - Text piece to be cleaned. Missing values (None or float NaN, as
           pandas uses for empty cells) are treated as empty text.

    Returns the cleaned string; '' for a missing value.
    '''
    # Missing cells reach this function as None/NaN when it is applied to a
    # pandas column; without this guard the first regex call raises TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = _URL_PATTERN.sub(r'', text)

    # Parse as HTML and keep only the text nodes.
    text = BeautifulSoup(text, 'lxml').get_text()

    text = _EMOJI_PATTERN.sub(r'', text)

    text = _NON_ALNUM_PATTERN.sub(" ", text)
    text = _MULTI_SPACE_PATTERN.sub(' ', text)

    return text.strip()

In [18]:
# Distribution of the distinct target values in the balanced training frame.
train_new['y'].value_counts()

0.00    16225
0.16     5666
2.30     3846
0.80     1758
1.66     1464
2.62      999
3.80      749
1.50      377
4.12      329
0.64      317
2.14      186
1.12      158
3.16      157
5.30       56
0.48       41
5.62       31
3.00       31
1.98       28
3.64       20
3.48        9
4.66        3
Name: y, dtype: int64

In [19]:
# Clean every training comment; the column is overwritten with the cleaned text.
train_new['comment_text'] = train_new['comment_text'].map(text_cleaning)

In [20]:
# TF-IDF over character n-grams (3 to 5 characters, built inside word
# boundaries). N-grams seen in fewer than 3 documents or in more than half of
# the documents are dropped.
vect = TfidfVectorizer(
    analyzer='char_wb',
    ngram_range=(3, 5),
    min_df=3,
    max_df=0.5,
)
X_train = vect.fit_transform(train_new['comment_text'])

In [21]:
# Ridge regression from the TF-IDF features to the weighted target `y`.
# (`Ridge` is imported in the notebook's import cell at the top.)
model = Ridge(alpha=0.5)
model.fit(X_train, train_new['y'])

Ridge(alpha=0.5)

In [22]:
# Apply the same cleaning used for the training text to both sides of each
# validation pair (columns are overwritten).
for pair_column in ('less_toxic', 'more_toxic'):
    val_data[pair_column] = val_data[pair_column].apply(text_cleaning)

In [23]:
# Vectorise both sides of the validation pairs with the vocabulary and IDF
# weights fitted on the training text.
less_toxic_val, more_toxic_val = (
    vect.transform(val_data[pair_column])
    for pair_column in ('less_toxic', 'more_toxic')
)

In [24]:
# Predicted score for each side of every validation pair.
less_toxic_preds, more_toxic_preds = map(
    model.predict, (less_toxic_val, more_toxic_val)
)

In [25]:
# Share of validation pairs where the `more_toxic` comment gets the strictly
# higher predicted score.
np.mean(more_toxic_preds > less_toxic_preds)

0.6694566228244985

In [26]:
# Load the sample submission as a template; its `score` column is overwritten
# with model predictions below.
sub = pd.read_csv('../input/jigsaw-toxic-severity-rating/sample_submission.csv')
sub.head()

Unnamed: 0,comment_id,score
0,114890,0.5
1,732895,0.5
2,1139051,0.5
3,1434512,0.5
4,2084821,0.5


In [27]:
# Clean and vectorise the comments to score with the fitted vectoriser.
comment_to_score['text'] = comment_to_score['text'].apply(text_cleaning)
X_test = vect.transform(comment_to_score['text'])

# Predictions are written into `sub` by position, so the two files must list
# the same comment ids in the same order. Fail loudly instead of silently
# attaching scores to the wrong comments.
assert np.array_equal(
    sub['comment_id'].to_numpy(), comment_to_score['comment_id'].to_numpy()
), "sample_submission.csv and comments_to_score.csv do not list the same comment_id values in the same order"

sub['score'] = model.predict(X_test)
sub.head()

Unnamed: 0,comment_id,score
0,114890,0.153997
1,732895,0.244331
2,1139051,0.107895
3,1434512,-0.267452
4,2084821,0.320342


In [28]:
# Write only the id and score columns to the submission file, without the
# DataFrame index.
sub[['comment_id', 'score']].to_csv("submission.csv", index=False)