TFIDF_Ridge_simple_baseline


Data from [Toxic Comment Classification Challenge](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge)


In [None]:
# Requires Kaggle API credentials (a kaggle.json file, or the environment-variable method);
# without them the CLI aborts with the OSError recorded in this cell's output.
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Traceback (most recent call last):
  File "/usr/local/bin/kaggle", line 5, in <module>
    from kaggle.cli import main
  File "/usr/local/lib/python3.7/dist-packages/kaggle/__init__.py", line 23, in <module>
    api.authenticate()
  File "/usr/local/lib/python3.7/dist-packages/kaggle/api/kaggle_api_extended.py", line 166, in authenticate
    self.config_file, self.config_dir))
OSError: Could not find kaggle.json. Make sure it's located in /root/.kaggle. Or use the environment method.


# Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
from bs4 import BeautifulSoup
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

import re 
import scipy
from scipy import sparse

from IPython.display import display
from pprint import pprint
from matplotlib import pyplot as plt 

import time
import scipy.optimize as optimize
import warnings
# Suppresses every warning for the rest of the session (not just one category).
warnings.filterwarnings("ignore")
# Display options: show up to 300 characters per cell and up to 100 columns.
pd.options.display.max_colwidth=300
pd.options.display.max_columns = 100

from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from sklearn.linear_model import Ridge

# Prepare train data

In [None]:
# Training comments with the label columns toxic ... identity_hate.
df_train = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/train.csv")
# Disabled alternative inputs, kept for reference:
# df_test = pd.read_csv("../input/jigsaw-unintended-bias-in-toxicity-classification/train.csv")
# df_test_label = pd.read_csv("../input/jigsaw-toxic-comment-classification-challenge/test_labels.csv").replace(-1,0)
# Comments to score for the submission; the same file is read again in the
# "Prepare submission data" section before it is used.
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")

In [None]:
# Preview the first two training rows (id, comment_text and the six label columns).
df_train.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"Explanation\nWhy the edits made under my username Hardcore Metallica Fan were reverted? They weren't vandalisms, just closure on some GAs after I voted at New York Dolls FAC. And please don't remove the template from the talk page since I'm retired now.89.205.38.27",0,0,0,0,0,0
1,000103f0d9cfb60f,"D'aww! He matches this background colour I'm seemingly stuck with. Thanks. (talk) 21:51, January 11, 2016 (UTC)",0,0,0,0,0,0


In [None]:
# Create a score that measures how toxic a comment is: a weighted sum of its
# label columns. threat, severe_toxic and identity_hate carry the largest
# weight (1.5 each); obscene carries the smallest (0.16).
cat_mtpl = {'obscene': 0.16, 'toxic': 0.32, 'threat': 1.5,
            'insult': 0.64, 'severe_toxic': 1.5, 'identity_hate': 1.5}

# Fail fast if a weighted label column is absent from the frame.
missing_labels = [col for col in cat_mtpl if col not in df_train.columns]
if missing_labels:
    raise KeyError(f"label columns missing from df_train: {missing_labels}")

# Apply the weights to a temporary frame rather than overwriting the label
# columns of df_train: multiplying the columns in place would compound the
# weights (and silently change 'score') every time this cell is re-run.
# Columns are taken in frame order and the weights are indexed identically,
# so the multiplication aligns without reordering anything.
label_cols = [col for col in df_train.columns if col in cat_mtpl]
label_weights = pd.Series({col: cat_mtpl[col] for col in label_cols})
df_train['score'] = df_train[label_cols].mul(label_weights, axis='columns').sum(axis=1)

# 'y' is the regression target used by the rest of the notebook.
df_train['y'] = df_train['score']

# Balanced preview: every comment with a positive score plus an equally sized
# random sample of zero-score comments. The frame used for training is rebuilt
# from the cleaned text in the "Undersampling" section; df_train_new is not
# referenced again in the cells visible in this notebook.
min_len = (df_train['y'] > 0).sum()  # number of toxic comments
df_y0_undersample = df_train[df_train['y'] == 0].sample(n=min_len, random_state=201)  # non-toxic sample
df_train_new = pd.concat([df_train[df_train['y'] > 0], df_y0_undersample])
df_train_new.head(2)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,score,y
6,0002bcb3da6cb337,COCKSUCKER BEFORE YOU PISS AROUND ON MY WORK,0.32,1.5,0.16,0.0,0.64,0.0,2.62,2.62
12,0005c987bdfc9d4b,"Hey... what is it..\n@ | talk .\nWhat is it... an exclusive group of some WP TALIBANS...who are good at destroying, self-appointed purist who GANG UP any one who asks them questions abt their ANTI-SOCIAL and DESTRUCTIVE (non)-contribution at WP?\n\nAsk Sityush to clean up his behavior than issue...",0.32,0.0,0.0,0.0,0.0,0.0,0.32,0.32


In [None]:
# Rename the comment column to 'text', the column name the cleaning and TF-IDF cells read.
df_train = df_train.rename(columns={'comment_text':'text'})

<h3>Text Cleaning</h3>

In [None]:
# Patterns are compiled once when this cell runs instead of on every call;
# text_cleaning is applied to each row of the training and submission frames.
_URL_RE = re.compile(r'https?://\S+|www\.\S+')
# NOTE: the last range (U+24C2 - U+1F251) spans far more than emoji (for
# example the CJK blocks). It is kept unchanged so cleaned text stays the same.
_EMOJI_RE = re.compile("["
                       u"\U0001F600-\U0001F64F"  # emoticons
                       u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                       u"\U0001F680-\U0001F6FF"  # transport & map symbols
                       u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                       u"\U00002702-\U000027B0"
                       u"\U000024C2-\U0001F251"
                       "]+", flags=re.UNICODE)
_NON_ALNUM_RE = re.compile(r"[^a-zA-Z\d]")
_MULTI_SPACE_RE = re.compile(' +')


def text_cleaning(text):
    '''
    Cleans text into a basic form for NLP. Steps, in the order they are applied:
    1. Removes embedded URL links (http://, https:// and www. prefixes)
    2. Removes HTML tags (keeps only the text BeautifulSoup extracts, 'lxml' parser)
    3. Removes emojis (characters in the code-point ranges of _EMOJI_RE);
       they are deleted, not replaced by a space
    4. Replaces every character other than a-z, A-Z and digits with a space
    5. Collapses runs of spaces and strips leading/trailing spaces

    text - Text piece to be cleaned. None or a float NaN (a missing value)
           yields an empty string instead of raising a TypeError.

    Returns the cleaned string.
    '''
    # Missing values: `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = _URL_RE.sub(r'', text)                  # website links
    text = BeautifulSoup(text, 'lxml').get_text()  # HTML tags
    text = _EMOJI_RE.sub(r'', text)                # emojis
    text = _NON_ALNUM_RE.sub(" ", text)            # special characters -> space
    text = _MULTI_SPACE_RE.sub(' ', text)          # extra spaces
    return text.strip()

In [None]:
# Register tqdm's progress_apply on pandas objects, then clean every training comment
# (the 'text' column is overwritten with the cleaned version).
tqdm.pandas()
df_train['text'] = df_train['text'].progress_apply(text_cleaning)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [None]:
# Work on a copy so the undersampling below leaves df_train intact.
df = df_train.copy()

In [None]:
# Row count per distinct target value. A value can appear twice in the recorded output
# (3.96) because the scores are float sums that can differ in the last bits.
df['y'].value_counts()

0.00    143346
0.32      5666
1.12      3800
0.48      1758
2.62      1738
0.96      1215
4.12       385
0.16       317
0.64       301
1.82       290
1.98       204
0.80       181
2.46       164
1.50        76
2.14        31
5.62        31
3.32        21
2.30        20
3.48        10
3.96        10
1.66         5
3.96         1
4.82         1
Name: y, dtype: int64

# Undersampling

In [None]:
# Same distribution as fractions; in the recorded output about 90% of rows have y == 0,
# which is the imbalance the undersampling below addresses.
df['y'].value_counts(normalize=True)

0.00    0.898321
0.32    0.035508
1.12    0.023814
0.48    0.011017
2.62    0.010892
0.96    0.007614
4.12    0.002413
0.16    0.001987
0.64    0.001886
1.82    0.001817
1.98    0.001278
0.80    0.001134
2.46    0.001028
1.50    0.000476
2.14    0.000194
5.62    0.000194
3.32    0.000132
2.30    0.000125
3.48    0.000063
3.96    0.000063
1.66    0.000031
3.96    0.000006
4.82    0.000006
Name: y, dtype: float64

In [None]:
# Balance the training frame: keep every comment whose score is at least 0.1
# and draw the same number of zero-score comments at random (fixed seed).
toxic_mask = df['y'] >= 0.1
min_len = toxic_mask.sum()
df_y0_undersample = df[df['y'] == 0].sample(n=min_len, random_state=201)
df = pd.concat([df[toxic_mask], df_y0_undersample])
df['y'].value_counts()

0.00    16225
0.32     5666
1.12     3800
0.48     1758
2.62     1738
0.96     1215
4.12      385
0.16      317
0.64      301
1.82      290
1.98      204
0.80      181
2.46      164
1.50       76
5.62       31
2.14       31
3.32       21
2.30       20
3.48       10
3.96       10
1.66        5
3.96        1
4.82        1
Name: y, dtype: int64

# TF-IDF

In [None]:
# Character n-gram TF-IDF features built from the cleaned text.
# Alternative settings left for reference (same min_df / max_df / analyzer,
# no max_features): ngram_range = (3,5) and ngram_range = (3,6).
vec = TfidfVectorizer(
    analyzer='char_wb',    # character n-grams taken inside word boundaries
    ngram_range=(2, 5),
    min_df=3,              # drop n-grams seen in fewer than 3 comments
    max_df=0.5,            # drop n-grams seen in more than half of the comments
    max_features=46000,    # cap on the vocabulary size
)
X = vec.fit_transform(df['text'])
X

<32450x46000 sparse matrix of type '<class 'numpy.float64'>'
	with 13716685 stored elements in Compressed Sparse Row format>

<h1>Fit Ridge</h1>

In [None]:
# Ridge regression from the TF-IDF features to the weighted toxicity score y.
model = Ridge(alpha=0.5)
# model = Ridge(alpha=0.485)
model.fit(X, df['y'])

Ridge(alpha=0.5)

# Prepare validation data

In [None]:
# Annotated comment pairs (columns worker, less_toxic, more_toxic). The cells that would
# score these pairs are commented out below, so df_val is only previewed in this notebook.
df_val = pd.read_csv("../input/jigsaw-toxic-severity-rating/validation_data.csv")

In [None]:
# Preview the first validation pairs.
df_val.head()

Unnamed: 0,worker,less_toxic,more_toxic
0,313,This article sucks \n\nwoo woo wooooooo,WHAT!!!!!!!!?!?!!?!?!!?!?!?!?!!!!!!!!!!!!!!!!!!!!!!!!??????????????????????????????????????????????????????!!!!!!=\nWHER IS YOUR SEXY PIC GONE FROM YOUR MAIN PAGE???????? put it back
1,188,"""And yes, people should recognize that but they usually don't. One of the first objections you hear directed at someone who says he favors free markets is invariably """"There's no such thing as a free market."""" Ridiculously trivial. ""","Daphne Guinness \n\nTop of the mornin' my favourite Alien Life Form. I'm sorry, I can't marry you because you are a poor commoner so, therefore, unimportant. Can I have my dresses back. I hope your fat Oirish arse hasn't stretched them."
2,82,"Western Media?\n\nYup, because every crime in the entire world is caused by the west. Racists.","""Atom you don't believe actual photos of masturbation is sexually explicit photos. I am sorry to say they are sexually explicit and so you said yourself that the disclaimer is used for that purpose. Also, Wikipedia itself is targeted for minors. In addition, you say that Wikipedia does not have ..."
3,347,"And you removed it! You numbskull! I don't care what you say anymore, this is my life! Go ahead with your own life, leave me alone! —","You seem to have sand in your vagina.\n\nMight want to wash that stinking hole out, bitch."
4,539,smelly vagina \n\nBluerasberry why don't you be a model for a cheesy blue vagina syndrome. A lot are experiencing this vaginal disease.,"hey \n\nway to support nazis, you racist"


<h2>Text cleaning</h2>

In [None]:
# tqdm.pandas()
# df_val['less_toxic'] = df_val['less_toxic'].progress_apply(text_cleaning)
# df_val['more_toxic'] = df_val['more_toxic'].progress_apply(text_cleaning)

In [None]:
# X_less_toxic = vec.transform(df_val['less_toxic'])
# X_more_toxic = vec.transform(df_val['more_toxic'])

In [None]:
# p1 = model.predict(X_less_toxic)
# p2 = model.predict(X_more_toxic)

In [None]:
# # Validation Accuracy
# (p1 < p2).mean()

# Prepare submission data 

In [None]:
# Comments to score for the submission (columns comment_id and text).
df_sub = pd.read_csv("../input/jigsaw-toxic-severity-rating/comments_to_score.csv")


<h2>Text cleaning</h2>

In [None]:
# Apply the same cleaning that was applied to the training text, so the fitted
# vectorizer receives input in the same form.
tqdm.pandas()
df_sub['text'] = df_sub['text'].progress_apply(text_cleaning)

  0%|          | 0/7537 [00:00<?, ?it/s]

<h2>Prediction</h2>

In [None]:
# Reuse the vectorizer fitted on the training text (transform only, no refit),
# then predict one score per comment.
X_test = vec.transform(df_sub['text'])
p3 = model.predict(X_test)

In [None]:
# Attach the predictions to the submission frame.
df_sub['score'] = p3

In [None]:
# Number of non-missing scores.
df_sub['score'].count()

7537

In [None]:
# No-op: the column is assigned to itself, so the scores are unchanged.
# NOTE(review): looks like a leftover hook for post-processing the scores — confirm or remove.
df_sub['score'] = df_sub['score'] 

In [None]:
# Comments that receive identical scores cannot be ranked against each other.
# In the recorded run there are 7537 scores but only 7517 distinct values,
# i.e. 20 fewer distinct values than rows.
df_sub['score'].nunique()

7517

<h2>Prepare submission file</h2>

In [None]:
# Write only the id and score columns, without the index, to submission.csv.
df_sub[['comment_id', 'score']].to_csv("submission.csv", index=False)

In [None]:
# Final look at the scored comments (the display abbreviates the middle rows).
df_sub

Unnamed: 0,comment_id,text,score
0,114890,Gjalexei you asked about whether there is an anti editorializing policy here There is and it s called wikipedia neutral point of view It discusses at some length the case of what we should do when writing about a subject which most of us find repugnant Whilst you re not likely to get too many de...,-0.034033
1,732895,Looks like be have an abuser can you please look into this thanks,0.258731
2,1139051,I confess to having complete and apparently blissful ignorance of Jordan but I ve glanced at the article Is this a woman or a soap opera I don t think there was much to change in terms of the description of the various diseases It is mentioned that she is famous for the size of her breasts am I ...,0.016812
3,1434512,Freud s ideas are certainly much discussed today and I would be the first to agree that they must be grappled with and dealt with seriously if only because of their currency So Freud deserves a long and thorough article in Wikipedia I believe that a balanced article would include A explanations ...,-0.105424
4,2084821,It is not just you This is a laundry list of stupid allegations scooped up from god knows where Probably two thirds of it has little basis in fact,0.264088
...,...,...,...
7532,504235362,Go away you annoying vandal,0.242439
7533,504235566,This user is a vandal,0.274774
7534,504308177,Sorry to sound like a pain but one by following me here is a tad stalking in all honesty but I m not too bothered about that But the main worry is that you have just assumed badly that I was referring about you when it is clear I wasn t One should never jump to conclusions without checking facts...,-0.351585
7535,504570375,Well it s pretty fucking irrelevant now I m unblocked aint it 81 108 7 13,0.264866
