In [12]:
!rm -r ~/.kaggle
!mkdir ~/.kaggle
!mv ./kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [15]:
!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge

Downloading jigsaw-toxic-comment-classification-challenge.zip to /content
 87% 46.0M/52.6M [00:00<00:00, 91.0MB/s]
100% 52.6M/52.6M [00:00<00:00, 92.6MB/s]


In [16]:
!unzip '/content/jigsaw-toxic-comment-classification-challenge.zip'

Archive:  /content/jigsaw-toxic-comment-classification-challenge.zip
  inflating: sample_submission.csv.zip  
  inflating: test.csv.zip            
  inflating: test_labels.csv.zip     
  inflating: train.csv.zip           


In [17]:
!unzip '/content/train.csv.zip'

Archive:  /content/train.csv.zip
  inflating: train.csv               


In [18]:
!unzip '/content/test.csv.zip'

Archive:  /content/test.csv.zip
  inflating: test.csv                


In [51]:
!unzip '/content/test_labels.csv.zip'

Archive:  /content/test_labels.csv.zip
  inflating: test_labels.csv         


## Import Libraries

In [58]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

## Data set Preparation

In [59]:
dataset = pd.read_csv('/content/train.csv')

In [60]:
dataset.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
119351,7e10902c78514aa2,Jeffpw and Isaac \n\nARE BOTH DEATH\n\nLOL!,1,0,0,0,0,0
87725,eaacef7f56a59ada,YOU'RE A KIKE ALRIGHT.,0,0,0,0,0,0
11342,1e0a67e5e212404c,There is a user by the name of Mike Rosoft who...,1,0,0,0,0,0
76306,cc48a5ea8531fa3b,facts at the time still stick,0,0,0,0,0,0
56371,969c621e9b9dbeb3,Pandæmonium is such a cool word.,0,0,0,0,0,0


In [61]:
def preprocess_text(text):
    text = re.sub(r'\d+', '', text) 
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\n', '', text)
    text = text.lower()
    
    return text

In [62]:
dataset['comment_text'] = dataset['comment_text'].apply(preprocess_text)

In [63]:
dataset.sample(5)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
139619,eb33593d60624332,the second quote is nearly incomprehensible p...,0,0,0,0,0,0
104800,30ae0b0b6dc48558,it has long been established that tabloids are...,0,0,0,0,0,0
51307,8947f2c12aea0a32,if noone beats me to it ill knock something up...,0,0,0,1,0,0
26438,460a03917de648a5,why not wait for all the details and tests to...,0,0,0,0,0,0
61892,a5a296e0bf373c63,one is a paki and the other is a hindu i feel ...,0,0,0,0,0,0


## X_train & y_train

In [64]:
X_train = dataset["comment_text"]
y_train = dataset[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [65]:
X_train

0         explanationwhy the edits made under my usernam...
1         daww he matches this background colour im seem...
2         hey man im really not trying to edit war its j...
3         morei cant make any real suggestions on improv...
4         you sir are my hero any chance you remember wh...
                                ...                        
159566    and for the second time of asking when your vi...
159567    you should be ashamed of yourself that is a ho...
159568    spitzer umm theres no actual article for prost...
159569    and it looks like it was actually you who put ...
159570    and  i really dont think you understand  i cam...
Name: comment_text, Length: 159571, dtype: object

In [66]:
y_train

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0,0,0,0,0,0
1,0,0,0,0,0,0
2,0,0,0,0,0,0
3,0,0,0,0,0,0
4,0,0,0,0,0,0
...,...,...,...,...,...,...
159566,0,0,0,0,0,0
159567,0,0,0,0,0,0
159568,0,0,0,0,0,0
159569,0,0,0,0,0,0


In [67]:
test_dataset = pd.read_csv('/content/test.csv')

In [68]:
test_dataset

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...
1,0000247867823ef7,== From RfC == \n\n The title is fine as it is...
2,00013b17ad220c46,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,00017563c3f7919a,":If you have a look back at the source, the in..."
4,00017695ad8997eb,I don't anonymously edit articles at all.
...,...,...
153159,fffcd0960ee309b5,". \n i totally agree, this stuff is nothing bu..."
153160,fffd7a9a6eb32c16,== Throw from out field to home plate. == \n\n...
153161,fffda9e8d6fafa9e,""" \n\n == Okinotorishima categories == \n\n I ..."
153162,fffe8f1340a79fc2,""" \n\n == """"One of the founding nations of the..."


In [69]:
test_dataset['comment_text'] = test_dataset['comment_text'].apply(preprocess_text)

In [70]:
test_dataset

Unnamed: 0,id,comment_text
0,00001cee341fdb12,yo bitch ja rule is more succesful then youll ...
1,0000247867823ef7,from rfc the title is fine as it is imo
2,00013b17ad220c46,sources zawe ashton on lapland
3,00017563c3f7919a,if you have a look back at the source the info...
4,00017695ad8997eb,i dont anonymously edit articles at all
...,...,...
153159,fffcd0960ee309b5,i totally agree this stuff is nothing but to...
153160,fffd7a9a6eb32c16,throw from out field to home plate does it ...
153161,fffda9e8d6fafa9e,okinotorishima categories i see your chan...
153162,fffe8f1340a79fc2,one of the founding nations of the eu germ...


## X_test, Y-test

In [71]:
X_test = test_dataset['comment_text']

In [72]:
X_test

0         yo bitch ja rule is more succesful then youll ...
1                 from rfc   the title is fine as it is imo
2                    sources    zawe ashton on lapland     
3         if you have a look back at the source the info...
4                   i dont anonymously edit articles at all
                                ...                        
153159      i totally agree this stuff is nothing but to...
153160     throw from out field to home plate   does it ...
153161       okinotorishima categories   i see your chan...
153162       one of the founding nations of the eu  germ...
153163      stop already your bullshit is not welcome he...
Name: comment_text, Length: 153164, dtype: object

In [73]:
test_label = pd.read_csv('/content/test_labels.csv')

In [74]:
y_test = test_label[["toxic", "severe_toxic", "obscene", "threat", "insult", "identity_hate"]]

In [75]:
y_test.sample(5)

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
107776,-1,-1,-1,-1,-1,-1
60600,0,0,0,0,0,0
90065,0,0,0,0,0,0
113186,0,0,0,0,0,0
103046,0,0,0,0,0,0


## Tfidf Vectorization

In [76]:
vectorizer = TfidfVectorizer()
X_train_vectors = vectorizer.fit_transform(X_train)
X_test_vectors = vectorizer.transform(X_test)

## Model Building and fitting

In [78]:
model = MultiOutputClassifier(LogisticRegression())
model.fit(X_train_vectors, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

## Prediction

In [81]:
y_pred = model.predict(X_test_vectors)

In [82]:
y_pred

array([[1, 0, 1, 0, 1, 0],
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0],
       [1, 0, 1, 0, 0, 0]])

## Accuracy

In [85]:
accuracy = []
for i, label in enumerate(y_test.columns):
    acc = accuracy_score(y_test[label], y_pred[:, i])
    accuracy.append(acc)
    print(f"Accuracy for {label}: {acc}")

Accuracy for toxic: 0.39161291165025724
Accuracy for severe_toxic: 0.4149277898200622
Accuracy for obscene: 0.40393956804471026
Accuracy for threat: 0.4164294481732
Accuracy for insult: 0.4023856780966807
Accuracy for identity_hate: 0.41361547099840695


## Classification Report

In [86]:
for i, label in enumerate(y_test.columns):
    print(f"\nClassification Report for {label}:")
    report = classification_report(y_test[label], y_pred[:, i])
    print(report)


Classification Report for toxic:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     89186
           0       0.43      0.97      0.60     57888
           1       0.17      0.67      0.27      6090

    accuracy                           0.39    153164
   macro avg       0.20      0.55      0.29    153164
weighted avg       0.17      0.39      0.24    153164


Classification Report for severe_toxic:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     89186
           0       0.42      1.00      0.59     63611
           1       0.10      0.29      0.14       367

    accuracy                           0.41    153164
   macro avg       0.17      0.43      0.24    153164
weighted avg       0.17      0.41      0.24    153164


Classification Report for obscene:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     89186
           0       0.43      0.99      0.60     60287
           1       0.17      0.60      0.26      3691

    accuracy                           0.40    153164
   macro avg       0.20      0.53      0.29    153164
weighted avg       0.17      0.40      0.24    153164


Classification Report for threat:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     89186
           0       0.42      1.00      0.59     63767
           1       0.23      0.23      0.23       211

    accuracy                           0.42    153164
   macro avg       0.22      0.41      0.27    153164
weighted avg       0.17      0.42      0.25    153164


Classification Report for insult:


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     89186
           0       0.42      0.99      0.59     60551
           1       0.17      0.50      0.25      3427

    accuracy                           0.40    153164
   macro avg       0.20      0.50      0.28    153164
weighted avg       0.17      0.40      0.24    153164


Classification Report for identity_hate:
              precision    recall  f1-score   support

          -1       0.00      0.00      0.00     89186
           0       0.41      1.00      0.59     63266
           1       0.18      0.22      0.20       712

    accuracy                           0.41    153164
   macro avg       0.20      0.41      0.26    153164
weighted avg       0.17      0.41      0.24    153164



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
