In [2]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LinearRegression
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import f1_score, roc_auc_score

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from joblib import dump, load


f_path = "/kaggle/input/d/miracl/kmaml223/"

In [3]:
train = pd.read_csv(f_path + "train.csv")
print(f"Train shape: {train.shape}")
print(f"Train columns: {train.columns}")
test = pd.read_csv(f_path + "test.csv")
print(f"Test shape: {test.shape}")
print(f"Test columns: {test.columns}")

Train shape: (159571, 8)
Train columns: Index(['id', 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat',
       'insult', 'identity_hate'],
      dtype='object')
Test shape: (63978, 3)
Test columns: Index(['Unnamed: 0', 'id', 'comment_text'], dtype='object')


In [4]:
with_labels = train[(train['toxic'] == 1) | (train['severe_toxic'] == 1) | (train['obscene'] == 1) | (train['threat'] == 1) | (train['insult'] == 1) | (train['identity_hate'] == 1)]
sample_without_labels = train[(train['toxic'] == 0) & (train['severe_toxic'] == 0) & (train['obscene'] == 0) & (train['threat'] == 0) & (train['insult'] == 0) & (train['identity_hate'] == 0)].sample(10000)
train_sample = pd.concat([with_labels, sample_without_labels]).sample(frac=1)
train_sample.shape

(26225, 8)

In [5]:
train_sample.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
55127,93466b5c40415017,"""\n\nHello there my name is and i was wonderin...",0,0,0,0,0,0
17934,2f5a80474f66d0cc,What?? Your told me I could argue that the boo...,0,0,0,0,0,0
100604,1a7fe52027af85c8,"Oh, guess what? This is all fucking pointless....",1,0,1,0,0,0
109461,4973127e15435e24,here's your fucking citation \n\nhttps://mobi...,1,0,1,0,0,0
24791,419638cd6fa729bf,"OMG WTF \n\nTook MY NAME OUT, WTH??!?!?!? ......",1,0,1,0,1,0


In [6]:
train_texts = [text for text in train_sample['comment_text']]

In [7]:
def clean_text(text):
    cleaned_text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return cleaned_text

In [8]:
cleaned_train_texts = [clean_text(text) for text in train_texts]

In [9]:
def tokenize_text(text):
    words = word_tokenize(text)
    words = [word.lower() for word in words if word.isalpha() and word.lower() not in stop_words]
    return ' '.join(words)

In [10]:
stop_words = set(stopwords.words('english'))

In [11]:
tokenized_train_texts = [tokenize_text(text) for text in cleaned_train_texts]

In [12]:
vectorizer = TfidfVectorizer() 
X_vectorized = vectorizer.fit_transform(tokenized_train_texts)

train_val_split = 0.9
split = int(X_vectorized.shape[0]*train_val_split)

X_train = X_vectorized[:split, :]
X_test = X_vectorized[split:, :]

In [13]:
Y_train = train_sample[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']][:split]
Y_test = train_sample[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']][split:]

In [14]:
Y_train.head()

Unnamed: 0,toxic,severe_toxic,obscene,threat,insult,identity_hate
55127,0,0,0,0,0,0
17934,0,0,0,0,0,0
100604,1,0,1,0,0,0
109461,1,0,1,0,0,0
24791,1,0,1,0,1,0


In [15]:
log_models = []
for col in Y_train.columns:
    model = LogisticRegression(max_iter = 5000)
    model.fit(X_train, Y_train[col])
    log_models.append(model)
    Y_pred = model.predict(X_test)

    roc_auc = roc_auc_score(Y_test[col], Y_pred)
    print(col)
    print(f"Roc auc score: {roc_auc}")

toxic
Roc auc score: 0.8664189318415453
severe_toxic
Roc auc score: 0.5809012295635726
obscene
Roc auc score: 0.8187356284371211
threat
Roc auc score: 0.5375
insult
Roc auc score: 0.7421141469493666
identity_hate
Roc auc score: 0.5871866436618289


In [16]:
log_models = []
log_rocs = []
for col in Y_train.columns:
    best_model = None
    best_roc_auc = 0
    for C in [10**i for i in range(-1, 4)]:
        model = LogisticRegression(max_iter = 1000, C = C)
        model.fit(X_train, Y_train[col])
        Y_pred = model.predict(X_test)

        roc_auc = roc_auc_score(Y_test[col], Y_pred)

        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_model = model

        print(f"{col}, C: {C}")
        print(f"Roc auc score: {roc_auc}")
    log_models.append(best_model)
    log_rocs.append(best_roc_auc)
print("\n" + str(np.mean(log_rocs)))

toxic, C: 0.1
Roc auc score: 0.8138229458031508
toxic, C: 1
Roc auc score: 0.8664189318415453
toxic, C: 10
Roc auc score: 0.8680326448157977
toxic, C: 100
Roc auc score: 0.8563968777642306
toxic, C: 1000
Roc auc score: 0.8364959119271319
severe_toxic, C: 0.1
Roc auc score: 0.5248303243844387
severe_toxic, C: 1
Roc auc score: 0.5809012295635726
severe_toxic, C: 10
Roc auc score: 0.6049208526914244
severe_toxic, C: 100
Roc auc score: 0.6082909793894796
severe_toxic, C: 1000
Roc auc score: 0.5863877542535831
obscene, C: 0.1
Roc auc score: 0.7386059736806005
obscene, C: 1
Roc auc score: 0.8187356284371211
obscene, C: 10
Roc auc score: 0.8391906153100183
obscene, C: 100
Roc auc score: 0.8231898791600284
obscene, C: 1000
Roc auc score: 0.7941982009146187
threat, C: 0.1
Roc auc score: 0.5
threat, C: 1
Roc auc score: 0.5375
threat, C: 10
Roc auc score: 0.5996128532713899
threat, C: 100
Roc auc score: 0.61211285327139
threat, C: 1000
Roc auc score: 0.5986449864498645
insult, C: 0.1
Roc auc scor

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


insult, C: 1000
Roc auc score: 0.7003158893555032
identity_hate, C: 0.1
Roc auc score: 0.5169491525423728
identity_hate, C: 1
Roc auc score: 0.5871866436618289
identity_hate, C: 10
Roc auc score: 0.6231266280997327
identity_hate, C: 100
Roc auc score: 0.6180909367705267
identity_hate, C: 1000
Roc auc score: 0.6106600358604825

0.7167578479276006


In [17]:
for model, col in zip(log_models, Y_train.columns):
    dump(model, f'LR_{col}.joblib')

In [18]:
svc_models = []
svc_rocs = []
for col in Y_train.columns:
    best_model = None
    best_roc_auc = 0
    for C in [10**i for i in range(-1, 4)]:
        model = SVC(kernel='linear', max_iter = 1000, C = C)
        model.fit(X_train, Y_train[col])
        Y_pred = model.predict(X_test)

        roc_auc = roc_auc_score(Y_test[col], Y_pred)

        if roc_auc > best_roc_auc:
            best_roc_auc = roc_auc
            best_model = model

        print(f"{col}, C: {C}")
        print(f"Roc auc score: {roc_auc}")
    svc_models.append(best_model)
    svc_rocs.append(best_roc_auc)
print("\n" + str(np.mean(svc_rocs)))



toxic, C: 0.1
Roc auc score: 0.6941036720935235




toxic, C: 1
Roc auc score: 0.788559372684023




toxic, C: 10
Roc auc score: 0.7649543378995434




toxic, C: 100
Roc auc score: 0.73865828731263




toxic, C: 1000
Roc auc score: 0.7312615051758349




severe_toxic, C: 0.1
Roc auc score: 0.7754955151590741




severe_toxic, C: 1
Roc auc score: 0.6589792959371396




severe_toxic, C: 10
Roc auc score: 0.615045523993639




severe_toxic, C: 100
Roc auc score: 0.5600776402357269




severe_toxic, C: 1000
Roc auc score: 0.6469460987600428




obscene, C: 0.1
Roc auc score: 0.7501052612992912




obscene, C: 1
Roc auc score: 0.7976708051334918




obscene, C: 10
Roc auc score: 0.7600789391834167




obscene, C: 100
Roc auc score: 0.7389180225001121




obscene, C: 1000
Roc auc score: 0.6896604881679509




threat, C: 0.1
Roc auc score: 0.5




threat, C: 1
Roc auc score: 0.5742257065427797




threat, C: 10
Roc auc score: 0.6980642663569492




threat, C: 100
Roc auc score: 0.6465156794425087




threat, C: 1000
Roc auc score: 0.6703542392566783




insult, C: 0.1
Roc auc score: 0.6834989245109085




insult, C: 1
Roc auc score: 0.6957166086243982




insult, C: 10
Roc auc score: 0.6300246395119897




insult, C: 100
Roc auc score: 0.6276403541716459




insult, C: 1000
Roc auc score: 0.588065983247408




identity_hate, C: 0.1
Roc auc score: 0.7725954869921176




identity_hate, C: 1
Roc auc score: 0.6578690077472176




identity_hate, C: 10
Roc auc score: 0.6754000473629013




identity_hate, C: 100
Roc auc score: 0.6510199939104842




identity_hate, C: 1000
Roc auc score: 0.6509692479447884

0.754683675825009


In [19]:
for model, col in zip(svc_models, Y_train.columns):
    dump(model, f'SVC_{col}.joblib')

In [20]:
import os
log_models = {}
for file in os.listdir('/kaggle/working/'):
  if "SVC" in file:
    log_models[file.split('.')[0][4:]] = load('/kaggle/working/' + file)

In [21]:
print(log_models)

{'toxic': SVC(C=1, kernel='linear', max_iter=1000), 'threat': SVC(C=10, kernel='linear', max_iter=1000), 'identity_hate': SVC(C=0.1, kernel='linear', max_iter=1000), 'obscene': SVC(C=1, kernel='linear', max_iter=1000), 'insult': SVC(C=1, kernel='linear', max_iter=1000), 'severe_toxic': SVC(C=0.1, kernel='linear', max_iter=1000)}


In [22]:
vocabulary_dict = vectorizer.vocabulary_
vectorizer2 = TfidfVectorizer(vocabulary=vocabulary_dict)

In [23]:
test_texts = [text for text in test['comment_text']]
cleaned_test_texts = [clean_text(text) for text in test_texts]
tokenized_test_texts = [tokenize_text(text) for text in cleaned_test_texts]
test_vector = vectorizer2.fit_transform(tokenized_test_texts)

In [24]:
sb = pd.read_csv(f_path + "sample_submission.csv")
print(f"Test shape: {sb.shape}")
print(f"Test columns: {sb.columns}")

Test shape: (63978, 7)
Test columns: Index(['id', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult',
       'identity_hate'],
      dtype='object')


In [25]:
for key, model in log_models.items():
    sb[key] = model.predict(test_vector)
sb.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0001ea8717f6de06,0,0,0,0,0,0
1,000247e83dcc1211,1,0,0,0,0,0
2,0002f87b16116a7f,0,0,0,0,0,0
3,0003e1cccfd5a40a,0,0,0,0,0,0
4,00059ace3e3e9a53,0,0,0,0,0,0


In [26]:
sb.to_csv("/kaggle/working/submit.csv", index = False)

In [27]:
list(vocabulary_dict.keys())

['hello',
 'name',
 'wondering',
 'would',
 'kind',
 'enough',
 'unblock',
 'know',
 'must',
 'hear',
 'cliche',
 'im',
 'sorry',
 'wont',
 'routine',
 'alot',
 'repeat',
 'really',
 'hope',
 'automatically',
 'become',
 'best',
 'friend',
 'thankssssss',
 'told',
 'could',
 'argue',
 'book',
 'reliable',
 'source',
 'selfpublished',
 'nonacademic',
 'reverted',
 'explanation',
 'woodzing',
 'restored',
 'reference',
 'think',
 'violation',
 'editing',
 'restrictions',
 'anyway',
 'report',
 'added',
 'information',
 'scholarly',
 'greatly',
 'improved',
 'article',
 'oh',
 'guess',
 'fucking',
 'pointless',
 'convicted',
 'life',
 'without',
 'parole',
 'took',
 'seconds',
 'find',
 'whining',
 'bitching',
 'teeth',
 'gnashing',
 'blp',
 'never',
 'bothered',
 'look',
 'see',
 'already',
 'gone',
 'trial',
 'sentence',
 'please',
 'put',
 'bullshit',
 'rest',
 'heres',
 'citation',
 'happy',
 'omg',
 'wtf',
 'wth',
 'guys',
 'suck',
 'appropriate',
 'talk',
 'dipshit',
 'come',
 'hous

In [28]:
with open("vocabulary.txt", "w") as f:
  f.write(str(list(vocabulary_dict.keys())))