## Imports

In [1]:
import pandas as pd
import dill
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

## Reading the cleaned dataset

In [2]:
# Load the processed comments table from the CSV under ../data.
data = pd.read_csv(filepath_or_buffer='../data/data_processed.csv')

## Vectorize and save TF-IDF vectorizer

In [3]:
# TF-IDF featurizer: English stop words are removed and the vocabulary
# is limited to at most 4096 features.
vectorizer = TfidfVectorizer(
    stop_words='english',
    max_features=4096,
)

In [4]:
# Summarize column dtypes and non-null counts to spot missing values
# before the text is vectorized.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 7 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   comment_text   159564 non-null  object
 1   toxic          159571 non-null  int64 
 2   severe_toxic   159571 non-null  int64 
 3   obscene        159571 non-null  int64 
 4   threat         159571 non-null  int64 
 5   insult         159571 non-null  int64 
 6   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(1)
memory usage: 8.5+ MB


In [5]:
# Discard every row that has a missing value in any column.
data = data.dropna(axis=0, how='any')

In [6]:
# The comment text is the model input; every remaining column is kept
# as a separate frame of target labels.
X = data.loc[:, 'comment_text']
targets = data.drop(columns=['comment_text'])

In [7]:
# Fit the vectorizer on the comments and encode them in one pass; the
# trailing bare name shows the resulting sparse matrix summary.
tfidf_data = vectorizer.fit_transform(raw_documents=X)
tfidf_data

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 2982063 stored elements and shape (159564, 4096)>

In [8]:
# Serialize the fitted vectorizer to disk with dill.
with open('../models/tf-idf_vectorizer.pkl', mode='wb') as vectorizer_file:
    dill.dump(vectorizer, vectorizer_file)

## Model training

In [9]:
# Fit one independent logistic-regression classifier per label column,
# all sharing the same TF-IDF feature matrix, and keep them keyed by label.
models = {}

for target in targets.columns:
    print(f'\nTraining model for: {target}')
    y = data[target]
    model = LogisticRegression(C=1.6, max_iter=500, class_weight='balanced')
    # fit() returns the estimator itself, so the stored object is `model`.
    models[target] = model.fit(tfidf_data, y)
    # Scored on the same rows the model was fitted on (no held-out split).
    y_pred = model.predict(tfidf_data)
    print(f'Training Accuracy: {accuracy_score(y, y_pred)}')


Training model for: toxic
Training Accuracy: 0.9278283322052593

Training model for: severe_toxic
Training Accuracy: 0.9689528966433532

Training model for: obscene
Training Accuracy: 0.9638013587024642

Training model for: threat
Training Accuracy: 0.9857925346569402

Training model for: insult
Training Accuracy: 0.9467047704996114

Training model for: identity_hate
Training Accuracy: 0.9599345717079041


In [10]:
# Display the label -> fitted classifier mapping built by the training loop.
models

{'toxic': LogisticRegression(C=1.6, class_weight='balanced', max_iter=500),
 'severe_toxic': LogisticRegression(C=1.6, class_weight='balanced', max_iter=500),
 'obscene': LogisticRegression(C=1.6, class_weight='balanced', max_iter=500),
 'threat': LogisticRegression(C=1.6, class_weight='balanced', max_iter=500),
 'insult': LogisticRegression(C=1.6, class_weight='balanced', max_iter=500),
 'identity_hate': LogisticRegression(C=1.6, class_weight='balanced', max_iter=500)}

## Save the classifier models

In [11]:
# Serialize the whole per-label model dictionary into a single dill file.
with open('../models/classifier.pkl', mode='wb') as classifier_file:
    dill.dump(models, classifier_file)