# Step1_Train

### Import Section

In [1]:
import numpy as np
import pandas as pd


from sklearn.model_selection import train_test_split

from sklearn.base import BaseEstimator

from sklearn.pipeline import Pipeline

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import SGDClassifier

import dill

### Global Settings Section

In [2]:
# Single seed reused by the train/test split and the SGD classifier below,
# so that re-running the notebook reproduces the same split and model.
random_state_global = 0

### Path Section

In [3]:
# Input CSV that is loaded into `df_data` (resolved relative to the working directory).
PATH_DATA = r'train.csv'
# Output file that the model dictionary is serialized to with dill at the end of the notebook.
PATH_MODEL = r'model.dill'

## Model Building

In [4]:
# Load the training data from PATH_DATA into a DataFrame.
df_data = pd.read_csv(PATH_DATA)

In [5]:
# Preview the first rows to check that the expected columns were loaded.
df_data.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [6]:
# Features: keep a one-column DataFrame so the pipeline's feature selector
# can pick the text column by name.
X = df_data[['comment_text']]

# Target: collapse the six label columns into a single one by taking the
# row-wise maximum (with 0/1 flags this means "any label set" -- the preview
# above only shows zeros, so confirm the columns are binary).
y = df_data[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].max(axis=1)

# `shuffle` is a boolean switch; the original passed the label Series to it
# (`shuffle=y`), which older scikit-learn silently treated as "shuffle, no
# stratification" and newer releases reject during parameter validation.
# The labels belong in `stratify`, which keeps the class ratio of `y` the same
# in the train and test parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=random_state_global,
    shuffle=True,
    stratify=y,
)

In [7]:
class FeatureSelector(BaseEstimator):
    """Stateless pipeline step that selects data from ``X`` by key.

    Parameters
    ----------
    list_features : str or list of str
        Key(s) used to index the input in ``transform``. The value is stored
        unchanged under the same attribute name.
    """

    def __init__(self, list_features):
        self.list_features = list_features

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the selector itself."""
        return self

    def transform(self, X, y=None):
        """Return ``X`` indexed by ``self.list_features``; ``y`` is ignored."""
        selected = X[self.list_features]
        return selected

In [8]:
# Text-classification pipeline: select the text column -> TF-IDF -> linear model.
pipeline_model = Pipeline(steps=[
    # Pull the raw comment text out of the input frame by column name.
    ('feature_selector', FeatureSelector(list_features='comment_text')),
    # Turn the text into TF-IDF features, capped at 5000 vocabulary terms.
    ('vectorizer', TfidfVectorizer(
        max_features=5000,
        stop_words='english',
        strip_accents='unicode',
    )),
    # Linear classifier trained with SGD; seeded for reproducibility.
    ('classifier', SGDClassifier(random_state=random_state_global)),
])

In [9]:
# Bundle the pipeline with a tag string so both are saved together.
# NOTE(review): 'var' presumably stands for the model version -- confirm with
# whatever code loads this dictionary before renaming the key.
dict_model = dict(model=pipeline_model, var='1.00')

dict_model

{'model': Pipeline(steps=[('feature_selector',
                  FeatureSelector(list_features='comment_text')),
                 ('vectorizer',
                  TfidfVectorizer(max_features=5000, stop_words='english',
                                  strip_accents='unicode')),
                 ('classifier', SGDClassifier(random_state=0))]),
 'var': '1.00'}

In [10]:
%%time

dict_model['model'].fit(X_train, y_train)

Wall time: 4.16 s


Pipeline(steps=[('feature_selector',
                 FeatureSelector(list_features='comment_text')),
                ('vectorizer',
                 TfidfVectorizer(max_features=5000, stop_words='english',
                                 strip_accents='unicode')),
                ('classifier', SGDClassifier(random_state=0))])

In [11]:
# Serialize the model dictionary (pipeline + tag) to PATH_MODEL with dill;
# the file is opened in binary write mode and closed by the context manager.
with open(PATH_MODEL, 'wb') as model_file:
    dill.dump(dict_model, model_file)