# Tweet Insult Predictor

# Load Data

In [None]:
import pandas as pd

In [None]:
# Read the training CSV into a DataFrame; its `Comment` and `Insult`
# columns are what the grid search is fitted on further down.
train_csv = './data/insult_train.csv'
train = pd.read_csv(train_csv)

In [None]:
# Eyeball ten randomly drawn rows of the training data.
train.sample(n=10)

# Train Pipeline

In [None]:
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# scikit-learn 1.1 renamed SGDClassifier's logistic loss from 'log' to
# 'log_loss' and 1.3 dropped the old spelling, so pick whichever name the
# installed version understands.
sklearn_version = tuple(int(part) for part in sklearn.__version__.split('.')[:2])
logistic_loss = 'log_loss' if sklearn_version >= (1, 1) else 'log'

# Bag-of-words counts ('vect') feeding a linear classifier trained with
# SGD ('clf'). predict_proba, used further down, needs a probabilistic loss
# such as this one. random_state pins the SGD shuffling so that
# Restart & Run All reproduces the same model.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(loss=logistic_loss, max_iter=1000, tol=1e-3,
                          random_state=42)),
    ])

In [None]:
import numpy as np

# Hyper-parameter grid for the search. The `<step>__<param>` keys address
# parameters of the named pipeline steps ('vect' and 'clf').
parameters = dict(
    # candidate lower document-frequency cut-offs (absolute counts)
    vect__min_df=(5, 10, 15),
    # candidate upper document-frequency cut-offs (fractions of documents)
    vect__max_df=(0.5, 0.75, 1.0),
    # three regularisation strengths spaced evenly on a log scale, 1e-5 .. 1e3
    clf__alpha=np.logspace(start=-5, stop=3, num=3),
)

In [None]:
from sklearn.model_selection import GridSearchCV

# 5-fold cross-validated search over every combination in `parameters`,
# run on all available cores (n_jobs=-1) and also recording the scores
# obtained on the training folds.
search_options = dict(cv=5, n_jobs=-1, verbose=1, return_train_score=True)
grid_search = GridSearchCV(pipeline, parameters, **search_options)

# Raw comments are the input, the Insult column is the target.
grid_search.fit(train['Comment'], train['Insult'])

In [22]:
# save model
# joblib is imported directly: the sklearn.externals.joblib shim was
# deprecated in scikit-learn 0.21 and removed in 0.23.
import os

import joblib

# make sure the target folder exists so the dump cannot fail on a fresh checkout
os.makedirs('./output', exist_ok=True)
# persist only the best pipeline found by the grid search
joblib.dump(grid_search.best_estimator_, './output/model.pkl')

['./output/model.pkl']

# Using a model

In [23]:
# Restore the pipeline that was written to disk by the save step.
model_path = './output/model.pkl'
model = joblib.load(model_path)

## Predict on the test set

In [24]:
# Load the test set.
test = pd.read_csv('./data/insult_test.csv')
test_comments = test['Comment']

# One row of class probabilities per comment; the second column (index 1)
# is the probability of the insult class.
preds = model.predict_proba(test_comments)
test['Insult_proba'] = preds[:, 1]

# Score the pipeline against the labels in the Insult column and report it.
test_score = model.score(test_comments, test['Insult'])
print(f'Score on test set: {test_score:.2f}')

# Write the test frame, including the new probability column, to disk.
test.to_csv('./output/insult_test_pred.csv')


Score on test set: 0.81


## Get word sentiment

In [None]:
vectorizer = model.named_steps['vect']
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer its replacement and fall back only on older installs.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()  # all words
else:
    words = vectorizer.get_feature_names()  # all words
coefs = model.named_steps['clf'].coef_.reshape(-1)  # coefficient per word
# sentiment is low when the insult coefficient is high, hence the sign flip
word_sentiment = pd.DataFrame({'sentiment': -coefs}, index=words)
word_sentiment.to_csv('./output/word_sentiment.csv')
# show the ten words with the lowest sentiment (strongest insult weight)
word_sentiment.sort_values(by='sentiment').head(10)

## Sentence predictor

In [None]:
# Try the model on a single hand-written sentence.
sentence = 'Your mother was a hamster and your father smelt like elderberries'

# predict_proba works on a batch, so wrap the lone sentence in a list.
sen_list = [sentence]
probs = model.predict_proba(sen_list)

# First (and only) row, second column -> probability of the insult class.
insult_prob = probs[0][1]
print(f'Insult probability: {insult_prob:.2f}')