# Tweet Insult Predictor

# Load Data

In [32]:
import pandas as pd

In [33]:
# Training data for the insult classifier, read from the project's data folder.
train = pd.read_csv(filepath_or_buffer='../data/insult_train.csv')

In [9]:
# Eyeball ten randomly chosen rows of the training frame.
train.sample(n=10)

Unnamed: 0,Insult,Date,Comment
472,0,20120529223518Z,"""Just like Huff Bochy thinks Theriot a player..."
3125,0,20120619001525Z,"""I hope they hurt that fucking pig."""
2992,0,,"""MASSIVE PISSTAK*"""
3055,0,20120529031217Z,"""Animal Planet told that Mermaids are real."""
2374,0,20120619040445Z,"""Franklin street #fuckwitme"""
809,0,20120618215525Z,"""May he eat shit and <i>live</i>."""
1496,1,20120612123016Z,"""Maybe you should read that\xa0first sentence ..."
2875,0,20120529034509Z,"""A few Thunder fans here don't seem to want to..."
3843,0,20120619184505Z,"""LeBron, no matter who says what, you are an i..."
1504,0,,"""I\\'m single, and but when I haven\\'t been, ..."


# Train Pipeline

In [14]:
import re

import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline

# Fixed seed so SGD's shuffling (and therefore the fitted model and the
# grid-search ranking) is the same on every Restart & Run All.
SEED = 42

# The logistic-regression loss was renamed from 'log' to 'log_loss' in
# scikit-learn 1.1 and the old spelling was removed in 1.3. Pick whichever
# name the installed version accepts so the cell runs on both.
_sk_major_minor = tuple(
    int(part) for part in re.match(r'(\d+)\.(\d+)', sklearn.__version__).groups()
)
LOG_LOSS = 'log_loss' if _sk_major_minor >= (1, 1) else 'log'

# Bag-of-words counts fed into a linear classifier trained with SGD.
# A logistic loss is used so the fitted model exposes predict_proba.
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('clf', SGDClassifier(loss=LOG_LOSS, max_iter=1000, tol=1e-3,
                          random_state=SEED)),
    ])

In [19]:
import numpy as np

# Hyper-parameter grid for the search. Keys use the
# '<pipeline step>__<parameter name>' form: 'vect' is the vectorizer step and
# 'clf' the classifier step. 3 x 3 x 3 = 27 combinations in total.
parameters = dict(
    vect__min_df=(5, 10, 15),
    vect__max_df=(0.5, 0.75, 1.0),
    # three log-spaced values: 1e-5, 1e-1, 1e3
    clf__alpha=np.logspace(start=-5, stop=3, num=3),
)

In [20]:
from sklearn.model_selection import GridSearchCV

# Exhaustive search over `parameters` with 5-fold cross-validation, using all
# available cores; training-fold scores are recorded alongside validation ones.
search_options = dict(cv=5, n_jobs=-1, verbose=1, return_train_score=True)
grid_search = GridSearchCV(estimator=pipeline, param_grid=parameters,
                           **search_options)

# Raw comment text is the input, the Insult column is the target.
grid_search.fit(train['Comment'], train['Insult'])

Fitting 5 folds for each of 27 candidates, totalling 135 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    3.0s
[Parallel(n_jobs=-1)]: Done 135 out of 135 | elapsed:    6.4s finished


GridSearchCV(cv=5, error_score='raise-deprecating',
       estimator=Pipeline(memory=None,
     steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip..._state=None, shuffle=True, tol=0.001,
       validation_fraction=0.1, verbose=0, warm_start=False))]),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'vect__min_df': (5, 10, 15), 'vect__max_df': (0.5, 0.75, 1.0), 'clf__alpha': array([1.e-05, 1.e-01, 1.e+03])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring=None, verbose=1)

In [None]:
# save model
import os

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23; prefer the standalone package and fall back to the bundled copy only on
# old installations that lack it.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# joblib.dump does not create missing parent folders.
os.makedirs('../output', exist_ok=True)

# Persist the pipeline refit with the best parameter combination.
joblib.dump(grid_search.best_estimator_, '../output/model.pkl')

# Using a model

In [23]:
# load model
# Import joblib here so this section runs even if the training/saving cells
# above were skipped. The standalone package is preferred because
# `sklearn.externals.joblib` was removed in scikit-learn 0.23.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# NOTE: joblib files are pickles; only load files you created or trust.
model = joblib.load('../output/model.pkl')

## Predict on the test set

In [31]:
# Held-out comments with their true labels
test = pd.read_csv('../data/insult_test.csv')
test_comments = test['Comment']

# Class probabilities for every comment; keep the second column, which is the
# probability of the insult class
preds = model.predict_proba(test_comments)
test['Insult_proba'] = preds[:, 1]

# Score of the loaded pipeline against the true test labels
test_score = model.score(test_comments, test['Insult'])
print(f'Score on test set: {test_score:.2f}')

# Write the test frame, now including the predicted probabilities, to disk
test.to_csv('../output/insult_test_pred.csv')

Score on test set: 0.81


## Get word sentiment

In [25]:
# All words in the vectorizer's vocabulary, in feature order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2,
# so use get_feature_names_out() whenever the installed version provides it.
vectorizer = model.named_steps['vect']
if hasattr(vectorizer, 'get_feature_names_out'):
    words = vectorizer.get_feature_names_out()
else:
    words = vectorizer.get_feature_names()

# Coefficient per word, flattened to 1-D so it lines up with `words`.
coefs = model.named_steps['clf'].coef_.reshape(-1)

# Sentiment is the negated coefficient: the more a word pushes a comment
# towards the insult class, the lower its sentiment.
word_sentiment = pd.DataFrame({'sentiment': -coefs}, index=words)
word_sentiment.to_csv('../output/word_sentiment.csv')

# Ten most negative words.
word_sentiment.sort_values(by='sentiment').head(10)

Unnamed: 0,sentiment
asshole,-33.527743
dumb,-33.516357
faggot,-32.661423
moron,-29.96059
loser,-28.423332
sister,-28.201897
moronic,-26.717284
idiot,-25.894837
greedy,-24.539291
retards,-24.124035


## Sentence predictor

In [34]:
sentence = 'Your mother was a hamster and your father smelt like elderberries'

# The pipeline scores batches of texts, so wrap the single sentence in a list.
sen_list = [sentence]
probs = model.predict_proba(sen_list)

# Only one row came back; its second entry is the insult probability.
first_row = probs[0]
insult_prob = first_row[1]
print(f'Insult probability: {insult_prob:.2f}')

Insult probability: 1.00
