# Import Packages

In [None]:
#Author Pragati Shinde

In [1]:
import itertools
import pickle
import re

import joblib
import numpy as np
import pandas as pd
# import and instantiate TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Read new data from Slack

In [2]:
new_data = pd.read_csv('test.csv') # here use Slack data in CSV format
# Sample data: predict for 1 comment (delete after demo)
# .copy() makes an independent frame, so the later assignment to
# new_data['comment_text'] does not trigger pandas' SettingWithCopyWarning.
new_data = new_data.head(1).copy()
new_data

Unnamed: 0,id,comment_text
0,00001cee341fdb12,Yo bitch Ja Rule is more succesful then you'll...


# Step 2: Clean the comment text (expand contractions, strip punctuation)

In [3]:
def remove_stop_words(text):
    """Normalize a raw comment before it is vectorized.

    Despite its name, this function does not remove stop words. It
    lower-cases the text, expands common English contractions, replaces
    every non-word character with a space, collapses runs of whitespace
    into a single space, and strips leading/trailing spaces.

    Args:
        text: The raw comment. ``None`` and NaN (what pandas yields for an
            empty CSV cell) are treated as an empty comment; any other
            non-string value is converted with ``str()``.

    Returns:
        str: The cleaned, lower-cased text with single spaces between tokens.
    """
    # Missing values have no .lower(); map them to an empty comment instead
    # of raising AttributeError in the middle of a .map() call.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    text = str(text).lower()

    # Order matters: a specific rule must run before the generic rule that
    # would otherwise consume part of its match ("what's" and "'scuse"
    # before "'s", "can't" before "n't").
    # NOTE(review): "'scuse" used to be listed after "'s" and therefore never
    # matched; confirm the training-time cleaning is updated the same way.
    contraction_rules = (
        (r"what's", "what is "),
        (r"\'scuse", " excuse "),
        (r"\'s", " "),
        (r"\'ve", " have "),
        (r"can't", "cannot "),
        (r"n't", " not "),
        (r"i'm", "i am "),
        (r"\'re", " are "),
        (r"\'d", " would "),
        (r"\'ll", " will "),
    )
    for pattern, replacement in contraction_rules:
        text = re.sub(pattern, replacement, text)

    # Raw strings avoid the invalid-escape-sequence warning for \W and \s.
    text = re.sub(r'\W', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip(' ')

In [4]:
# Replace each raw comment with its normalized form
new_data['comment_text'] = new_data['comment_text'].map(remove_stop_words)

In [5]:
# Series of cleaned comments that will be vectorized and scored
new_comment = new_data['comment_text']
print('Comment is:', new_comment)


Comment is: 0    yo bitch ja rule is more succesful then you wi...
Name: comment_text, dtype: object


# Step 3: Load the fitted TF-IDF vectorizer and transform the comment

In [6]:
# Load the TF-IDF vectorizer that was fitted on the training data.
# NOTE(review): presumably fitted on text cleaned the same way as in Step 2 - confirm.
tfidf_vect = joblib.load('tfidf.pkl')


In [7]:
# Transform the cleaned comment(s) into TF-IDF features with the loaded vectorizer
new_comment_tfidf = tfidf_vect.transform(new_comment)
# Display the resulting feature matrix
new_comment_tfidf

<1x5000 sparse matrix of type '<class 'numpy.float64'>'
	with 25 stored elements in Compressed Sparse Row format>

# Step 4: Load Trained Model & Predict

In [8]:
# Labels to score, in the order their predictions are collected
fixed_colms = [
    'obscene',
    'insult',
    'toxic',
    'severe_toxic',
    'identity_hate',
    'threat',
]
# One pickled classifier per label, stored as '<label>_model.pkl'
model_list = ['{}_model.pkl'.format(label_name) for label_name in fixed_colms]
# Filled with one probability array per model by the prediction loop
predictions_6 = []


In [9]:
# check sample loaded model
# NOTE(review): pickle.load can execute arbitrary code - only load model files
# that come from a trusted source.
# The with-block closes the file handle as soon as the model is loaded.
with open('obscene_model.pkl', 'rb') as model_file:
    loaded_model = pickle.load(model_file)
loaded_model

LogisticRegression(C=12.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [10]:
# Make 6 predictions: one positive-class probability array per label model.
# Reset the accumulator here so re-running this cell does not append a
# second set of predictions to the previous ones.
predictions_6 = []
for model in model_list:
    print('Lets predict for {} model'.format(model))
    # loading model; the with-block closes the file handle after loading
    # NOTE(review): pickle.load can execute arbitrary code - only load model
    # files that come from a trusted source.
    with open(model, 'rb') as model_file:
        loaded_model = pickle.load(model_file)
    # predict: column 1 of predict_proba is the probability of the positive class
    predict = loaded_model.predict_proba(new_comment_tfidf)[:,1]
    predictions_6.append(predict)

print('Final predictions:', predictions_6)


Lets predict for obscene_model.pkl model
Lets predict for insult_model.pkl model
Lets predict for toxic_model.pkl model
Lets predict for severe_toxic_model.pkl model
Lets predict for identity_hate_model.pkl model
Lets predict for threat_model.pkl model
Final predictions: [array([0.99996524]), array([0.97225816]), array([0.99995763]), array([0.46324992]), array([0.4407759]), array([0.05012353])]


# Step 5: Check Predictions

In [14]:
# Show the cleaned comment text that the predictions above were computed from
new_data['comment_text']

0    yo bitch ja rule is more succesful then you wi...
Name: comment_text, dtype: object

In [11]:
# List the label names. fixed_colms is in the same order as model_list, so this
# is also the order of the entries collected in predictions_6.
for label in fixed_colms:
    print(label)

obscene
insult
toxic
severe_toxic
identity_hate
threat


In [12]:
# Flatten the per-model probability arrays into one flat list of scores.
# np.atleast_1d wraps bare scalars in a 1-element array, so re-running this
# cell on the already-flattened list yields the same list instead of raising
# "TypeError: 'numpy.float64' object is not iterable".
# itertools is imported in the notebook's top import cell.
predictions_6 = list(itertools.chain(*map(np.atleast_1d, predictions_6)))

for pred in predictions_6:
    print(pred)

0.9999652443509109
0.9722581621919304
0.9999576268583528
0.4632499178897188
0.4407759020418848
0.05012352988532971
