## This notebook is used to generate the finalized version of the classifier, to simply feature transformation into the final form, and to test that the results are the same

Most of the code comes from operational_classifier.

In [1]:
import pandas as pd
import numpy as np
import pickle
import sys
#reload(sys)
#sys.setdefaultencoding("utf-8")

#Loading raw data
df = pd.read_pickle("../data/labeled_data.p")
tweets = df.tweet

In [2]:
df

Unnamed: 0,count,hate_speech,offensive_language,neither,class,tweet
0,3,0,0,3,2,!!! RT @mayasolovely: As a woman you shouldn't...
1,3,0,3,0,1,!!!!! RT @mleew17: boy dats cold...tyga dwn ba...
2,3,0,3,0,1,!!!!!!! RT @UrKindOfBrand Dawg!!!! RT @80sbaby...
3,3,0,2,1,1,!!!!!!!!! RT @C_G_Anderson: @viva_based she lo...
4,6,0,6,0,1,!!!!!!!!!!!!! RT @ShenikaRoberts: The shit you...
5,3,1,2,0,1,"!!!!!!!!!!!!!!!!!!""@T_Madison_x: The shit just..."
6,3,0,3,0,1,"!!!!!!""@__BrighterDays: I can not just sit up ..."
7,3,0,3,0,1,!!!!&#8220;@selfiequeenbri: cause I'm tired of...
8,3,0,3,0,1,""" &amp; you might not get ya bitch back &amp; ..."
9,3,1,2,0,1,""" @rhythmixx_ :hobbies include: fighting Maria..."


## Feature generation

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
from nltk.stem.porter import *
import string
import re


stopwords = nltk.corpus.stopwords.words("english")

other_exclusions = ["#ff", "ff", "rt"]
stopwords.extend(other_exclusions)


stemmer = PorterStemmer()


def preprocess(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    #parsed_text = parsed_text.code("utf-8", errors='ignore')
    return parsed_text

def tokenize(tweet):
    """Removes punctuation & excess whitespace, sets to lowercase,
    and stems tweets. Returns a list of stemmed tokens."""
    tweet = " ".join(re.split("[^a-zA-Z]+", tweet.lower())).strip()
    #tokens = re.split("[^a-zA-Z]*", tweet.lower())
    tokens = [stemmer.stem(t) for t in tweet.split()]
    return tokens

def basic_tokenize(tweet):
    """Same as tokenize but without the stemming"""
    tweet = " ".join(re.split("[^a-zA-Z.,!?]+", tweet.lower())).strip()
    return tweet.split()

tokenized_stopwords = tokenize(' '.join(stopwords)) + ["becau"]

vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=tokenized_stopwords, #We do better when we keep stopwords
    use_idf=True,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=10000,
    min_df=5,
    max_df=0.501
    )

In [4]:
#Construct tfidf matrix and get relevant scores
tfidf = vectorizer.fit_transform(tweets).toarray()
vocab = {v:i for i, v in enumerate(vectorizer.get_feature_names())}
idf_vals = vectorizer.idf_
idf_dict = {i:idf_vals[i] for i in vocab.values()} #keys are indices; values are IDF scores

  'stop_words.' % sorted(inconsistent))


In [5]:
#Get POS tags for tweets and save as a string
tweet_tags = []
for t in tweets:
    tokens = basic_tokenize(preprocess(t))
    tags = nltk.pos_tag(tokens)
    tag_list = [x[1] for x in tags]
    #for i in range(0, len(tokens)):
    tag_str = " ".join(tag_list)
    tweet_tags.append(tag_str)
        #print(tokens[i],tag_list[i])

In [6]:
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    max_features=5000,
    min_df=5,
    max_df=0.501,
    )

In [7]:
#Construct POS TF matrix and get vocab dict
pos = pos_vectorizer.fit_transform(pd.Series(tweet_tags)).toarray()
pos_vocab = {v:i for i, v in enumerate(pos_vectorizer.get_feature_names())}

In [8]:
#Now get other features
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer as VS
from textstat.textstat import *

sentiment_analyzer = VS()

def count_twitter_objs(text_string):
    """
    Accepts a text string and replaces:
    1) urls with URLHERE
    2) lots of whitespace with one instance
    3) mentions with MENTIONHERE
    4) hashtags with HASHTAGHERE

    This allows us to get standardized counts of urls and mentions
    Without caring about specific people mentioned.
    
    Returns counts of urls, mentions, and hashtags.
    """
    space_pattern = '\s+'
    giant_url_regex = ('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|'
        '[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
    mention_regex = '@[\w\-]+'
    hashtag_regex = '#[\w\-]+'
    parsed_text = re.sub(space_pattern, ' ', text_string)
    parsed_text = re.sub(giant_url_regex, 'URLHERE', parsed_text)
    parsed_text = re.sub(mention_regex, 'MENTIONHERE', parsed_text)
    parsed_text = re.sub(hashtag_regex, 'HASHTAGHERE', parsed_text)
    return(parsed_text.count('URLHERE'),parsed_text.count('MENTIONHERE'),parsed_text.count('HASHTAGHERE'))

def other_features(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    retweet = 0
    if "rt" in words:
        retweet = 1
    features = [FKRA, FRE,syllables, avg_syl, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['neg'], sentiment['pos'], sentiment['neu'], sentiment['compound'],
                twitter_objs[2], twitter_objs[1],
                twitter_objs[0], retweet]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features(t))
    return np.array(feats)

In [9]:
other_features_names = ["FKRA", "FRE","num_syllables", "avg_syl_per_word", "num_chars", "num_chars_total", \
                        "num_terms", "num_words", "num_unique_words", "vader neg","vader pos","vader neu", "vader compound", \
                        "num_hashtags", "num_mentions", "num_urls", "is_retweet"]

In [10]:
feats = get_feature_array(tweets)

In [11]:
#Now join them all up
M = np.concatenate([tfidf,pos,feats],axis=1)

In [12]:
M.shape

(24783L, 11166L)

In [13]:
#Finally get a list of variable names
variables = ['']*len(vocab)
for k,v in vocab.iteritems():
    variables[v] = k

pos_variables = ['']*len(pos_vocab)
for k,v in pos_vocab.iteritems():
    pos_variables[v] = k

feature_names = variables+pos_variables+other_features_names

# Running the model

This model was found using a GridSearch with 5-fold cross validation. Details are in the notebook operational_classifier.

In [14]:
X = pd.DataFrame(M)
y = df['class'].astype(int)

In [15]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import classification_report
from sklearn.svm import LinearSVC

In [16]:
select = SelectFromModel(LogisticRegression(class_weight='balanced',penalty="l1",C=0.01))
X_ = select.fit_transform(X,y)



In [17]:
model = LinearSVC(class_weight='balanced',C=0.01, penalty='l2', loss='squared_hinge',multi_class='ovr').fit(X_, y)



In [18]:
y_preds = model.predict(X_)

In [19]:
report = classification_report( y, y_preds )

In [20]:
print(report)

              precision    recall  f1-score   support

           0       0.45      0.58      0.51      1430
           1       0.97      0.91      0.94     19190
           2       0.83      0.96      0.89      4163

   micro avg       0.90      0.90      0.90     24783
   macro avg       0.75      0.82      0.78     24783
weighted avg       0.91      0.90      0.91     24783



# Using information from the model to obtain the matrix X_ generically

This is the most difficult task: We have to take the inputs tweets and transform them into a format that can be used in the model without going through all the same pre-processing steps as above. This can be done as follows.

## Obtaining information about the model

In [21]:
final_features = select.get_support(indices=True) #get indices of features
final_feature_list = [unicode(feature_names[i]) for i in final_features] #Get list of names corresponding to indices

In [22]:
print final_feature_list

[u'america', u'american', u'anoth', u'ass', u'ass cracker', u'ass hoe', u'ass nigga', u'bad', u'beaner', u'big', u'bird', u'bitch', u'bitch nigga', u'black', u'border', u'born', u'bout', u'browni', u'campu', u'charli', u'chink', u'color', u'color folk', u'coon', u'countri', u'cracker', u'crazi', u'crippl', u'cunt', u'da', u'damn', u'darki', u'dick', u'die', u'doe', u'dyke', u'fag', u'faggot', u'fat', u'femal', u'feminist', u'filth', u'first', u'folk', u'fucc nicca', u'fuck', u'fuckin', u'game', u'gay', u'get', u'ghetto', u'girl', u'gon', u'good', u'gook', u'got nigga', u'gt gt', u'hate', u'hate hoe', u'hi', u'hire', u'ho', u'hoe', u'hood', u'hope', u'human', u'israel', u'jap', u'jew', u'jihadi', u'kill', u'lame', u'latina', u'let', u'like', u'lol', u'look like', u'love', u'may', u'mexican', u'mock', u'money', u'monkey', u'much', u'muslim', u'muzzi', u'negro', u'nicca', u'nig', u'nigga', u'nigga bitch', u'niggah', u'niggaz', u'nigger', u'nigguh', u'niglet', u'oreo', u'peopl', u'play', u

In [23]:
#Getting names for each class of features
ngram_features = final_feature_list[:final_feature_list.index('yr')+1]
pos_features = final_feature_list[final_feature_list.index('yr')+1:final_feature_list.index('VBD')+1]
oth_features = final_feature_list[final_feature_list.index('VBD')+1:]

## Generating ngram features

In [24]:
new_vocab = {v:i for i, v in enumerate(ngram_features)}
new_vocab_to_index = {}
for k in ngram_features:
    new_vocab_to_index[k] = vocab[k]

In [25]:
#Get indices of text features
ngram_indices = final_features[:len(ngram_features)]

In [26]:
#TODO: Pickle new vectorizer

In [27]:
new_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=tokenize,
    preprocessor=preprocess,
    ngram_range=(1, 3),
    stop_words=stopwords, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_vocab
    )

In [28]:
from sklearn.externals import joblib
joblib.dump(new_vectorizer, 'final_tfidf.pkl') 

['final_tfidf.pkl']

In [29]:
tfidf_ = new_vectorizer.fit_transform(tweets).toarray()

In [30]:
#Verifying that results are the same

In [31]:
tfidf_[1,:]

array([0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0.])

In [32]:
tfidf_[1,:].sum()

2.0

In [33]:
X_[1,:tfidf_.shape[1]]

array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 4.80981477, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 2.81738461, 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [34]:
X_[1,:tfidf_.shape[1]].sum()

7.627199378751962

Results are the same if use IDF but the problem is that IDF will be different if we use different data. Instead we have to use the original IDF scores and multiply them by the new matrix.

In [35]:
idf_vals_ = idf_vals[ngram_indices]

In [36]:
idf_vals_.shape

(155L,)

In [37]:
#TODO: Pickle idf_vals

joblib.dump(idf_vals_, 'final_idf.pkl') 

['final_idf.pkl']

In [38]:
(tfidf_[1,:]*idf_vals_) == X_[1,:153] #Got same value as final process array!

  """Entry point for launching an IPython kernel.


False

In [39]:
tfidf_*idf_vals_ == X_[:,:153]

  """Entry point for launching an IPython kernel.


False

In [40]:
tfidffinal = tfidf_*idf_vals_

## Generating POS features
This is simpler as we do not need to worry about IDF but it will be slower as we have to compute the POS tags for the new data. Here we can simply use the old POS tags.

In [41]:
new_pos = {v:i for i, v in enumerate(pos_features)}

In [42]:
#TODO: Pickle pos vectorizer
#We can use the TFIDF vectorizer to get a token matrix for the POS tags
new_pos_vectorizer = TfidfVectorizer(
    #vectorizer = sklearn.feature_extraction.text.CountVectorizer(
    tokenizer=None,
    lowercase=False,
    preprocessor=None,
    ngram_range=(1, 3),
    stop_words=None, #We do better when we keep stopwords
    use_idf=False,
    smooth_idf=False,
    norm=None, #Applies l2 norm smoothing
    decode_error='replace',
    min_df=1,
    max_df=1.0,
    vocabulary=new_pos
    )

In [43]:
joblib.dump(new_pos_vectorizer, 'final_pos.pkl') 

['final_pos.pkl']

In [44]:
pos_ = new_pos_vectorizer.fit_transform(tweet_tags).toarray()

In [45]:
pos_[1,:]

array([0., 1., 1., 1., 1., 0., 0., 0., 0.])

In [46]:
X_[1,153:159]

array([0., 0., 0., 1., 1., 1.])

In [47]:
pos_[:,:] == X_[:,153:159]

  """Entry point for launching an IPython kernel.


False

In [48]:
pos_[:,:].sum()

95685.0

In [49]:
X_[:,153:159].sum()

33405.363676851324

## Finally, we can look at the other features

In [50]:
print other_features_names

['FKRA', 'FRE', 'num_syllables', 'avg_syl_per_word', 'num_chars', 'num_chars_total', 'num_terms', 'num_words', 'num_unique_words', 'vader neg', 'vader pos', 'vader neu', 'vader compound', 'num_hashtags', 'num_mentions', 'num_urls', 'is_retweet']


In [51]:
print oth_features

[u'FKRA', u'FRE', u'num_syllables', u'num_chars', u'num_chars_total', u'num_terms', u'num_words', u'num_unique_words', u'vader compound', u'num_hashtags', u'num_mentions']


The functions can be modified to only calculate and return necessary fields.

In [52]:
def other_features_(tweet):
    """This function takes a string and returns a list of features.
    These include Sentiment scores, Text and Readability scores,
    as well as Twitter specific features"""
    ##SENTIMENT
    sentiment = sentiment_analyzer.polarity_scores(tweet)
    
    words = preprocess(tweet) #Get text only
    
    syllables = textstat.syllable_count(words) #count syllables in words
    num_chars = sum(len(w) for w in words) #num chars in words
    num_chars_total = len(tweet)
    num_terms = len(tweet.split())
    num_words = len(words.split())
    avg_syl = round(float((syllables+0.001))/float(num_words+0.001),4)
    num_unique_terms = len(set(words.split()))
    
    ###Modified FK grade, where avg words per sentence is just num words/1
    FKRA = round(float(0.39 * float(num_words)/1.0) + float(11.8 * avg_syl) - 15.59,1)
    ##Modified FRE score, where sentence fixed to 1
    FRE = round(206.835 - 1.015*(float(num_words)/1.0) - (84.6*float(avg_syl)),2)
    
    twitter_objs = count_twitter_objs(tweet) #Count #, @, and http://
    features = [FKRA, FRE, syllables, num_chars, num_chars_total, num_terms, num_words,
                num_unique_terms, sentiment['compound'],
                twitter_objs[2], twitter_objs[1],]
    #features = pandas.DataFrame(features)
    return features

def get_feature_array_(tweets):
    feats=[]
    for t in tweets:
        feats.append(other_features_(t))
    return np.array(feats)

In [53]:
feats_ = get_feature_array_(tweets)

In [54]:
feats_[0,:]

array([  8.3   ,  79.94  ,  30.    , 127.    , 140.    ,  25.    ,
        25.    ,  23.    ,   0.4563,   0.    ,   1.    ])

In [55]:
X_[0,159:]

array([  0.    ,   1.    ,   2.    ,   1.    ,   0.    ,   8.3   ,
        79.94  ,  30.    , 127.    , 140.    ,  25.    ,  25.    ,
        23.    ,   0.4563,   0.    ,   1.    ])

In [56]:
feats_[:,:] == X_[:,159:]

  """Entry point for launching an IPython kernel.


False

## Now that we have put it all together using a simplified process we can assess if these new data return the same answers.

In [57]:
M_ = np.concatenate([tfidffinal, pos_, feats_],axis=1)

In [58]:
M_.shape

(24783L, 175L)

In [59]:
X__ = pd.DataFrame(M_)

In [60]:
y_preds_ = model.predict(X__)

In [61]:
report = classification_report( y, y_preds_ )

In [62]:
print(report)

              precision    recall  f1-score   support

           0       0.45      0.58      0.51      1430
           1       0.97      0.91      0.94     19190
           2       0.83      0.96      0.89      4163

   micro avg       0.90      0.90      0.90     24783
   macro avg       0.75      0.82      0.78     24783
weighted avg       0.91      0.90      0.91     24783



OK. So now that we have verified that the results are the same with X_ and X__ we can implement a script that can transform new data in this manner.