In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [2]:
# Read the FY2019 Guilford County emotion table, then discard the
# 'Unnamed: 0' column that came along with the CSV.
GC_df = pd.read_csv(
    r"../util/data/FY2019/structured/emotion/GuilfordCountyEmotionDataFY19.csv"
)
GC_df = GC_df.drop('Unnamed: 0', axis=1)
GC_df.head(5)

Unnamed: 0,page_number,word,sent_count,sentiment,category
0,447,Fire,46,Fear,Emotion
1,302,Fire,44,Fear,Emotion
2,303,Fire,36,Fear,Emotion
3,444,Fire,31,Fear,Emotion
4,389,County,30,Trust,Emotion


In [3]:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# Start from NLTK's English stop-word list and append extra filler tokens
# that should also be ignored during topic modelling.
from nltk.corpus import stopwords
stop_words = stopwords.words('english')
stop_words += [
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could', '_',
    'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many', 'some',
    'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily', 'lot', 'lack',
    'make', 'want', 'seem', 'run', 'need', 'even', 'right', 'line', 'even',
    'also', 'may', 'take', 'come',
]

%matplotlib inline
# Hide DeprecationWarning messages and only let ERROR-level (or higher)
# log records through, using a "time : level : message" layout.
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
)
logging.basicConfig(
    level=logging.ERROR,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

In [5]:
def sent_to_words(sentences):
    """Lazily tokenize each entry of ``sentences``.

    Every entry is coerced to text, stripped of e-mail addresses, has its
    whitespace runs (including newlines) collapsed to single spaces and its
    single quotes removed, and is then passed to
    ``gensim.utils.simple_preprocess`` with ``deacc=True``.

    Args:
        sentences: iterable of text entries. Non-string entries (for example
            a float NaN coming out of a pandas column) are converted with
            ``str()`` before any regex is applied.

    Yields:
        The token list returned by ``simple_preprocess`` for each entry, in
        input order.
    """
    for sent in sentences:
        # Coerce first: re.sub raises TypeError on non-string input.
        sent = str(sent)
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # collapse whitespace / newlines
        sent = sent.replace("'", "")  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Treat every entry of the `word` column as one text to tokenize
data = GC_df['word'].values.tolist()
data_words = [tokens for tokens in sent_to_words(data)]
# Show the tokens produced for the first entry
print(data_words[:1])

[['fire']]


In [6]:
# Learn phrase models from the tokenized data: the bigram model first, then a
# trigram model trained on the bigram-transformed tokens.
# (A higher threshold yields fewer phrases.)
bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

# Wrap both models in Phraser objects, which are what process_words applies
bigram_mod, trigram_mod = (
    gensim.models.phrases.Phraser(phrase_model)
    for phrase_model in (bigram, trigram)
)

# !python3 -m spacy download en  # run in terminal once
def process_words(texts, stop_words=stop_words, allowed_postags=('NOUN', 'ADJ', 'VERB', 'ADV')):
    """Remove stopwords, form bigrams/trigrams and lemmatize.

    Args:
        texts: iterable of documents, each one a sequence of tokens.
        stop_words: collection of tokens to drop. Defaults to the
            module-level ``stop_words`` object bound when this function was
            defined.
        allowed_postags: spaCy coarse part-of-speech tags; only tokens tagged
            with one of these survive lemmatization.

    Returns:
        A list with one list of lemmatized tokens per input document.

    Raises:
        OSError: if neither the 'en' shortcut nor the 'en_core_web_sm'
            package can be loaded by spaCy.
    """
    # str(doc) turns the token list into its repr; simple_preprocess then
    # re-tokenizes that text (and lowercases it) before stop-word filtering.
    texts = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts]
    texts = [bigram_mod[doc] for doc in texts]
    # The trigram step re-applies the bigram model before the trigram model.
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # The parser and NER components are disabled; only tags/lemmas are used.
    try:
        nlp = spacy.load('en', disable=['parser', 'ner'])
    except OSError:
        # spaCy 3 no longer resolves the 'en' shortcut; use the package name.
        nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])

    # remove stopwords once more after lemmatization
    texts_out = [[word for word in simple_preprocess(str(doc)) if word not in stop_words] for doc in texts_out]
    return texts_out

# Run the stop-word / phrase / lemmatization pipeline over the tokenized entries;
# data_ready holds one list of processed tokens per entry of data_words.
data_ready = process_words(data_words)  # processed Text Data!

In [7]:
# Token <-> integer id mapping built from the processed documents
id2word = corpora.Dictionary(data_ready)

# Bag-of-words representation of every processed document
corpus = list(map(id2word.doc2bow, data_ready))

# Fit a 10-topic LDA model; alpha and eta are learned ('auto') and the seed is
# fixed so repeated fits give the same topics.
lda_model = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=id2word,
    num_topics=10,
    passes=10,
    alpha='auto',
    eta='auto',
    random_state=1,
)

pprint(lda_model.print_topics())

[(0,
  '0.185*"grant" + 0.166*"level" + 0.106*"child" + 0.094*"balance" + '
  '0.088*"food" + 0.057*"expense" + 0.053*"technology" + 0.051*"detention" + '
  '0.039*"demand" + 0.033*"long"'),
 (1,
  '0.320*"improve" + 0.305*"budget" + 0.102*"develop" + 0.051*"authorize" + '
  '0.036*"personal" + 0.030*"full" + 0.015*"recovery" + 0.012*"disposal" + '
  '0.011*"innovation" + 0.010*"vision"'),
 (2,
  '0.369*"resource" + 0.216*"continue" + 0.128*"salary" + 0.051*"loss" + '
  '0.030*"expenditure" + 0.027*"population" + 0.027*"prevention" + '
  '0.022*"violence" + 0.010*"illegal" + 0.006*"machine"'),
 (3,
  '0.202*"medical" + 0.145*"income" + 0.091*"present" + 0.080*"improvement" + '
  '0.072*"legal" + 0.068*"soil" + 0.046*"account" + 0.040*"communication" + '
  '0.038*"approval" + 0.026*"stone"'),
 (4,
  '0.140*"management" + 0.126*"actual" + 0.125*"government" + 0.122*"change" + '
  '0.083*"building" + 0.079*"risk" + 0.066*"time" + 0.052*"call" + '
  '0.039*"operation" + 0.028*"serve"'),
 (

In [9]:
# Collapse the ten sentiment/emotion labels into a two-valued flag, stored as
# the strings "0" and "1".
GC_df["sentiment"] = GC_df["sentiment"].replace(
    {
        **dict.fromkeys(["Negative", "Sadness", "Fear", "Anger", "Disgust"], "0"),
        **dict.fromkeys(["Positive", "Trust", "Anticipation", "Surprise", "Joy"], "1"),
    }
)

In [10]:
# Preview the first rows of GC_df to inspect the sentiment column
GC_df.head()

Unnamed: 0,page_number,word,sent_count,sentiment,category
0,447,Fire,46,0,Emotion
1,302,Fire,44,0,Emotion
2,303,Fire,36,0,Emotion
3,444,Fire,31,0,Emotion
4,389,County,30,1,Emotion


In [11]:

# Convert the sentiment column to a numeric dtype, then print the frame summary
GC_df['sentiment'] = GC_df['sentiment'].pipe(pd.to_numeric)
GC_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17148 entries, 0 to 17147
Data columns (total 5 columns):
page_number    17148 non-null int64
word           17148 non-null object
sent_count     17148 non-null int64
sentiment      17148 non-null int64
category       17148 non-null object
dtypes: int64(3), object(2)
memory usage: 669.9+ KB


In [12]:
# Preview the first rows of GC_df again after the numeric conversion
GC_df.head()

Unnamed: 0,page_number,word,sent_count,sentiment,category
0,447,Fire,46,0,Emotion
1,302,Fire,44,0,Emotion
2,303,Fire,36,0,Emotion
3,444,Fire,31,0,Emotion
4,389,County,30,1,Emotion


In [13]:
# Build one feature row per record of GC_df: the LDA topic distribution of the
# record's bag-of-words, followed by two extra features taken from the frame.
n_topics = lda_model.num_topics
sent_counts = GC_df['sent_count'].tolist()
word_lengths = [len(word) for word in GC_df['word']]

train_vecs = []
for doc_idx in range(len(GC_df)):
    top_topics = lda_model.get_document_topics(corpus[doc_idx], minimum_probability=0.0)
    # Look probabilities up by topic id rather than by list position, so a
    # topic left out of the result becomes 0.0 instead of shifting the columns
    # or raising IndexError.
    topic_probs = dict(top_topics)
    topic_vec = [topic_probs.get(topic_id, 0.0) for topic_id in range(n_topics)]
    topic_vec.append(sent_counts[doc_idx])   # value of the sent_count column
    topic_vec.append(word_lengths[doc_idx])  # number of characters in the word
    train_vecs.append(topic_vec)

In [109]:
# Inspect the feature vector built for the third record (index 2)
train_vecs[2]

[0.04846649,
 0.042821117,
 0.03781131,
 0.0386842,
 0.055064,
 0.050130684,
 0.043984495,
 0.087888956,
 0.54818475,
 0.046964042,
 36,
 4]

In [110]:
from pprint import pprint
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
%config InlineBackend.figure_formats = ['retina']
from sklearn.metrics import f1_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score

In [111]:
# Feature matrix: one row per entry of train_vecs
X = np.asarray(train_vecs)

In [112]:
# Target vector taken from the sentiment column of GC_df
y = np.array(GC_df['sentiment'])

In [113]:

# Shuffled, seeded 5-fold splitter plus one empty F1 accumulator per model
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_lr_f1 = []
cv_lrsgd_f1 = []
cv_svcsgd_f1 = []

In [94]:
# Cross-validate three class-weight-balanced linear classifiers. For every
# fold of `kf`, the binary F1 on the validation rows is appended to the
# matching cv_*_f1 list.
for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]

    # Scale Data: the scaler is fitted on the training rows only and then
    # applied unchanged to the validation rows.
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logistic Regression (newton-cg solver)
    lr = LogisticRegression(
        class_weight='balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average='binary'))

    # Logistic Regression fitted with SGD (log loss)
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight='balanced',
        random_state=42  # seed the optimiser so fold scores are repeatable
    ).fit(X_train_scale, y_train)

    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=100,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight='balanced',
        random_state=42  # seed the optimiser so fold scores are repeatable
    ).fit(X_train_scale, y_train)

    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


In [24]:
# Mean and standard deviation of the per-fold F1 scores for the two
# logistic-regression variants
lr_mean, lr_std = np.mean(cv_lr_f1), np.std(cv_lr_f1)
lrsgd_mean, lrsgd_std = np.mean(cv_lrsgd_f1), np.std(cv_lrsgd_f1)
print(f'Logistic Regression Val f1: {lr_mean:.3f} +- {lr_std:.3f}')
print(f'Logisitic Regression SGD Val f1: {lrsgd_mean:.3f} +- {lrsgd_std:.3f}')

Logistic Regression Val f1: 0.692 +- 0.006
Logisitic Regression SGD Val f1: 0.655 +- 0.098


In [25]:
# Rebuild the feature rows using only the LDA topic distribution of each
# record's bag-of-words (no extra columns).
n_topics = lda_model.num_topics

train_vecs = []
for doc_idx in range(len(GC_df)):
    top_topics = lda_model.get_document_topics(corpus[doc_idx], minimum_probability=0.0)
    # Look probabilities up by topic id rather than by list position, so a
    # topic left out of the result becomes 0.0 instead of shifting the columns
    # or raising IndexError.
    topic_probs = dict(top_topics)
    train_vecs.append([topic_probs.get(topic_id, 0.0) for topic_id in range(n_topics)])

In [26]:
# Feature matrix rebuilt from the current train_vecs (one row per entry)
X = np.asarray(train_vecs)

In [27]:
# Fresh seeded 5-fold splitter and emptied F1 accumulators for the next run
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_lr_f1 = []
cv_lrsgd_f1 = []
cv_svcsgd_f1 = []

In [28]:
# Cross-validate three class-weight-balanced linear classifiers. For every
# fold of `kf`, the binary F1 on the validation rows is appended to the
# matching cv_*_f1 list.
for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]

    # Scale Data: the scaler is fitted on the training rows only and then
    # applied unchanged to the validation rows.
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logistic Regression (newton-cg solver)
    lr = LogisticRegression(
        class_weight='balanced',
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average='binary'))

    # Logistic Regression fitted with SGD (log loss)
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight='balanced',
        random_state=42  # seed the optimiser so fold scores are repeatable
    ).fit(X_train_scale, y_train)

    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=100,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight='balanced',
        random_state=42  # seed the optimiser so fold scores are repeatable
    ).fit(X_train_scale, y_train)

    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

  'precision', 'predicted', average, warn_for)


In [35]:
# Mean and standard deviation of the per-fold F1 scores for the two
# logistic-regression variants
lr_mean, lr_std = np.mean(cv_lr_f1), np.std(cv_lr_f1)
lrsgd_mean, lrsgd_std = np.mean(cv_lrsgd_f1), np.std(cv_lrsgd_f1)
print(f'Logistic Regression Val f1: {lr_mean:.3f} +- {lr_std:.3f}')
print(f'Logisitic Regression SGD Val f1: {lrsgd_mean:.3f} +- {lrsgd_std:.3f}')

Logistic Regression Val f1: 0.799 +- 0.005
Logisitic Regression SGD Val f1: 0.552 +- 0.134


In [36]:
# Mean and standard deviation of the per-fold F1 scores of the
# modified-Huber SGD classifier
huber_mean, huber_std = np.mean(cv_svcsgd_f1), np.std(cv_svcsgd_f1)
print(f'SVM Huber Val f1: {huber_mean:.3f} +- {huber_std:.3f}')

SVM Huber Val f1: 0.696 +- 0.348


In [114]:
# Cross-validate the same three linear classifiers without class weighting.
# The accumulators are emptied first so the scores reported afterwards come
# from this run only and are not mixed with folds appended by an earlier loop.
cv_lr_f1, cv_lrsgd_f1, cv_svcsgd_f1 = [], [], []

for train_ind, val_ind in kf.split(X, y):
    # Assign CV IDX
    X_train, y_train = X[train_ind], y[train_ind]
    X_val, y_val = X[val_ind], y[val_ind]

    # Scale Data: the scaler is fitted on the training rows only and then
    # applied unchanged to the validation rows.
    scaler = StandardScaler()
    X_train_scale = scaler.fit_transform(X_train)
    X_val_scale = scaler.transform(X_val)

    # Logistic Regression (newton-cg solver)
    lr = LogisticRegression(
        class_weight=None,
        solver='newton-cg',
        fit_intercept=True
    ).fit(X_train_scale, y_train)

    y_pred = lr.predict(X_val_scale)
    cv_lr_f1.append(f1_score(y_val, y_pred, average='binary'))

    # Logistic Regression fitted with SGD (log loss)
    sgd = linear_model.SGDClassifier(
        max_iter=1000,
        tol=1e-3,
        loss='log',
        class_weight=None,
        random_state=42  # seed the optimiser so fold scores are repeatable
    ).fit(X_train_scale, y_train)

    y_pred = sgd.predict(X_val_scale)
    cv_lrsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

    # SGD Modified Huber
    sgd_huber = linear_model.SGDClassifier(
        max_iter=100,
        tol=1e-3,
        alpha=20,
        loss='modified_huber',
        class_weight=None,
        random_state=42  # seed the optimiser so fold scores are repeatable
    ).fit(X_train_scale, y_train)

    y_pred = sgd_huber.predict(X_val_scale)
    cv_svcsgd_f1.append(f1_score(y_val, y_pred, average='binary'))

In [115]:
# Mean and standard deviation of the per-fold F1 scores for all three models
lr_mean, lr_std = np.mean(cv_lr_f1), np.std(cv_lr_f1)
lrsgd_mean, lrsgd_std = np.mean(cv_lrsgd_f1), np.std(cv_lrsgd_f1)
huber_mean, huber_std = np.mean(cv_svcsgd_f1), np.std(cv_svcsgd_f1)
print(f'Logistic Regression Val f1: {lr_mean:.3f} +- {lr_std:.3f}')
print(f'Logisitic Regression SGD Val f1: {lrsgd_mean:.3f} +- {lrsgd_std:.3f}')
print(f'SVM Huber Val f1: {huber_mean:.3f} +- {huber_std:.3f}')

Logistic Regression Val f1: 0.869 +- 0.003
Logisitic Regression SGD Val f1: 0.855 +- 0.008
SVM Huber Val f1: 0.870 +- 0.003


In [71]:
    # Save the trained LDA model under the name 'lda_train.model'.
    # NOTE(review): this statement is indented although it sits at the top
    # level of its cell; outside IPython that is an IndentationError - confirm
    # and dedent.
    lda_model.save('lda_train.model')