In [8]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt

In [9]:
# Read the FY18 emotion CSV and discard the 'Unnamed: 0' column,
# then preview the first five rows.
GC_df_2 = pd.read_csv(
    r"../util/data/FY2018/structured/emotion/GuilfordCountyEmotionDataFY18.csv"
).drop(columns=['Unnamed: 0'])
GC_df_2.head(5)

Unnamed: 0,page_number,word,sent_count,sentiment,category
0,537,Fire,46,Fear,Emotion
1,414,Fire,44,Fear,Emotion
2,415,Fire,36,Fear,Emotion
3,534,Fire,31,Fear,Emotion
4,531,Fire,29,Fear,Emotion


In [10]:
import sys
# !{sys.executable} -m spacy download en
import re, numpy as np, pandas as pd
from pprint import pprint

# Gensim
import gensim, spacy, logging, warnings
import gensim.corpora as corpora
from gensim.utils import lemmatize, simple_preprocess
from gensim.models import CoherenceModel
import matplotlib.pyplot as plt

# NLTK Stop words
from nltk.corpus import stopwords
# Extra tokens filtered in addition to NLTK's English stop-word list.
extra_stop_words = [
    'from', 'subject', 're', 'edu', 'use', 'not', 'would', 'say', 'could',
    '_', 'be', 'know', 'good', 'go', 'get', 'do', 'done', 'try', 'many',
    'some', 'nice', 'thank', 'think', 'see', 'rather', 'easy', 'easily',
    'lot', 'lack', 'make', 'want', 'seem', 'run', 'need', 'even', 'right',
    'line', 'even', 'also', 'may', 'take', 'come',
]
stop_words = stopwords.words('english') + extra_stop_words

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Suppress DeprecationWarning messages for the rest of the session.
warnings.filterwarnings("ignore",category=DeprecationWarning)
# Log only ERROR-level records and above, prefixed with timestamp and level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.ERROR)

In [4]:
# Read the FY19 emotion CSV and discard the 'Unnamed: 0' column,
# then preview the first five rows.
# NOTE(review): `GC_df` is not referenced by any later cell visible here —
# confirm whether it was meant to be the training set.
GC_df = pd.read_csv(
    r"../util/data/FY2019/structured/emotion/GuilfordCountyEmotionDataFY19.csv"
).drop(columns=['Unnamed: 0'])
GC_df.head(5)

Unnamed: 0,page_number,word,sent_count,sentiment,category
0,447,Fire,46,Fear,Emotion
1,302,Fire,44,Fear,Emotion
2,303,Fire,36,Fear,Emotion
3,444,Fire,31,Fear,Emotion
4,389,County,30,Trust,Emotion


In [11]:
def sent_to_words(sentences):
    """Yield a list of tokens for each input sentence.

    Each item has e-mail-like tokens removed, whitespace runs collapsed to a
    single space and single quotes stripped, and is then tokenized with
    ``gensim.utils.simple_preprocess`` (``deacc=True``).

    Parameters
    ----------
    sentences : iterable
        Raw text items. Each item is coerced with ``str()`` before cleaning.

    Yields
    ------
    list of str
        The tokens ``simple_preprocess`` returns for one item.
    """
    for sent in sentences:
        # Coerce up front so the regex calls below always receive a string.
        sent = str(sent)
        # Raw strings keep the regex escapes literal and avoid Python's
        # invalid-escape-sequence warnings.
        sent = re.sub(r'\S*@\S*\s?', '', sent)  # remove emails
        sent = re.sub(r'\s+', ' ', sent)  # collapse whitespace, incl. newlines
        sent = sent.replace("'", "")  # remove single quotes
        yield gensim.utils.simple_preprocess(sent, deacc=True)

# Tokenize every entry of the 'word' column and peek at the first result.
data = GC_df_2['word'].tolist()
data_words = [tokens for tokens in sent_to_words(data)]
print(data_words[:1])
# For this dataset the first entry tokenizes to [['fire']].

[['fire']]


In [12]:
# Phrase detection: learn bigrams from the tokenized documents, then learn
# trigrams on the bigram-transformed documents. A higher threshold yields
# fewer phrases.
bigram = gensim.models.Phrases(
    data_words,
    min_count=5,
    threshold=100,
)
bigram_mod = gensim.models.phrases.Phraser(bigram)

trigram = gensim.models.Phrases(
    bigram[data_words],
    threshold=100,
)
trigram_mod = gensim.models.phrases.Phraser(trigram)

# !python3 -m spacy download en  # run in terminal once
def process_words(texts, stop_words=stop_words, allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV']):
    """Remove stopwords, form bigrams and trigrams, then lemmatize.

    Relies on the module-level ``bigram_mod`` and ``trigram_mod`` phrase
    models and on the spaCy ``'en'`` model being installed.

    Parameters
    ----------
    texts : iterable
        Documents to process; each is passed through ``str()`` and
        ``simple_preprocess`` before filtering.
    stop_words : iterable of str
        Tokens to drop, both before phrase detection and after lemmatization.
    allowed_postags : iterable of str
        spaCy coarse POS tags (``token.pos_``) whose lemmas are kept.

    Returns
    -------
    list of list of str
        One list of lemmas per input document.
    """
    # Set membership is O(1); scanning the stop-word list for every token
    # was O(len(stop_words)) per lookup with the same result.
    stop_set = set(stop_words)
    allowed = set(allowed_postags)

    def _drop_stopwords(docs):
        # str(doc) on a token list yields its repr, which simple_preprocess
        # tokenizes again before the stop-word filter is applied.
        return [[word for word in simple_preprocess(str(doc)) if word not in stop_set] for doc in docs]

    texts = _drop_stopwords(texts)
    texts = [bigram_mod[doc] for doc in texts]
    texts = [trigram_mod[bigram_mod[doc]] for doc in texts]

    # The parser and NER components are disabled; only POS tags and lemmas are read.
    nlp = spacy.load('en', disable=['parser', 'ner'])
    texts_out = []
    for sent in texts:
        doc = nlp(" ".join(sent))
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed])

    # Lemmatization can produce stop words again, so filter once more.
    return _drop_stopwords(texts_out)

# Run the full cleaning pipeline over the tokenized documents.
data_ready = process_words(texts=data_words)

In [7]:
# NOTE(review): this cell's execution count (7) is lower than that of the
# cells above it (11, 12); re-run top to bottom so the model and the printed
# topics reflect the current `data_ready`.

# Token <-> integer id mapping built from the processed documents.
id2word = corpora.Dictionary(data_ready)

# Bag-of-words corpus: one doc2bow result per processed document.
corpus = list(map(id2word.doc2bow, data_ready))

# LDA hyper-parameters, gathered in one place.
lda_params = dict(
    num_topics=10,
    passes=10,
    alpha='auto',
    eta='auto',
    random_state=1,
)
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=id2word, **lda_params)

pprint(lda_model.print_topics())

[(0,
  '0.185*"grant" + 0.166*"level" + 0.106*"child" + 0.094*"balance" + '
  '0.088*"food" + 0.057*"expense" + 0.053*"technology" + 0.051*"detention" + '
  '0.039*"demand" + 0.033*"long"'),
 (1,
  '0.320*"improve" + 0.305*"budget" + 0.102*"develop" + 0.051*"authorize" + '
  '0.036*"personal" + 0.030*"full" + 0.015*"recovery" + 0.012*"disposal" + '
  '0.011*"innovation" + 0.010*"vision"'),
 (2,
  '0.369*"resource" + 0.216*"continue" + 0.128*"salary" + 0.051*"loss" + '
  '0.030*"expenditure" + 0.027*"population" + 0.027*"prevention" + '
  '0.022*"violence" + 0.010*"illegal" + 0.006*"machine"'),
 (3,
  '0.202*"medical" + 0.145*"income" + 0.091*"present" + 0.080*"improvement" + '
  '0.072*"legal" + 0.068*"soil" + 0.046*"account" + 0.040*"communication" + '
  '0.038*"approval" + 0.026*"stone"'),
 (4,
  '0.140*"management" + 0.126*"actual" + 0.125*"government" + 0.122*"change" + '
  '0.083*"building" + 0.079*"risk" + 0.066*"time" + 0.052*"call" + '
  '0.039*"operation" + 0.028*"serve"'),
 (

In [13]:
# Bag-of-words corpus used for scoring, built with the existing `id2word` dictionary.
# NOTE(review): this is built from the same `data_ready` as the training
# corpus — confirm whether separate held-out text was intended.
test_corpus = list(map(id2word.doc2bow, data_ready))

In [16]:
# Build one feature row per record: the document's LDA topic probabilities
# (indexed by topic id) followed by the character length of the record's word.
# The rows later become X and are paired with labels from GC_df_2, so the two
# sources must be row-aligned.
assert len(test_corpus) == len(GC_df_2), "test_corpus and GC_df_2 must have the same length"

n_topics = lda_model.num_topics  # replaces the hard-coded 10
test_vecs = []
for bow, word in zip(test_corpus, GC_df_2['word']):
    top_topics = lda_model.get_document_topics(bow, minimum_probability=0.0)
    # Fill by topic id rather than by list position, so a topic missing from
    # the returned list cannot shift or truncate the vector.
    topic_vec = [0.0] * n_topics
    for topic_id, prob in top_topics:
        topic_vec[topic_id] = prob
    topic_vec.append(len(word))  # length of the word string
    test_vecs.append(topic_vec)

In [17]:
# Number of feature vectors built; compare with the dataframe row count below.
len(test_vecs)

19475

In [18]:

# Row count of the source dataframe, for comparison with len(test_vecs).
len(GC_df_2)

19475

In [20]:
# Collapse the sentiment/emotion labels into binary string codes:
# "0" for the negative group, "1" for the positive group.
negative_labels = ["Negative", "Sadness", "Fear", "Anger", "Disgust"]
positive_labels = ["Positive", "Trust", "Anticipation", "Surprise", "Joy"]
sentiment_codes = {label: "0" for label in negative_labels}
sentiment_codes.update({label: "1" for label in positive_labels})
GC_df_2['sentiment'] = GC_df_2['sentiment'].replace(sentiment_codes)

In [21]:
# Preview the frame to check the recoded 'sentiment' column.
GC_df_2.head()

Unnamed: 0,page_number,word,sent_count,sentiment,category
0,537,Fire,46,0,Emotion
1,414,Fire,44,0,Emotion
2,415,Fire,36,0,Emotion
3,534,Fire,31,0,Emotion
4,531,Fire,29,0,Emotion


In [22]:
# Convert the 'sentiment' codes to a numeric dtype, then print the frame
# summary to confirm the resulting column dtypes.
sentiment_numeric = pd.to_numeric(GC_df_2['sentiment'])
GC_df_2['sentiment'] = sentiment_numeric
GC_df_2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19475 entries, 0 to 19474
Data columns (total 5 columns):
page_number    19475 non-null int64
word           19475 non-null object
sent_count     19475 non-null int64
sentiment      19475 non-null int64
category       19475 non-null object
dtypes: int64(3), object(2)
memory usage: 760.8+ KB


In [23]:
import numpy as np
# Feature matrix: one row per record, built from the list of topic vectors.
X = np.array(test_vecs, copy=True)

In [25]:
# Target vector: the numeric 'sentiment' column as a NumPy array.
y = np.array(GC_df_2['sentiment'])

In [26]:
from pprint import pprint
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import seaborn as sns
%config InlineBackend.figure_formats = ['retina']
from sklearn.metrics import f1_score
from sklearn import linear_model
from sklearn import metrics
from sklearn.model_selection import KFold
from sklearn.metrics import fbeta_score

In [27]:
# Standardize the features, fit two linear classifiers and print each one's F1.
# NOTE(review): both models are fitted and scored on the same rows (X, y), so
# the printed F1 values are in-sample. Use a held-out split (train_test_split
# is already imported) for a generalization estimate, and compare against a
# majority-class baseline.
ss = StandardScaler()
X = ss.fit_transform(X)

lr = LogisticRegression(
    class_weight=None,
    solver='newton-cg',
    fit_intercept=True,
).fit(X, y)

y_pred_lr = lr.predict(X)
print(f1_score(y, y_pred_lr, average='binary'))

sgd_huber = linear_model.SGDClassifier(
    max_iter=100,
    tol=1e-3,
    alpha=20,
    loss='modified_huber',
    class_weight=None,
    random_state=1,  # SGD shuffles the samples; seed it so re-runs reproduce
).fit(X, y)

y_pred_huber = sgd_huber.predict(X)
print(f1_score(y, y_pred_huber, average='binary'))

0.8775611031997443
0.883026010151702
