In [48]:
import re
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns
import string
import nltk
import warnings 
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English

from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
import spacy
from sklearn.metrics import average_precision_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
#Import warnings filter
from warnings import simplefilter
#Ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings("ignore", category=DeprecationWarning)

%matplotlib inline

In [49]:
from tika import parser
import pandas as pd
import numpy as np

raw2 = parser.from_file('CharlotteCityFunds2008.pdf')
print(raw2['content'])














































General revenues are projected to rebound from the economic downturn that began in FY2002. The City 

continues to face limitations in balancing prior year reductions and continuing increases in service level 

demands. However, City employees continue to work hard to prevent these reductions from negatively 

impacting the level of service provided to the community. Examples of prior year reductions are listed 

below. A complete listing of the FY08 and FY09 unfunded budget requests is provided on the following 

pages. FY03 Delayed Police and Fire recruit classes Froze positions in Street Maintenance Reduced 

funding for landscape maintenance and trash collection along right-of-way Eliminated recycling 

education programs Reduced inventory of Fire supplies Reduced printing, publishing, travel and training, 

and miscellaneous supplies and services FY04 Froze civilian positions in Police Froze positions in Fire 

recruit classes Reduced T




In [50]:
test = list(raw2.values())

In [51]:
import re as re

In [52]:
makeitastring = ''.join(map(str, test))

In [53]:
type(makeitastring)

str

In [54]:
test1 = re.split('(?<!\d)[.]|[.](?!\d)',makeitastring )

In [55]:
df = pd.DataFrame({'text':test1})

In [56]:
df.head()

Unnamed: 0,text
0,200\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\...
1,The City \n\ncontinues to face limitations in...
2,"However, City employees continue to work hard..."
3,Examples of prior year reductions are listed ...
4,A complete listing of the FY08 and FY09 unfun...


In [57]:
df.dropna(subset=['text'], inplace=True)

In [58]:
df['text'] = df['text'].str.replace("[^a-zA-Z#]", " ")

In [59]:
df['text']

0                                                    ...
1       The City   continues to face limitations in b...
2       However  City employees continue to work hard...
3       Examples of prior year reductions are listed ...
4       A complete listing of the FY   and FY   unfun...
5       FY   Delayed Police and Fire recruit classes ...
6         FY   Eliminated all landscaping and grounds...
7             Charlotte Mecklenburg Police           ...
8                  in FY   and            in FY   for...
9       One administrative support position is includ...
10               in FY   and          in FY   for fiv...
11               in FY   and          in FY   for fiv...
12      The positions are three Criminalists  a Laten...
13                 in FY   and FY   for five position...
14               in FY   for   Video Observation Cent...
15               in FY   and FY   for thirteen Police...
16                in FY   and FY   for an additional ...
17                in FY   and F

In [60]:
df = df[:116]

In [61]:
def remove_pattern(input_txt, pattern):
    r = re.findall(pattern, input_txt)
    for i in r:
        input_txt = re.sub(i, '', input_txt)
        
    return input_txt 

In [62]:
df['text'] = np.vectorize(remove_pattern)(df['text'], "@[\w]*")

In [63]:
df.head()

Unnamed: 0,text
0,...
1,The City continues to face limitations in b...
2,However City employees continue to work hard...
3,Examples of prior year reductions are listed ...
4,A complete listing of the FY and FY unfun...


In [64]:
df['text'] = df['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>3]))

In [65]:
df2 = df[df['text'].map(lambda d: len(d)) > 0]

In [66]:
from afinn import Afinn

In [67]:
afinn = Afinn(language='en')

In [68]:
df['afinn_score'] = df['text'].apply(afinn.score)

In [69]:
def label_sentiment (score):   
    if score < 0:
        return '0'
    if score == 0:
        return '1'
    if score > 0:
        return '1'

In [70]:
df['emotion'] = df['afinn_score'].apply(label_sentiment)

In [71]:
df.head()

Unnamed: 0,text,afinn_score,emotion
0,General revenues projected rebound from econom...,0.0,1
1,City continues face limitations balancing prio...,-1.0,0
2,However City employees continue work hard prev...,-2.0,0
3,Examples prior year reductions listed below,0.0,1
4,complete listing unfunded budget requests prov...,0.0,1


In [72]:
#Load English tokenizer, tagger, parser and word vectors
#nlp = spacy.load('en')
#nlp = spacy.load('en_core_web_sm')
#Import stop words from English language
stop_words = spacy.lang.en.stop_words.STOP_WORDS

#Load English tokenizer, tagger, parser, NER and word vectors
parser = English()

#Create function to strip information we don’t need, like stopwords and punctuation, from each review
def review_tokenizer(sentence):
    #Create token object, which is used to create documents with linguistic annotations
    mytokens = parser(sentence)

    #Lemmatizing each token and converting each token into lowercase, if it is a personal pronoun, don't lemmatize, just convert to lowercase
    mytokens = [token.lemma_.lower().strip() if token.lemma_ != "-PRON-" else token.lower_ for token in mytokens]

    # Removing stop words
    #mytokens = [word for word in mytokens if word not in stop_words and word not in punctuations]

    # return preprocessed list of tokens
    return mytokens

In [76]:
#The features we want to analyze
X = df['text']

#The labels we want to test against
ylabels = df['emotion'] 

#Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2)

In [79]:
#List of classifiers
classifiers = [LogisticRegression()]
  
#This vectorizer breaks text into single words and bi-grams and then calculates the TF-IDF representation
tfidf_vector = TfidfVectorizer(tokenizer = review_tokenizer, ngram_range=(1,2)).fit(X_train)


X_train_vectorized = tfidf_vector.transform(X_train)

#Let us see the prediction accuracy for each classifier
for classifier in classifiers:
    model = classifier
    
    #Fit the model on the vectorized matrix
    model.fit(X_train_vectorized, y_train)
    
    #Predict the rating
    predicted = model.predict(tfidf_vector.transform(X_test))
   
    #Calculate error between actual values and predicted values
    mse = mean_squared_error(y_test, predicted)
    rmse = np.sqrt(mse)
    print ("RMSE :", rmse)
    accuracy = accuracy_score(y_test, predicted)
    print("Accuracy : %.2f%%" % (accuracy * 100.0))

RMSE : 0.28867513459481287
Accuracy : 91.67%
