# Sentiment Analysis
----

## Scraping Data

In [1]:
from newspaper import Article
import pandas as pd
import sys
from IPython.display import clear_output

errorCount = 0
successCount = 0
counter = 0

news = pd.read_csv('economics2015.csv')
news.drop_duplicates(subset=['SOURCEURL'], keep='first', inplace=True)

In [None]:
t = len(news.values);

for index, item in news.iterrows():
    counter += 1
    if (counter % 10 == 0):
        clear_output(wait=True)
        
    try:        
        url = item['SOURCEURL']              
        print("Scrapping ", url, " -> ", str(counter)," of ", str(t)," -> ", round(counter/t * 100, 2), " %")        
        article = Article(url)
        article.download()
        article.parse()

        with open('2015/'+str(item['GLOBALEVENTID'])+'.txt', 'w') as outfile:              
            outfile.write(article.text)
            print('Scraping ',url,'Done..')            

        successCount += 1
    except:
        errorCount += 1        
        print("Error scrapping:", sys.exc_info())        
                   
    print('success:',successCount)
    print('error:',errorCount)

## Build Corpus

In [6]:
def get_sentiment(avgTone):
    s = float(avgTone)
    print(s)
    if (s >= 1):
        return 1
    elif (s <= -1):
        return -1
    else:
        return 0
  

In [7]:
import os

corpus = []
i = 0

for subdir, dirs, files in os.walk('./EconomicNews/2018/'):
    for file in files:
        filepath = file

        if filepath.endswith(".txt"):
            global_id = filepath[:-4]
            currentRow = news[news['GLOBALEVENTID'] == int(global_id)];
            
            if len(currentRow > 0):            
                with open("./EconomicNews/2018/"+filepath,"r", encoding='UTF-8') as f:
                    text = f.read()
                    
                    sentiment = get_sentiment(currentRow['AvgTone'])                      
                    corpus.append({'text': text ,'sentiment': sentiment, 'url': currentRow['SOURCEURL'], 'EventID': global_id, 'AvgTone': currentRow['AvgTone'].values[0]})
                    i += 1
                    print('Added :',i, '->', sentiment)

            if (i % 10 == 0):
                clear_output(wait=True)            
            

-2.18292402414654
Added : 1201 -> -1
1.4563106796116498
Added : 1202 -> 1
2.4723838066992103
Added : 1203 -> 1
1.6427104722792603
Added : 1204 -> 1
-6.66685134398922
Added : 1205 -> -1
-1.4785466327238999
Added : 1206 -> -1
0.39525691699605
Added : 1207 -> 0


In [8]:
df_corpus = pd.DataFrame(corpus)
df_corpus.to_pickle('corpus-economics.pkl')

In [9]:
df_corpus.groupby('sentiment').count()

Unnamed: 0_level_0,AvgTone,EventID,text,url
sentiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
-1,703,703,703,703
0,249,249,249,249
1,255,255,255,255


## Data Preparation

In [49]:
from sklearn.cross_validation import train_test_split
import numpy as np

df_corpus = pd.read_pickle('corpus-economics.pkl')

X = df_corpus.text
y = df_corpus.sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [50]:
aa = pd.DataFrame(np.c_[X_train, y_train])

aa.groupby(1).count()

Unnamed: 0_level_0,0
1,Unnamed: 1_level_1
-1,629
0,227
1,230


In [51]:
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import re

porter_stemmer = PorterStemmer()

def stemming_tokenizer(str_input):
    words = re.sub(r"[^a-z]|\b\w{0,3}\b", " ", str_input).lower().split()
#     words = [porter_stemmer.stem(word) for word in words]
    return words

def lemmatizer(str_input):
    words = re.sub(r"[^a-z]|\b\w{0,3}\b", " ", str_input).lower().split()
    words = [WordNetLemmatizer().lemmatize(word) for word in words]
    return words


In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from nltk import word_tokenize          
from sklearn.feature_extraction import text

my_stop_words = text.ENGLISH_STOP_WORDS.union(["jakarta","jokowi",'said','says','just',
                                               "widodo","joko",'singapore',
                                               'indonesia','indonesians','bali',
                                               'indonesian','thailand','china', 'asia', 'asian',
                                               'vietnam','australian','australia', 'york','s','java'
                                               'india','malaysia','hong'])
my_stop_words = my_stop_words.union(["january","february","march",'april','may','june','july','august','september','october','november','december'])

my_stop_words = my_stop_words.union(["sunday","monday","tuesday",'thursday','wednesday','friday',
                                     'days','week','weeks','day','date','time','world'])

# token_pattern='[a-z]+',
# tokenizer=stemming_tokenizer
vectorizer = TfidfVectorizer(analyzer=u'word', tokenizer=lemmatizer, stop_words=my_stop_words, min_df=0.05, max_df=0.9, lowercase=True)
# vectorizer = CountVectorizer(analyzer=u'word', tokenizer=lemmatizer, stop_words=my_stop_words, lowercase=True)
X = vectorizer.fit_transform(X_train)

X_train_dtm = vectorizer.transform(X_train)

In [53]:
import numpy as np

def display_scores(vectorizer, tfidf_result):
    # http://stackoverflow.com/questions/16078015/
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores:
        print("{0:50} Score: {1}".format(item[0], item[1]))

display_scores(vectorizer, X_train_dtm)

company                                            Score: 55.46607750974544
year                                               Score: 52.43951042609571
palm                                               Score: 43.24177396277267
government                                         Score: 42.59631893313317
market                                             Score: 42.57098061242344
cent                                               Score: 40.094842718448874
country                                            Score: 40.0053724774783
people                                             Score: 34.43274950598077
business                                           Score: 33.58309405027161
group                                              Score: 33.363640212542215
million                                            Score: 33.2194832080941
police                                             Score: 30.588623060545135
forest                                             Score: 30.580979599976715
state     

In [54]:
pd.DataFrame(X_train_dtm.toarray(), columns=vectorizer.get_feature_names()).sample(10)

Unnamed: 0,able,access,according,account,accused,achieve,action,active,activist,activity,...,woman,work,worked,worker,working,worst,worth,writing,year,zone
572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.092749,0.070844,0.0
525,0.0,0.107714,0.035062,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.062238,0.0,0.0,0.0,0.098382,0.0,0.0,0.0,0.045919,0.0
1032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.070372,0.0,0.078163,0.0
449,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.069868,0.0,0.0,0.0,0.0,0.0
1007,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
628,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.083333,0.0
113,0.0,0.0,0.058534,0.0,0.0,0.0,0.167107,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
872,0.035376,0.0,0.0,0.018309,0.0,0.0,0.0,0.0,0.0,0.0,...,0.021158,0.041215,0.04508,0.0,0.016723,0.0,0.0,0.0,0.156105,0.0
984,0.0,0.0,0.0,0.0,0.06714,0.0,0.0,0.0,0.0,0.0,...,0.064622,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
610,0.0,0.0,0.056049,0.043047,0.0,0.0,0.080007,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.039317,0.0,0.0,0.0,0.110107,0.0


# Classification Using Naive Bayes
----

## 1. Training

In [55]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [56]:
%time nb.fit(X_train_dtm, y_train)

CPU times: user 4.07 ms, sys: 128 µs, total: 4.2 ms
Wall time: 6.44 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## 2. Test

In [57]:
X_test_dtm = vectorizer.transform(X_test)

y_pred_class = nb.predict(X_test_dtm)

In [58]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7603305785123967

In [59]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[73,  0,  1],
       [15,  1,  6],
       [ 7,  0, 18]])

In [60]:
prediction = nb.predict(X_test_dtm)

test_df = pd.DataFrame(
    {'text': X_test,
     'sentiment': y_test,     
     'prediction': prediction,
    })

test_df['correct'] = test_df['sentiment'] == test_df['prediction']

In [61]:
test_df

Unnamed: 0,text,sentiment,prediction,correct
101,"BENSALEM, Pa.--(BUSINESS WIRE)--Law Offices of...",-1,-1,True
260,Dutch national pension fund APB is divesting 3...,-1,-1,True
1083,Indonesia expects to sign an initial deal on S...,-1,-1,True
109,JAKARTA (Reuters) - Indonesia's foreign minist...,-1,-1,True
649,"* Pence calls Indonesia Islam ""an inspiration""...",0,-1,False
736,Shutterstock/Netfalls Remy Musser For a billio...,1,1,True
332,When the earthquake and tsunami hit the city o...,-1,-1,True
49,When Southeast Asia was hit by the worst haze ...,-1,-1,True
461,"Under the Trump administration, the U.S. comme...",-1,-1,True
921,By Beh Lih Yi\n\nJAKARTA (Thomson Reuters Foun...,-1,-1,True


# Classification Using SVM
----

## 1. Training

In [62]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC(kernel='linear')
print("Training Classifier...")
%time clf.fit(X_train_dtm, y_train)

Training Classifier...
CPU times: user 920 ms, sys: 3.82 ms, total: 924 ms
Wall time: 924 ms


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## 2. Test

In [63]:
# prediction
prediction = clf.predict(X_test_dtm)

test_df = pd.DataFrame(
    {'text': X_test,
     'sentiment': y_test,     
     'prediction': prediction,
    })

test_df['correct'] = test_df['sentiment'] == test_df['prediction']

In [64]:
test_df

Unnamed: 0,text,sentiment,prediction,correct
101,"BENSALEM, Pa.--(BUSINESS WIRE)--Law Offices of...",-1,-1,True
260,Dutch national pension fund APB is divesting 3...,-1,-1,True
1083,Indonesia expects to sign an initial deal on S...,-1,-1,True
109,JAKARTA (Reuters) - Indonesia's foreign minist...,-1,-1,True
649,"* Pence calls Indonesia Islam ""an inspiration""...",0,-1,False
736,Shutterstock/Netfalls Remy Musser For a billio...,1,1,True
332,When the earthquake and tsunami hit the city o...,-1,-1,True
49,When Southeast Asia was hit by the worst haze ...,-1,-1,True
461,"Under the Trump administration, the U.S. comme...",-1,-1,True
921,By Beh Lih Yi\n\nJAKARTA (Thomson Reuters Foun...,-1,-1,True


In [65]:
accuracy = accuracy_score(y_test, prediction)

print('Accuracy:', accuracy)

Accuracy: 0.7851239669421488
