# Sentiment Analysis
----
Latihan Sentimen Analysis dari data event GDELT. Data yang diambil adalah pemberitaan ekonomi terkait dengan Indonesia dari beberapa media bisnis dan ekonomi global periode Januari 2016-Agustus 2018 antara lain:
1. www.economics.com
2. www.businesstimes.com
3. www.econotimes.com
4. www.eco-business.com
5. www.aseaneconomist.com
6. www.businessinsider.com
7. www.jakartaglobe.id/economy
8. www.businesswire.com
9. www.wsj.com/news/economy
10. www.reuters.com/news/archive/economicNews
11. www.money.cnn.com/news/economy/
12. www.theguardian.com/business/economics
13. www.marketwatch.com/
14. www.nbcnews.com/business/economy
15. www.globalnews.ca/economy/
16. www.straitstimes.com/business/economy
17. www.bloombergquint.com/global-economics 

Jumlah artikel berita yang digunakan sebagai sumber data kurang lebih sejumlah 1200 berita dan seluruhnya berbahasa Inggris.

## Inisiasi Library

In [2]:
import pandas as pd
import sys
from IPython.display import clear_output
import os

## Build Corpus

In [None]:
# bisa langsung diskip jika sudah punya file corpusnya (corpus-economics.pkl)
def get_sentiment(avgTone):
    s = float(avgTone)
    print(s)
    if (s >= 1):
        return 1
    elif (s <= -1):
        return -1
    else:
        return 0

    

In [None]:
# bisa langsung diskip jika sudah punya file corpusnya
news = pd.read_csv('economics2016-2018.csv')
corpus = []
i = 0


for subdir, dirs, files in os.walk('./news/'):
    for file in files:
        filepath = file

        if filepath.endswith(".txt"):
            global_id = filepath[:-4]
            currentRow = news[news['GLOBALEVENTID'] == int(global_id)];
            
            if len(currentRow > 0):            
                with open("./news/"+filepath,"r", encoding='UTF-8') as f:
                    text = f.read()
                    
                    sentiment = get_sentiment(currentRow['AvgTone'])                      
                    corpus.append({'text': text ,'sentiment': sentiment, 'url': currentRow['SOURCEURL'].values[0], 'EventID': global_id, 'AvgTone': currentRow['AvgTone'].values[0]})
                    i += 1
                    print('Added :',i, '->', sentiment)

            if (i % 10 == 0):
                clear_output(wait=True)            



In [None]:
df_corpus = pd.DataFrame(corpus)
df_corpus.to_pickle('corpus/economics.corpus')

In [None]:
df_corpus.groupby('sentiment').count()

## Data Preparation

In [113]:
from sklearn.model_selection import train_test_split
import numpy as np

titles = pd.read_csv('titles.csv')
df_corpus = pd.read_pickle('corpus/economics.corpus')

titles['EventID']=titles['EventID'].apply(int)
df_corpus['EventID']=df_corpus['EventID'].apply(int)
# df_corpus = df_corpus.sort_values('AvgTone', ascending=True).groupby('sentiment').head(250)

X = df_corpus.text
y = df_corpus.sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state= 42)

df_corpus = pd.merge(df_corpus, titles, on='EventID')


In [114]:
y_train.value_counts()

-1    545
 1    210
 0    210
Name: sentiment, dtype: int64

In [115]:
#tokenisasi, stemming dsb

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
from nltk.stem.snowball import SnowballStemmer
import nltk
from sklearn.metrics.pairwise import cosine_similarity

import re

porter_stemmer = PorterStemmer()
stemmer = SnowballStemmer("english")

def custom_tokenizer(str_input):
    words = re.sub(r"[^a-z]|\b\w{0,3}\b", " ", str_input).lower().split()
    words = [WordNetLemmatizer().lemmatize(word) for word in words]
    words = [stemmer.stem(word) for word in words]    
    return words
   
def get_similarity_matrix(content_as_str):
    tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000, min_df=0.2,
                                       stop_words='english',use_idf=True,
                                       tokenizer=tokenize_and_stem, ngram_range=(1,3))
    tfidf_matrix = tfidf_vectorizer.fit_transform(content_as_str) #fit the vectorizer to synopses
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return (similarity_matrix, tfidf_matrix)

def display_scores(vectorizer, tfidf_result):    
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores:
        print("{0:50} Score: {1}".format(item[0], item[1]))

In [145]:
# ekstraksi fitur

from sklearn.feature_extraction.text import TfidfVectorizer  
from sklearn.feature_extraction.text import CountVectorizer  
from sklearn.feature_extraction import text

my_stop_words = text.ENGLISH_STOP_WORDS.union(["january","february","march",'april','may','june','july','august','september','october','november','december'])
my_stop_words = my_stop_words.union(["jakarta","jokowi",'said','says','just','mr',
                                               "widodo","joko",'singapore','www','http','https',
                                               'indonesia','indonesians','bali','rupiah',
                                               'indonesian','thailand','china', 'asia', 'asian',
                                               'vietnam','australian','australia', 'sentifi', 'york','s','java','chinese',
                                               'india','malaysia','hong','used'])

my_stop_words = my_stop_words.union(["sunday","monday","tuesday",'thursday','wednesday','friday','year','years'
                                     'days','week','weeks','day','date','time'])

# 
vectorizer = CountVectorizer(analyzer=u'word', tokenizer=custom_tokenizer, ngram_range=(1,4), stop_words=my_stop_words, min_df=0.05, max_df=0.9, lowercase=True)
X = vectorizer.fit_transform(X_train)

X_train_dtm = vectorizer.transform(X_train)

X_dtm = vectorizer.transform(df_corpus.text)

  'stop_words.' % sorted(inconsistent))


In [136]:
df_corpus['matrix'] = X_dtm

In [119]:
# menampilkan skor kata, bisa diskip
# len(vectorizer.get_feature_names())
# display_scores(vectorizer, X_train_dtm)

from nltk.tokenize import word_tokenize, RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
stop_words = text.ENGLISH_STOP_WORDS.union('s')

def process_text(headlines):
    tokens = []
    for line in headlines:
        toks = tokenizer.tokenize(line)
        toks = [t.lower() for t in toks if t.lower() not in stop_words]
        tokens.extend(toks)
    
    return tokens


pos_lines = list(df_corpus[df_corpus.sentiment >= -1].text)

pos_tokens = process_text(pos_lines)
pos_freq = nltk.FreqDist(pos_tokens)

# pos_freq.most_common(400)


In [137]:
# menampilkan data transformation matrix, bisa diskip

pd.DataFrame(X_train_dtm.toarray(), columns=vectorizer.get_feature_names()).sample(10)

Unnamed: 0,abl,abov,accept,access,accord,account,accus,achiev,action,activ,...,work,worker,world,world bank,world largest,worri,worst,worth,write,zone
848,2,0,0,0,0,0,0,1,0,0,...,1,0,8,0,0,0,0,0,0,0
959,0,0,0,0,0,0,0,0,0,2,...,0,0,0,0,0,0,0,0,0,1
51,0,3,0,0,0,0,0,0,2,3,...,0,0,5,0,0,0,0,0,0,0
25,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
611,1,0,0,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
511,0,0,0,0,2,0,0,0,0,0,...,0,0,2,0,0,0,0,0,1,0
878,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
383,0,0,0,0,0,0,0,0,0,0,...,0,0,2,0,0,0,0,0,0,0
333,0,0,1,0,1,0,0,0,0,0,...,0,0,4,0,0,0,0,0,0,0
96,1,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,1,0,0,0,0


# Classification Using Naive Bayes
----

## 1. Training

In [138]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
nb = MultinomialNB(alpha=0.2)

In [139]:
%time nb.fit(X_train_dtm, y_train)

CPU times: user 1.31 ms, sys: 3.99 ms, total: 5.29 ms
Wall time: 5.92 ms


MultinomialNB(alpha=0.2, class_prior=None, fit_prior=True)

## 2. Test

In [140]:
X_test_dtm = vectorizer.transform(X_test)

y_pred_class = nb.predict(X_test_dtm)


In [141]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.6431535269709544

In [142]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[111,  43,   3],
       [ 12,  14,  15],
       [  1,  12,  30]])

In [143]:
prediction = nb.predict(X_test_dtm)

test_df = pd.DataFrame(
    {'url': df_corpus.iloc[X_test.index,:]['url'],
     'title': df_corpus.iloc[X_test.index,:]['Title'],
     'sentiment': y_test,          
     'prediction': prediction,
    })

test_df['correct'] = test_df['sentiment'] == test_df['prediction']

In [89]:
test_df.to_csv('test_result.csv')

In [144]:
from sklearn.model_selection import cross_validate

X_dtm = vectorizer.transform(df_corpus.text)

cv_results = cross_validate(nb, X_dtm, df_corpus.sentiment, cv=5,
                            return_train_score=False)
sorted(cv_results.keys())                         

print(cv_results)
print('\n')
np.mean(cv_results['test_score'])

{'fit_time': array([0.00345254, 0.0034411 , 0.00355339, 0.00376844, 0.00354004]), 'score_time': array([0.00051332, 0.00053573, 0.00055456, 0.00055528, 0.0005126 ]), 'test_score': array([0.67355372, 0.73029046, 0.6846473 , 0.6473029 , 0.62916667])}




0.6729922099150691

In [128]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=False, random_state=42)
kf.get_n_splits(df_corpus)
accuracy = []
for train_index, test_index in kf.split(df_corpus):   
    X_train, X_test = df_corpus.text[train_index], df_corpus.text[test_index]
    y_train, y_test = df_corpus.sentiment[train_index], df_corpus.sentiment[test_index]
    
    
    X = vectorizer.fit_transform(X_train)
    X_train_dtm = vectorizer.transform(X_train)
    
    nb.fit(X_train_dtm, y_train)    
    X_test_dtm = vectorizer.transform(X_test)   
    y_pred_class = nb.predict(X_test_dtm)
    score = metrics.accuracy_score(y_test, y_pred_class)
    
    accuracy.append(score)

print(accuracy)
print(np.mean(accuracy))

[0.7012448132780082, 0.6887966804979253, 0.7261410788381742, 0.7136929460580913, 0.7510373443983402]
0.7161825726141079


# Classification Using SVM
----

## 1. Training

In [129]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC(kernel='linear')
print("Training Classifier...")
%time clf.fit(X_train_dtm, y_train)

Training Classifier...
CPU times: user 990 ms, sys: 20 ms, total: 1.01 s
Wall time: 808 ms


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

## 2. Test

In [130]:
# prediction
prediction = clf.predict(X_test_dtm)

test_df = pd.DataFrame(
    {'url': df_corpus.iloc[X_test.index,:]['url'],
     'title': df_corpus.iloc[X_test.index,:]['Title'],
     'sentiment': y_test,          
     'prediction': prediction,
    })

test_df['correct'] = test_df['sentiment'] == test_df['prediction']

In [131]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7510373443983402

In [132]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[142,  13,   2],
       [ 17,  12,  12],
       [  8,   8,  27]])

In [107]:
test_df

Unnamed: 0,url,title,sentiment,prediction,correct
101,2026 http://www.businesswire.com/news/home/...,INVESTOR ALERT: Investigation of Freeport-McMo...,-1,-1,True
260,443 http://www.businessinsider.com/r-chevro...,Chevron renewal of Rokan block not assured: In...,-1,-1,True
1083,1900 http://www.eco-business.com/news/deliv...,Delivering Singapore’s smart energy dream,-1,0,False
109,1880 http://www.businessinsider.com/r-indon...,Indonesia says has no overlapping South China ...,-1,-1,True
649,1493 http://www.econotimes.com/Jakarta-gove...,Jakarta governor election results in a victory...,0,-1,False
736,403 http://www.businesstimes.com.sg/governm...,Indonesia jails former parliament speaker for ...,1,1,True
332,1058 http://www.econotimes.com/Reviewing-In...,Reviewing Indonesia's tsunami early warning st...,-1,-1,True
49,304 http://www.eco-business.com/news/have-w...,Have we overlooked the human side of palm oil ...,-1,-1,True
461,1291 http://www.businessinsider.com/r-updat...,UPDATE 1-Freeport Indonesia reopens mine acces...,-1,-1,True
921,1762 http://www.businesstimes.com.sg/govern...,Indonesia gives tax amnesty participants more ...,-1,-1,True


In [108]:
from sklearn.model_selection import cross_validate

X_dtm = vectorizer.transform(df_corpus.text)

cv_results = cross_validate(clf, X_dtm, df_corpus.sentiment, cv=5,
                            return_train_score=True)
sorted(cv_results.keys())                         

cv_results

{'fit_time': array([0.77342415, 0.75745606, 0.79421258, 0.81148314, 0.78993702]),
 'score_time': array([0.18846345, 0.18435788, 0.19420815, 0.15530753, 0.15404296]),
 'test_score': array([0.73140496, 0.7219917 , 0.7593361 , 0.73443983, 0.73333333]),
 'train_score': array([0.91796469, 0.906639  , 0.91390041, 0.92012448, 0.91606218])}

In [109]:
np.mean(cv_results['test_score'])

0.7361011853731583

In [62]:
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, shuffle=True, random_state=42)
kf.get_n_splits(df_corpus)
accuracy = []
for train_index, test_index in kf.split(df_corpus):   
    X_train, X_test = df_corpus.text[train_index], df_corpus.text[test_index]
    y_train, y_test = df_corpus.sentiment[train_index], df_corpus.sentiment[test_index]
    
    
    X = vectorizer.fit_transform(X_train)
    X_train_dtm = vectorizer.transform(X_train)
    
    clf.fit(X_train_dtm, y_train)    
    X_test_dtm = vectorizer.transform(X_test)   
    y_pred_class = clf.predict(X_test_dtm)
    score = metrics.accuracy_score(y_test, y_pred_class)
    
    accuracy.append(score)

print(accuracy)
print(np.mean(accuracy))

[0.7427385892116183, 0.7676348547717843, 0.7302904564315352, 0.7676348547717843, 0.7344398340248963]
0.7485477178423237
