# Sentiment Analysis
----
Latihan Sentimen Analysis dari data event GDELT. Data yang diambil adalah pemberitaan ekonomi terkait dengan Indonesia dari beberapa media bisnis dan ekonomi global periode Januari 2016-Agustus 2018 antara lain:
1. www.economics.com
2. www.businesstimes.com
3. www.econotimes.com
4. www.eco-business.com
5. www.aseaneconomist.com
6. www.businessinsider.com
7. www.jakartaglobe.id/economy
8. www.businesswire.com
9. www.wsj.com/news/economy
10. www.reuters.com/news/archive/economicNews
11. www.money.cnn.com/news/economy/
12. www.theguardian.com/business/economics
13. www.marketwatch.com/economy-politics
14. www.nbcnews.com/business/economy
15. www.globalnews.ca/economy/
16. www.straitstimes.com/business/economy
17. www.bloombergquint.com/global-economics 

Jumlah artikel berita yang digunakan sebagai sumber data kurang lebih sejumlah 1200 berita dan seluruhnya berbahasa Inggris.

## Inisiasi Library

In [None]:
import pandas as pd
import sys
from IPython.display import clear_output
import os

## Build Corpus

In [None]:
# bisa langsung diskip jika sudah punya file corpusnya (corpus-economics.pkl)
def get_sentiment(avgTone):
    s = float(avgTone)
    print(s)
    if (s >= 1):
        return 1
    elif (s <= -1):
        return -1
    else:
        return 0
  

In [None]:
# bisa langsung diskip jika sudah punya file corpusnya
news = pd.read_csv('economics2015-2018.csv')
corpus = []
i = 0

for subdir, dirs, files in os.walk('./news/'):
    for file in files:
        filepath = file

        if filepath.endswith(".txt"):
            global_id = filepath[:-4]
            currentRow = news[news['GLOBALEVENTID'] == int(global_id)];
            
            if len(currentRow > 0):            
                with open("./news/"+filepath,"r", encoding='UTF-8') as f:
                    text = f.read()
                    
                    sentiment = get_sentiment(currentRow['AvgTone'])                      
                    corpus.append({'text': text ,'sentiment': sentiment, 'url': currentRow['SOURCEURL'], 'EventID': global_id, 'AvgTone': currentRow['AvgTone'].values[0]})
                    i += 1
                    print('Added :',i, '->', sentiment)

            if (i % 10 == 0):
                clear_output(wait=True)            
            

In [None]:
df_corpus = pd.DataFrame(corpus)
df_corpus.to_pickle('economics.corpus')

In [None]:
df_corpus.groupby('sentiment').count()

## Data Preparation

In [2]:
from sklearn.cross_validation import train_test_split
import numpy as np

df_corpus = pd.read_pickle('corpus/economics.corpus')

X = df_corpus.text
y = df_corpus.sentiment

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)





In [3]:
y_train.value_counts()

-1    545
 1    210
 0    210
Name: sentiment, dtype: int64

In [4]:
#tokenisasi, stemming dsb

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer 
import re

porter_stemmer = PorterStemmer()

def custom_tokenizer(str_input):
    words = re.sub(r"[^a-z]|\b\w{0,3}\b", " ", str_input).lower().split()
    words = [WordNetLemmatizer().lemmatize(word) for word in words]
    words = [porter_stemmer.stem(word) for word in words]    
    return words

# def lemmatizer(str_input):
#     words = re.sub(r"[^a-z]|\b\w{0,3}\b", " ", str_input).lower().split()
#     words = [WordNetLemmatizer().lemmatize(word) for word in words]
#     return words


In [5]:
# ekstraksi fitur

from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize          
from sklearn.feature_extraction import text

my_stop_words = text.ENGLISH_STOP_WORDS.union(["jakarta","jokowi",'said','says','just',
                                               "widodo","joko",'singapore',
                                               'indonesia','indonesians','bali',
                                               'indonesian','thailand','china', 'asia', 'asian',
                                               'vietnam','australian','australia', 'york','s','java','chinese',
                                               'india','malaysia','hong'])
my_stop_words = my_stop_words.union(["january","february","march",'april','may','june','july','august','september','october','november','december'])

my_stop_words = my_stop_words.union(["sunday","monday","tuesday",'thursday','wednesday','friday','year','years'
                                     'days','week','weeks','day','date','time'])


vectorizer = TfidfVectorizer(analyzer=u'word', tokenizer=custom_tokenizer, stop_words=my_stop_words, min_df=0.05, max_df=0.8, lowercase=True)
X = vectorizer.fit_transform(X_train)

X_train_dtm = vectorizer.transform(X_train)

In [None]:
# menampilkan skor kata, bisa diskip

def display_scores(vectorizer, tfidf_result):    
    scores = zip(vectorizer.get_feature_names(),
                 np.asarray(tfidf_result.sum(axis=0)).ravel())
    sorted_scores = sorted(scores, key=lambda x: x[1], reverse=True)
    for item in sorted_scores:
        print("{0:50} Score: {1}".format(item[0], item[1]))

display_scores(vectorizer, X_train_dtm)

In [None]:
# menampilkan data transformation matrix, bisa diskip

pd.DataFrame(X_train_dtm.toarray(), columns=vectorizer.get_feature_names()).sample(10)

# Classification Using Naive Bayes
----

## 1. Training

In [6]:
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()

In [7]:
%time nb.fit(X_train_dtm, y_train)

CPU times: user 3.65 ms, sys: 40 µs, total: 3.69 ms
Wall time: 3.87 ms


MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

## 2. Test

In [8]:
X_test_dtm = vectorizer.transform(X_test)

y_pred_class = nb.predict(X_test_dtm)

In [9]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.8016528925619835

In [10]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[155,   2,   1],
       [ 24,   7,   8],
       [ 10,   3,  32]])

In [11]:
prediction = nb.predict(X_test_dtm)

test_df = pd.DataFrame(
    {'text': X_test,
     'sentiment': y_test,     
     'prediction': prediction,
    })

test_df['correct'] = test_df['sentiment'] == test_df['prediction']

In [12]:
test_df

Unnamed: 0,text,sentiment,prediction,correct
101,"BENSALEM, Pa.--(BUSINESS WIRE)--Law Offices of...",-1,-1,True
260,Dutch national pension fund APB is divesting 3...,-1,-1,True
1083,Indonesia expects to sign an initial deal on S...,-1,-1,True
109,JAKARTA (Reuters) - Indonesia's foreign minist...,-1,-1,True
649,"* Pence calls Indonesia Islam ""an inspiration""...",0,-1,False
736,Shutterstock/Netfalls Remy Musser For a billio...,1,1,True
332,When the earthquake and tsunami hit the city o...,-1,-1,True
49,When Southeast Asia was hit by the worst haze ...,-1,-1,True
461,"Under the Trump administration, the U.S. comme...",-1,-1,True
921,By Beh Lih Yi\n\nJAKARTA (Thomson Reuters Foun...,-1,-1,True


# Classification Using SVM
----

## 1. Training

In [13]:
from sklearn import svm
from sklearn.metrics import accuracy_score

clf = svm.SVC(kernel='linear')
print("Training Classifier...")
%time clf.fit(X_train_dtm, y_train)

Training Classifier...
CPU times: user 1.02 s, sys: 0 ns, total: 1.02 s
Wall time: 1.02 s


SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

## 2. Test

In [14]:
# prediction
prediction = clf.predict(X_test_dtm)

test_df = pd.DataFrame(
    {'text': X_test,
     'sentiment': y_test,     
     'prediction': prediction,
    })

test_df['correct'] = test_df['sentiment'] == test_df['prediction']

y_pred_class = clf.predict(X_test_dtm)

In [15]:
from sklearn import metrics
metrics.accuracy_score(y_test, y_pred_class)

0.7644628099173554

In [16]:
metrics.confusion_matrix(y_test, y_pred_class)

array([[143,  13,   2],
       [ 19,  14,   6],
       [  8,   9,  28]])

In [17]:
test_df

Unnamed: 0,text,sentiment,prediction,correct
101,"BENSALEM, Pa.--(BUSINESS WIRE)--Law Offices of...",-1,-1,True
260,Dutch national pension fund APB is divesting 3...,-1,-1,True
1083,Indonesia expects to sign an initial deal on S...,-1,1,False
109,JAKARTA (Reuters) - Indonesia's foreign minist...,-1,-1,True
649,"* Pence calls Indonesia Islam ""an inspiration""...",0,-1,False
736,Shutterstock/Netfalls Remy Musser For a billio...,1,1,True
332,When the earthquake and tsunami hit the city o...,-1,-1,True
49,When Southeast Asia was hit by the worst haze ...,-1,-1,True
461,"Under the Trump administration, the U.S. comme...",-1,-1,True
921,By Beh Lih Yi\n\nJAKARTA (Thomson Reuters Foun...,-1,-1,True
