# Import

In [1]:
import numpy as np
import pandas as pd
import re
import string
from langdetect import detect
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from textblob import Word

# Data

In [2]:
positive_en_dat = pd.read_fwf('bilanguage-subjectivity/positive-english', header=None)
negative_en_dat = pd.read_fwf('bilanguage-subjectivity/negative-english', header=None)
positive_my_dat = pd.read_fwf('bilanguage-subjectivity/subjectivity-positivity-bm.txt', header=None)
negative_my_dat = pd.read_fwf('bilanguage-subjectivity/subjectivity-negative-bm.txt', header=None)
print("Positive EN", positive_en_dat.shape)
print("Negative EN", negative_en_dat.shape)
print("Positive MY", positive_my_dat.shape)
print("Negative MY", negative_my_dat.shape)

Positive EN (5000, 7)
Negative EN (5000, 6)
Positive MY (4972, 1)
Negative MY (4990, 3)


# Remove NAN

In [3]:
positive_en, negative_en, positive_my, negative_my = pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
positive_en['sentence'] = positive_en_dat[0]
negative_en['sentence'] = negative_en_dat[0]
positive_my['sentence'] = positive_my_dat[0]
negative_my['sentence'] = negative_my_dat[0]

# Combine data(Subjective=1,Objective=0)

In [4]:
positive_en['subjective'] = [1 for i in range(len(positive_en))]
negative_en['subjective'] = [0 for i in range(len(negative_en))]
positive_my['subjective'] = [1 for i in range(len(positive_my))]
negative_my['subjective'] = [0 for i in range(len(negative_my))]

positive_en['lang'] = ["en" for i in range(len(positive_en))]
negative_en['lang'] = ["en" for i in range(len(negative_en))]
positive_my['lang'] = ["my" for i in range(len(positive_my))]
negative_my['lang'] = ["my" for i in range(len(negative_my))]

In [5]:
data = positive_en.append(negative_en, ignore_index=True).append(positive_my, ignore_index=True).append(negative_my, ignore_index=True)

# Clean

In [6]:
# Stemmer code block copied from https://github.com/huseinzol05/Malaya/blob/master/session/emotion/multinomial.ipynb
permulaan = [
    'bel',
    'se',
    'ter',
    'men',
    'meng',
    'mem',
    'memper',
    'di',
    'pe',
    'me',
    'ke',
    'ber',
    'pen',
    'per',
]

hujung = ['kan', 'kah', 'lah', 'tah', 'nya', 'an', 'wan', 'wati', 'ita']

def naive_stemmer(word):
    assert isinstance(word, str), 'input must be a string'
    hujung_result = re.findall(r'^(.*?)(%s)$' % ('|'.join(hujung)), word)
    word = hujung_result[0][0] if len(hujung_result) else word
    permulaan_result = re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan[::-1])), word)
    permulaan_result.extend(re.findall(r'^(.*?)(%s)' % ('|'.join(permulaan)), word))
    mula = permulaan_result if len(permulaan_result) else ''
    if len(mula):
        mula = mula[1][1] if len(mula[1][1]) > len(mula[0][1]) else mula[0][1]
    return word.replace(mula, '')

In [7]:
stop_en = set(stopwords.words('english'))
stop_my = set(stopwords.words('indonesian'))
porter_stemmer = PorterStemmer()

def clean(sentence):
    cleaned = ""
    # lower, remove punctuation, stop words, lemmatize
    if detect(sentence)=="en":
        tokenized = [word for word in word_tokenize(sentence.lower().translate(str.maketrans('','',string.punctuation))) if word not in stop_en]
        for word in tokenized:
            cleaned += porter_stemmer.stem(word) + " "
        return cleaned
    else:
        tokenized = [word for word in word_tokenize(sentence.lower().translate(str.maketrans('','',string.punctuation))) if word not in stop_my]
        for word in tokenized:
            cleaned += naive_stemmer(word) + " "
        return cleaned

%time data['sentence'] = data['sentence'].apply(clean)

Wall time: 1min 11s


# Model

In [8]:
tfidf = TfidfVectorizer(ngram_range=(1,3),min_df=2).fit(data['sentence'])
vectors = tfidf.transform(data['sentence'])
vectors.shape

(19962, 31016)

In [9]:
train_X, test_X, train_Y, test_Y = train_test_split(vectors, data['subjective'], test_size = 0.2)

In [10]:
multinomial  = MultinomialNB().fit(train_X, train_Y)
print(
    metrics.classification_report(
        train_Y,
        multinomial.predict(train_X),
        target_names = ['objective','subjective'],
    )
)

              precision    recall  f1-score   support

   objective       0.97      0.95      0.96      8012
  subjective       0.95      0.97      0.96      7957

   micro avg       0.96      0.96      0.96     15969
   macro avg       0.96      0.96      0.96     15969
weighted avg       0.96      0.96      0.96     15969



# Prediction

In [11]:
def predict(text):
    """
    Will predict subjectivity after cleaning the text
    Output:
    Text
    Subjective: True/False  Confidence: int
    """
    if text.strip(): # whitespace/no text
        clean_text = tfidf.transform([clean(text)])
        print(text)
        print("Subjective:", True if multinomial.predict(clean_text)[0] else False, "\tConfidence :", multinomial.predict_proba(clean_text)[0].max())
        print()
    else:
        print("Please enter text to predict subjectivity.")

In [12]:
# test prediction on sample rows
import random
for i in range(5):
    text=data['sentence'][random.randint(0,len(data))] # get sentence from random row
    predict(text)
predict("this is a movie about a man and his drive to succeed")
predict("filem itu menerangkan seorang penyanyi yang mengalami kehidupan yang susah")

essenti film weak detail strong person 
Subjective: True 	Confidence : 0.9031010185616164

endear hear madam refer husband jacki make excel compani least selfconsci perform 
Subjective: True 	Confidence : 0.7632514331607971

marri that suppos 
Subjective: False 	Confidence : 0.9123177462247096

mafiosi amino gino ttimo lindung kafe glasgow miliki pupu scottishitali 
Subjective: False 	Confidence : 0.8723574576331053

graciou eloqu film end offer ray hope refuge abl look ahead resist live past forev lost 
Subjective: False 	Confidence : 0.5186475692854761

this is a movie about a man and his drive to succeed
Subjective: False 	Confidence : 0.5172754947302303

filem itu menerangkan seorang penyanyi yang mengalami kehidupan yang susah
Subjective: False 	Confidence : 0.5052014884137602



In [13]:
import pickle
with open('multinomial-subjectivity.pkl','wb') as fopen:
    pickle.dump(multinomial,fopen)
with open('tfidf-multinomial-subjectivity.pkl','wb') as fopen:
    pickle.dump(tfidf,fopen)