In [26]:
import pandas as pd
import numpy as np 
import seaborn as sns
sns.set(style='darkgrid', font_scale=1.4)
from pythainlp.tokenize import word_tokenize
import emoji
import re
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from visualize import top_feats_all, plot_top_feats
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score

In [27]:
df = pd.read_csv('sentiment_analysis.csv')
df = df.drop(df.columns[0], axis=1)
df

Unnamed: 0,comments,target
0,ดีค่ะ วิทยากรมีความรู้ดีมาก พูดเข้าใจ การสอนมี...,pos
1,ชอบมาก เหมาะสมหรับผู้ที่มีผู้ฐานเรื่องการตลาด ...,pos
2,คอร์สเนื้อหาแน่น และผู้สอนก็สอนได้ละเอียด มีขั...,pos
3,คอร์สนี้ช่วยให้หลักการและกรอบในการดำเนินงานด้า...,pos
4,สอนแบบมี Logic และ วิธีการที่ชัดเจน ชอบมากค่ะ,pos
...,...,...
941,Good,pos
942,อันนี้รีวิว หลังจากดูจบแล้วครับ\nเกรินก่อน ผมเ...,pos
943,อ.บอย สอนได้ละเอียดมากครับ\nแต่ถ้าเอาไปเขียนเอ...,pos
944,สอนได้ละเอียดกระชับและชัดเจน เข้าใจได้ง่ายครับ...,pos


In [28]:
def replace_url(text):
    URL_PATTERN = r"""(?i)\b((?:https?:(?:/{1,3}|[a-z0-9%])|[a-z0-9.\-]+[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)/)(?:[^\s()<>{}\[\]]+|\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\))+(?:\([^\s()]*?\([^\s()]+\)[^\s()]*?\)|\([^\s]+?\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’])|(?:(?<!@)[a-z0-9]+(?:[.\-][a-z0-9]+)*[.](?:com|net|org|edu|gov|mil|aero|asia|biz|cat|coop|info|int|jobs|mobi|museum|name|post|pro|tel|travel|xxx|ac|ad|ae|af|ag|ai|al|am|an|ao|aq|ar|as|at|au|aw|ax|az|ba|bb|bd|be|bf|bg|bh|bi|bj|bm|bn|bo|br|bs|bt|bv|bw|by|bz|ca|cc|cd|cf|cg|ch|ci|ck|cl|cm|cn|co|cr|cs|cu|cv|cx|cy|cz|dd|de|dj|dk|dm|do|dz|ec|ee|eg|eh|er|es|et|eu|fi|fj|fk|fm|fo|fr|ga|gb|gd|ge|gf|gg|gh|gi|gl|gm|gn|gp|gq|gr|gs|gt|gu|gw|gy|hk|hm|hn|hr|ht|hu|id|ie|il|im|in|io|iq|ir|is|it|je|jm|jo|jp|ke|kg|kh|ki|km|kn|kp|kr|kw|ky|kz|la|lb|lc|li|lk|lr|ls|lt|lu|lv|ly|ma|mc|md|me|mg|mh|mk|ml|mm|mn|mo|mp|mq|mr|ms|mt|mu|mv|mw|mx|my|mz|na|nc|ne|nf|ng|ni|nl|no|np|nr|nu|nz|om|pa|pe|pf|pg|ph|pk|pl|pm|pn|pr|ps|pt|pw|py|qa|re|ro|rs|ru|rw|sa|sb|sc|sd|se|sg|sh|si|sj|Ja|sk|sl|sm|sn|so|sr|ss|st|su|sv|sx|sy|sz|tc|td|tf|tg|th|tj|tk|tl|tm|tn|to|tp|tr|tt|tv|tw|tz|ua|ug|uk|us|uy|uz|va|vc|ve|vg|vi|vn|vu|wf|ws|ye|yt|yu|za|zm|zw)\b/?(?!@)))"""
    return re.sub(URL_PATTERN, 'xxurl', text)

def ungroup_emoji(toks):
    res = []
    for tok in toks:
        if emoji.emoji_count(tok) == len(tok):
            for char in tok:
                res.append(char)
        else:
            res.append(tok)
    return res

def process_text(text):
    res = text.lower().strip() 
    res = replace_url(text)
    res = [word for word in word_tokenize(text) if word and not re.search(pattern=r"\s+", string=word)]
    res = ungroup_emoji(res)
    return res


In [29]:
tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)
lr = LogisticRegression()

def lr_cv(splits, X, Y, pipeline, average_method):
    
    kfold = StratifiedKFold(n_splits=splits, shuffle=True, random_state=777)
    accuracy = []
    precision = []
    recall = []
    f1 = []
    for train, test in kfold.split(X, Y):
        lr_fit = pipeline.fit(X[train], Y[train])
        prediction = lr_fit.predict(X[test])
        scores = lr_fit.score(X[test],Y[test])
        
        accuracy.append(scores * 100)
        precision.append(precision_score(Y[test], prediction, average=average_method)*100)
        print('              negative  positive')
        print('precision:',precision_score(Y[test], prediction, average=None))
        recall.append(recall_score(Y[test], prediction, average=average_method)*100)
        print('recall:   ',recall_score(Y[test], prediction, average=None))
        f1.append(f1_score(Y[test], prediction, average=average_method)*100)
        print('f1 score: ',f1_score(Y[test], prediction, average=None))
        print('-'*50)
    
    print("accuracy: %.2f%% (+/- %.2f%%)" % (np.mean(accuracy), np.std(accuracy)))
    print("precision: %.2f%% (+/- %.2f%%)" % (np.mean(precision), np.std(precision)))
    print("recall: %.2f%% (+/- %.2f%%)" % (np.mean(recall), np.std(recall)))
    print("f1 score: %.2f%% (+/- %.2f%%)" % (np.mean(f1), np.std(f1)))

In [30]:
from sklearn.pipeline import Pipeline
original_pipeline = Pipeline([
    ('vectorizer', tfidf),
    ('classifier', lr)
])
lr_cv(5, df.comments, df.target, original_pipeline, 'macro')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              negative  positive
precision: [0.         0.91052632]
recall:    [0. 1.]
f1 score:  [0.         0.95316804]
--------------------------------------------------
              negative  positive
precision: [0.         0.91534392]
recall:    [0. 1.]
f1 score:  [0.        0.9558011]
--------------------------------------------------


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              negative  positive
precision: [0.         0.91534392]
recall:    [0. 1.]
f1 score:  [0.        0.9558011]
--------------------------------------------------
              negative  positive
precision: [0.         0.91534392]
recall:    [0. 1.]
f1 score:  [0.        0.9558011]
--------------------------------------------------
              negative  positive
precision: [0.         0.91534392]
recall:    [0. 1.]
f1 score:  [0.        0.9558011]
--------------------------------------------------
accuracy: 91.44% (+/- 0.19%)
precision: 45.72% (+/- 0.10%)
recall: 50.00% (+/- 0.00%)
f1 score: 47.76% (+/- 0.05%)


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
from imblearn.pipeline import make_pipeline
from imblearn.over_sampling import ADASYN, SMOTE, RandomOverSampler
ROS_pipeline = make_pipeline(tfidf, RandomOverSampler(random_state=777),lr)
SMOTE_pipeline = make_pipeline(tfidf, SMOTE(random_state=777),lr)

In [32]:
tfidf = TfidfVectorizer(tokenizer=process_text, ngram_range=(1,2), min_df=20, sublinear_tf=True)
tfidf_fit = tfidf.fit_transform(df['comments'])
ros = RandomOverSampler(random_state=777)
X_ROS, y_ROS = ros.fit_resample(tfidf_fit, df['target'])
pd.DataFrame(tfidf_fit.todense(), columns=tfidf.get_feature_names())



Unnamed: 0,data,กว่า,กับ,การ,การ ทำ,การทำงาน,การสอน,การใช้งาน,ก็,ก่อน,...,ได้,ได้ ความรู้,ได้ ง่าย,ได้ จริง,ได้ดี,ไป,ไม่,ไม่ มี,ไม่ ได้,ๆ
0,0.0,0.0,0.000000,0.078014,0.0,0.0,0.126959,0.0,0.000000,0.00000,...,0.113525,0.1241,0.127980,0.0,0.0,0.132088,0.158103,0.1241,0.0,0.077149
1,0.0,0.0,0.000000,0.137349,0.0,0.0,0.000000,0.0,0.000000,0.00000,...,0.000000,0.0000,0.000000,0.0,0.0,0.000000,0.132637,0.0000,0.0,0.135827
2,0.0,0.0,0.000000,0.000000,0.0,0.0,0.359476,0.0,0.290104,0.00000,...,0.153168,0.0000,0.000000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.000000
3,0.0,0.0,0.000000,0.219610,0.0,0.0,0.000000,0.0,0.000000,0.00000,...,0.152279,0.0000,0.000000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.000000
4,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.00000,...,0.000000,0.0000,0.000000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
941,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.00000,...,0.000000,0.0000,0.000000,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.000000
942,0.0,0.0,0.115293,0.089900,0.0,0.0,0.000000,0.0,0.199908,0.13242,...,0.062337,0.0000,0.000000,0.0,0.0,0.089900,0.000000,0.0000,0.0,0.000000
943,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.00000,...,0.136721,0.0000,0.000000,0.0,0.0,0.197172,0.000000,0.0000,0.0,0.000000
944,0.0,0.0,0.000000,0.000000,0.0,0.0,0.272205,0.0,0.000000,0.00000,...,0.196376,0.0000,0.274395,0.0,0.0,0.000000,0.000000,0.0000,0.0,0.000000


In [33]:
pd.DataFrame(X_ROS.todense(), columns=tfidf.get_feature_names())



Unnamed: 0,data,กว่า,กับ,การ,การ ทำ,การทำงาน,การสอน,การใช้งาน,ก็,ก่อน,...,ได้,ได้ ความรู้,ได้ ง่าย,ได้ จริง,ได้ดี,ไป,ไม่,ไม่ มี,ไม่ ได้,ๆ
0,0.0,0.0,0.000000,0.078014,0.0,0.0,0.126959,0.0,0.000000,0.0,...,0.113525,0.1241,0.12798,0.0,0.0,0.132088,0.158103,0.124100,0.0,0.077149
1,0.0,0.0,0.000000,0.137349,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0000,0.00000,0.0,0.0,0.000000,0.132637,0.000000,0.0,0.135827
2,0.0,0.0,0.000000,0.000000,0.0,0.0,0.359476,0.0,0.290104,0.0,...,0.153168,0.0000,0.00000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
3,0.0,0.0,0.000000,0.219610,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.152279,0.0000,0.00000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
4,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0000,0.00000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1725,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0000,0.00000,0.0,0.0,0.000000,0.344662,0.000000,0.0,0.000000
1726,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0000,0.00000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000
1727,0.0,0.0,0.156388,0.206468,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0000,0.00000,0.0,0.0,0.000000,0.117760,0.000000,0.0,0.120593
1728,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000,0.0,0.000000,0.0,...,0.000000,0.0000,0.00000,0.0,0.0,0.000000,0.208227,0.343006,0.0,0.000000


In [34]:
lr_cv(5, df.comments, df.target, ROS_pipeline, 'macro')

              negative  positive
precision: [0.275 0.96 ]
recall:    [0.64705882 0.83236994]
f1 score:  [0.38596491 0.89164087]
--------------------------------------------------
              negative  positive
precision: [0.25806452 0.94936709]
recall:    [0.5        0.86705202]
f1 score:  [0.34042553 0.90634441]
--------------------------------------------------
              negative  positive
precision: [0.36666667 0.96855346]
recall:    [0.6875     0.89017341]
f1 score:  [0.47826087 0.92771084]
--------------------------------------------------
              negative  positive
precision: [0.31428571 0.96753247]
recall:    [0.6875     0.86127168]
f1 score:  [0.43137255 0.91131498]
--------------------------------------------------
              negative  positive
precision: [0.22857143 0.94805195]
recall:    [0.5        0.84393064]
f1 score:  [0.31372549 0.89296636]
--------------------------------------------------
accuracy: 83.72% (+/- 2.16%)
precision: 62.36% (+/- 2.80%)
recall

In [35]:
lr_cv(5, df.comments, df.target, SMOTE_pipeline, 'macro')

              negative  positive
precision: [0.3        0.96666667]
recall:    [0.70588235 0.83815029]
f1 score:  [0.42105263 0.89783282]
--------------------------------------------------
              negative  positive
precision: [0.18604651 0.94520548]
recall:    [0.5        0.79768786]
f1 score:  [0.27118644 0.86520376]
--------------------------------------------------
              negative  positive
precision: [0.30769231 0.97333333]
recall:    [0.75       0.84393064]
f1 score:  [0.43636364 0.90402477]
--------------------------------------------------
              negative  positive
precision: [0.17073171 0.93918919]
recall:    [0.4375     0.80346821]
f1 score:  [0.24561404 0.86604361]
--------------------------------------------------
              negative  positive
precision: [0.1875     0.95035461]
recall:    [0.5625     0.77456647]
f1 score:  [0.28125    0.85350318]
--------------------------------------------------
accuracy: 79.28% (+/- 3.20%)
precision: 59.27% (+/- 3.6

In [36]:
from imblearn.under_sampling import NearMiss, RandomUnderSampler
RUS_pipeline = make_pipeline(tfidf, RandomUnderSampler(random_state=777),lr)
NM1_pipeline = make_pipeline(tfidf, NearMiss(),lr)


In [37]:
lr_cv(5, df.comments, df.target, RUS_pipeline, 'macro')

              negative  positive
precision: [0.23728814 0.97709924]
recall:    [0.82352941 0.73988439]
f1 score:  [0.36842105 0.84210526]
--------------------------------------------------
              negative  positive
precision: [0.17741935 0.96062992]
recall:    [0.6875     0.70520231]
f1 score:  [0.28205128 0.81333333]
--------------------------------------------------
              negative  positive
precision: [0.20689655 0.96946565]
recall:    [0.75       0.73410405]
f1 score:  [0.32432432 0.83552632]
--------------------------------------------------
              negative  positive
precision: [0.20895522 0.98360656]
recall:    [0.875      0.69364162]
f1 score:  [0.3373494  0.81355932]
--------------------------------------------------
              negative  positive
precision: [0.19607843 0.95652174]
recall:    [0.625      0.76300578]
f1 score:  [0.29850746 0.8488746 ]
--------------------------------------------------
accuracy: 72.94% (+/- 1.96%)
precision: 58.74% (+/- 1.3

In [38]:
lr_cv(5, df.comments, df.target, NM1_pipeline, 'macro')

              negative  positive
precision: [0.12087912 0.93939394]
recall:    [0.64705882 0.53757225]
f1 score:  [0.2037037  0.68382353]
--------------------------------------------------
              negative  positive
precision: [0.13392857 0.98701299]
recall:    [0.9375     0.43930636]
f1 score:  [0.234375 0.608   ]
--------------------------------------------------
              negative  positive
precision: [0.11926606 0.9625    ]
recall:    [0.8125     0.44508671]
f1 score:  [0.208      0.60869565]
--------------------------------------------------
              negative  positive
precision: [0.13043478 0.98648649]
recall:    [0.9375     0.42196532]
f1 score:  [0.22900763 0.59109312]
--------------------------------------------------
              negative  positive
precision: [0.11340206 0.94565217]
recall:    [0.6875     0.50289017]
f1 score:  [0.19469027 0.65660377]
--------------------------------------------------
accuracy: 49.78% (+/- 3.05%)
precision: 54.39% (+/- 1.34%)


In [53]:
result = {'method': ['origianl imbalanced data','RandomOverSampler','SMOTE','RandomUnderSampler','NaerMiss'],'accuracy': ['91.44% (+/- 0.19%)','83.72% (+/- 2.16%)','79.28% (+/- 3.20%)','72.94% (+/- 1.96%)','49.78% (+/- 3.05%)'],'precision':['45.72% (+/- 0.10%)','62.36% (+/- 2.80%)','59.27% (+/- 3.65%)','58.74% (+/- 1.37%)','54.39% (+/- 1.34%)'],'recall':['50.00% (+/- 0.00%)','73.17% (+/- 4.70%)','70.14% (+/- 7.00%)','73.97% (+/- 3.93%)','63.69% (+/- 4.07%)'],'f1 score':['47.76% (+/- 0.05%)','64.80% (+/- 3.54%)','60.42% (+/- 4.98%)','57.64% (+/- 1.83%)','42.18% (+/- 1.28%)']}
result = pd.DataFrame(result)
result.set_index('method')
#sns.heatmap(result, annot=True)

Unnamed: 0_level_0,accuracy,precision,recall,f1 score
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
origianl imbalanced data,91.44% (+/- 0.19%),45.72% (+/- 0.10%),50.00% (+/- 0.00%),47.76% (+/- 0.05%)
RandomOverSampler,83.72% (+/- 2.16%),62.36% (+/- 2.80%),73.17% (+/- 4.70%),64.80% (+/- 3.54%)
SMOTE,79.28% (+/- 3.20%),59.27% (+/- 3.65%),70.14% (+/- 7.00%),60.42% (+/- 4.98%)
RandomUnderSampler,72.94% (+/- 1.96%),58.74% (+/- 1.37%),73.97% (+/- 3.93%),57.64% (+/- 1.83%)
NaerMiss,49.78% (+/- 3.05%),54.39% (+/- 1.34%),63.69% (+/- 4.07%),42.18% (+/- 1.28%)
