<a href="https://colab.research.google.com/github/Ninlawat-Ph/sentiment-analysis/blob/master/modelling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install googletrans
!pip install tqdm --upgrade
!pip install twython

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.45.0)


## Import Modules

In [0]:
# web scraping
import requests
from bs4 import BeautifulSoup
from time import time, sleep
from random import randint

# Translation
from googletrans import Translator

# Utilities
from tqdm import tqdm
import numpy as np
import pandas as pd

# NLP
from nltk import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import mark_negation

In [0]:
# restore data
data1= pd.read_csv("data_lanna_en.csv")
data2= pd.read_csv("data_siriraj_piyamaharajkarun_en.csv")
data3= pd.read_csv("data_vajira_hospital_en.csv")

## Sentence-level Extraction

In [0]:
import nltk
nltk.download('vader_lexicon')

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [0]:
def sent_level_polarity(df, target):
  sentences = []
  scores = []
  for i in df.index:
    en = df.loc[i, target]
    score = df.loc[i, "score"]
    s = sent_tokenize(en)
    sentences = sentences + s
    scores = scores + [score]*len(s)
  return pd.DataFrame({"sentences": sentences, "scores": scores})

In [0]:
def predict_sentiment(sentence):
  sa = SentimentIntensityAnalyzer()
  sentiment_distribution = sa.polarity_scores(sentence)
  score = sentiment_distribution.get("compound")
  
  if score >= 0.05:
    return "positive"
  elif score <= -0.05:
    return "negative"
  else:
    return "neutral"

In [0]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
data1_temp = sent_level_polarity(df=data1, target="en")
data1_temp["sentiment_polarity"] = data1_temp["sentences"].apply(lambda x: predict_sentiment(x))

In [0]:
data2_temp = sent_level_polarity(df=data2, target="en")
data2_temp["sentiment_polarity"] = data2_temp["sentences"].apply(lambda x: predict_sentiment(x))

In [0]:
data3_temp = sent_level_polarity(df=data3, target="en")
data3_temp["sentiment_polarity"] = data3_temp["sentences"].apply(lambda x: predict_sentiment(x))

## Exclude Neutral

In [0]:
data1_temp = data1_temp[data1_temp["sentiment_polarity"] != "neutral"].reset_index(drop=True)
data2_temp = data2_temp[data2_temp["sentiment_polarity"] != "neutral"].reset_index(drop=True)
data3_temp = data3_temp[data3_temp["sentiment_polarity"] != "neutral"].reset_index(drop=True)

## Label encoding the data

In [0]:
# label encoding the data 
from sklearn.preprocessing import LabelEncoder

In [0]:
le = LabelEncoder()

In [0]:
#encode it is 1 for positive and 0 for negative.
data1_temp['label']=le.fit_transform(data1_temp['sentiment_polarity'])
data2_temp['label']=le.fit_transform(data2_temp['sentiment_polarity'])
data3_temp['label']=le.fit_transform(data3_temp['sentiment_polarity'])

## Concat dataframe each hospital 

In [0]:
frames = [data1_temp, data2_temp, data3_temp]

In [0]:
data = pd.concat(frames)

In [0]:
data.head()

Unnamed: 0,sentences,scores,sentiment_polarity,label
0,You take very good care of check points too.,5,positive,1
1,It feels very good to work with nursing regula...,5,positive,1
2,A look tidy If relatives Or have a friend at w...,5,positive,1
3,Excellent patient care The singers all the time,5,positive,1
4,Doctors and nurses cared very good.,4,positive,1


In [0]:
data.shape

(963, 4)

## Text Preprocessing

In [0]:
import nltk
from nltk.corpus import stopwords

nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

###  Stopword removal 

In [0]:
sw = stopwords.words("english")
sw[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

## Create the new column of tokens

### tokenization

In [0]:
from nltk import sent_tokenize, word_tokenize
data["tokens"] = data["sentences"].apply(lambda x: 
                                        sum([word_tokenize(sentence) 
                                        for sentence in 
                                        sent_tokenize(x.lower())], []))

In [0]:
data["tokens"] = data["tokens"].apply(lambda x: list(set(x) - set(sw)))

In [0]:
data.head()

Unnamed: 0,sentences,scores,sentiment_polarity,label,tokens
0,You take very good care of check points too.,5,positive,1,"[points, ., care, good, check, take]"
1,It feels very good to work with nursing regula...,5,positive,1,"[nursing, regulation, ., work, good, feels]"
2,A look tidy If relatives Or have a friend at w...,5,positive,1,"[tidy, friend, ., work, look, would, hospital,..."
3,Excellent patient care The singers all the time,5,positive,1,"[singers, patient, care, excellent, time]"
4,Doctors and nurses cared very good.,4,positive,1,"[doctors, ., nurses, good, cared]"


## Vectorization

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation

###  TFIDF vectorization and LDA vectorization

In [0]:
data["prep_sentence"] = data["tokens"].apply(lambda x: " ".join(x))
corpus = data["prep_sentence"].tolist()
vectorizer = TfidfVectorizer()
X_tfidf = vectorizer.fit_transform(corpus)

In [0]:
lda = LatentDirichletAllocation(n_components=30)
X = lda.fit_transform(X_tfidf)

In [0]:
data.head()

Unnamed: 0,sentences,scores,sentiment_polarity,label,tokens,prep_sentence
0,You take very good care of check points too.,5,positive,1,"[points, ., care, good, check, take]",points . care good check take
1,It feels very good to work with nursing regula...,5,positive,1,"[nursing, regulation, ., work, good, feels]",nursing regulation . work good feels
2,A look tidy If relatives Or have a friend at w...,5,positive,1,"[tidy, friend, ., work, look, would, hospital,...",tidy friend . work look would hospital recomme...
3,Excellent patient care The singers all the time,5,positive,1,"[singers, patient, care, excellent, time]",singers patient care excellent time
4,Doctors and nurses cared very good.,4,positive,1,"[doctors, ., nurses, good, cared]",doctors . nurses good cared


In [0]:
data.to_csv("hospital_prediction.csv", index=False)

# Modeling 

### Import model

In [0]:
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate
from sklearn.metrics import classification_report, accuracy_score

In [0]:
y = data.label

In [0]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1, test_size= 0.2)

### Random Forest classifier

In [0]:
#Random Forest classifier
classifier = RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 0)
cv = cross_validate(classifier, X_train, y_train, cv=10)
print(cv['test_score'])
print(cv['test_score'].mean())


[0.76623377 0.75324675 0.75324675 0.74025974 0.75324675 0.77922078
 0.79220779 0.74025974 0.79220779 0.71428571]
0.7584415584415585


Random forests เป็นวิธีที่แม่นยำและมีประสิทธิภาพ ไม่ได้รับผลกระทบจากปัญหา overfitting  ไม่มี bias สามารถจัดการกับปัญหา missing value ได้

### XGBoost

In [0]:
# XGBoost
import xgboost as xgb
xgb=xgb.XGBClassifier()
cv = cross_validate(xgb, X_train, y_train, cv=10)
print(cv['test_score'])
print(cv['test_score'].mean())

[0.77922078 0.80519481 0.75324675 0.76623377 0.76623377 0.81818182
 0.77922078 0.79220779 0.77922078 0.76623377]
0.7805194805194805


XGBoost ย่อมาจาก Extreme Gradient Boosting ฐานมาจาก Gradient Boosting Machines  ใช้เทคนิค regularization เพื่อลดการ overfitting มีความเร็วในการประมวลผลที่สูง วิธีการคือ เอา Decision Tree มา train ต่อๆกันหลายๆ tree โดยที่แต่ละ decision tree จะเรียนรู้จาก error ของ tree ก่อนหน้า ทำให้ความแม่นยำของในการทำ prediction จะ แม่นยำขึ้น

### Naive Bayes Model

In [0]:
#Accuracy using Naive Bayes Model
NB = MultinomialNB()
cv = cross_validate(NB, X_train, y_train, cv=10)
print(cv['test_score'])
print(cv['test_score'].mean())

[0.80519481 0.80519481 0.80519481 0.79220779 0.79220779 0.79220779
 0.79220779 0.79220779 0.79220779 0.79220779]
0.7961038961038961


Naive Bayes หลักการของวิธีการนี้จะใช้การคำนวณความน่าจะเป็นเหมาะกับ dataset จำนวณมากและมี feature ที่ไม่ขึ้นต่อกันมีการจัดจำแนกประเภทอย่างง่ายประยุกต์ใช้กับจัดจำแนกประเภทข้อความ(text classification) model ไม่ซับซ้อน  

# Pick XGboost model

In [0]:
# XGBoost
import xgboost as xgb
xgb=xgb.XGBClassifier()
xgb.fit(X_train,y_train)
preds2=xgb.predict(X_test)
print(classification_report(preds2,y_test))

              precision    recall  f1-score   support

           0       0.04      0.40      0.08         5
           1       0.98      0.76      0.85       188

    accuracy                           0.75       193
   macro avg       0.51      0.58      0.46       193
weighted avg       0.96      0.75      0.83       193



In [0]:
print('Confusion Matrix: ',metrics.confusion_matrix(y_test,preds2), sep = '\n')

Confusion Matrix: 
[[  2  46]
 [  3 142]]
