<a href="https://colab.research.google.com/github/Ninlawat-Ph/sentiment-analysis/blob/master/Get_data_lanna_hospital.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install googletrans
!pip install tqdm --upgrade
!pip install twython

Requirement already up-to-date: tqdm in /usr/local/lib/python3.6/dist-packages (4.45.0)


## Import Modules

In [0]:
# web scraping
import requests
from bs4 import BeautifulSoup
from time import time, sleep
from random import randint

# Translation
from googletrans import Translator

# Utilities
from tqdm import tqdm
import numpy as np
import pandas as pd

# NLP
from nltk import sent_tokenize, word_tokenize
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import mark_negation

In [0]:
# Download resources
import nltk

# "punkt" holds tokenizer models and "vader_lexicon" the VADER sentiment lexicon;
# presumably required by the nltk tokenizers / SentimentIntensityAnalyzer
# imported in the import cell — confirm against where they are actually used.
nltk.download("punkt")
# Kept as the last expression so the cell echoes the download's return value.
nltk.download("vader_lexicon")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [0]:
def honestdoc_comment(url, timeout=30):
    """
    Scrape every comment and its star rating from a paginated review page.

    Pages are requested one after another (page=1, 2, ...) until a page that
    contains no comment comes back. A random pause of 1-3 seconds is inserted
    between consecutive page requests. The collected data is also written to
    "comment_<last URL segment>.csv" in the working directory as a backup.

    INPUT
    url : String
      URL of the target website
    timeout : Number, optional
      Seconds to wait for each HTTP response before requests gives up
      (default 30)

    OUTPUT
    comment : List
      List of comments (text with surrounding whitespace stripped)
    score : List
      List of rating score (raw "data-score" attribute values), aligned
      index-by-index with comment

    RAISES
    IndexError
      If a page exposes fewer rating elements than comment elements.
    """
    comment, score = [], []

    page = 1
    while True:
        start_time = time()
        # NOTE(review): the "query" value is sent unchanged from the original
        # code; whether the site actually uses it is not visible here.
        r = requests.get(url,
                         params=dict(query="web scraping", page=page),
                         timeout=timeout)
        if page == 1:
            # Report the connection status once, using the first page's
            # response instead of a separate throw-away request.
            print("requests code : {}".format(r.status_code))
            print("note\n2xx: success\n4xx, 5xx: error")

        # Parse the page once and reuse the element lists for every item.
        soup = BeautifulSoup(r.text, "html.parser")
        comment_tags = soup.find_all("div", {"class": "comments__content"})
        score_tags = soup.find_all("span", {"class": "stars star-rating"})
        n = len(comment_tags)  # number of items in the page

        # The i-th rating element is paired with the i-th comment element.
        page_comment = [tag.get_text().strip() for tag in comment_tags]
        page_score = [score_tags[i].attrs["data-score"] for i in range(n)]
        comment.extend(page_comment)
        score.extend(page_score)

        elapsed_time = time() - start_time
        print("Time used for scraping data from page - {} : {} s".format(page, elapsed_time))

        if n == 0:
            # An empty page marks the end of the listing; no need to wait.
            break
        page += 1
        sleep(randint(1, 3))  # mimic human behavior

    # backup data; a trailing "/" is ignored so the file name is never "comment_.csv"
    page_name = url.rstrip("/").rsplit("/", 1)[-1]
    pd.DataFrame({"comment": comment,
                  "score": score}).to_csv("comment_" + page_name + ".csv", index=False)

    return comment, score

## Get data

In [0]:
# Scrape all review pages of Lanna Hospital; the function also writes a CSV backup.
lanna_hospital_url = r"https://www.honestdocs.co/hospitals/lanna-hospital"
nts, scores = honestdoc_comment(lanna_hospital_url)

requests code : 200
note
2xx: success
4xx, 5xx: error
Time used for scraping data from page - 1 : 1.2567455768585205 s
Time used for scraping data from page - 2 : 1.235698938369751 s
Time used for scraping data from page - 3 : 1.230867862701416 s
Time used for scraping data from page - 4 : 1.2272541522979736 s
Time used for scraping data from page - 5 : 1.2355847358703613 s
Time used for scraping data from page - 6 : 1.235170602798462 s
Time used for scraping data from page - 7 : 1.2453463077545166 s
Time used for scraping data from page - 8 : 1.3068902492523193 s
Time used for scraping data from page - 9 : 1.2156188488006592 s
Time used for scraping data from page - 10 : 1.243983507156372 s
Time used for scraping data from page - 11 : 1.2221341133117676 s
Time used for scraping data from page - 12 : 1.2345731258392334 s
Time used for scraping data from page - 13 : 1.236027479171753 s
Time used for scraping data from page - 14 : 1.2204232215881348 s
Time used for scraping data from pag

## Comment Translation

The comments in this study are written in either Thai or English, so they are standardized to English using the Google Translate API. Emoji are removed from each comment before it is translated.

In [0]:
# Reload the scraped comments from the CSV backup so the following cells
# can run without scraping the website again.
backup_csv = "comment_lanna-hospital.csv"
data_comment_lanna_hospital = pd.read_csv(backup_csv)

### Remove emoji

In [0]:
# remove emoji characters from a string
import re
import sys
# Emoji / pictographic code points to strip. Compiled once here instead of on
# every call. The ranges are listed explicitly: a single span from U+24C2 to
# U+1F251 would also swallow CJK, kana, Hangul and fullwidth characters.
_EMOJI_PATTERN = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F900-\U0001F9FF"  # supplemental symbols & pictographs
    "\U0001FA70-\U0001FAFF"  # symbols & pictographs extended-A
    "\U0001F000-\U0001F251"  # tiles, cards, enclosed alphanumerics (incl. flags) & ideographs
    "\U000024C2-\U000027BF"  # enclosed letters, box/shape symbols, misc symbols, dingbats
    "\U00002B00-\U00002BFF"  # misc symbols & arrows
    "\U00002934\U00002935"  # curved arrows
    "\U00003030\U0000303D\U00003297\U00003299"  # wavy dash, alternation mark, circled ideographs
    "\U0000FE00-\U0000FE0F"  # variation selectors
    "]+"
)


def remove_emoji(string):
    """
    Return a copy of the text with emoji and pictographic symbols removed.

    Thai, Latin and CJK letters lie outside every range of the pattern and
    are therefore kept as they are.

    INPUT
    string : String
      Text that may contain emoji

    OUTPUT
    String
      The text with every run of matching characters deleted
    """
    return _EMOJI_PATTERN.sub("", string)

In [0]:
# Register tqdm's `progress_apply` on pandas objects so the apply shows a progress bar.
tqdm.pandas()
# Map the cleaner over the "comment" column directly; a row-wise (axis=1)
# apply would build a whole row object per record just to read one field.
data_comment_lanna_hospital["non_emojis"] = data_comment_lanna_hospital["comment"].progress_apply(remove_emoji)

100%|██████████| 46/46 [00:00<00:00, 3928.77it/s]


### Translate Thai to English

In [0]:

def th2en(comment, translator=None):
    """
    Translate a comment from Thai to English.

    INPUT
    comment : String
      Text to translate
    translator : object, optional
      Object exposing translate(text, src=..., dest=...) whose result has a
      .text attribute. When omitted, one shared googletrans Translator is
      created on first use and reused by later calls.

    OUTPUT
    String
      The translated text; blank input is returned unchanged.
    """
    if not comment.strip():
        # Nothing to translate, so skip the translation call entirely.
        return comment
    if translator is None:
        # Reuse one Translator instead of constructing a new one per comment.
        translator = getattr(th2en, "_shared_translator", None)
        if translator is None:
            translator = th2en._shared_translator = Translator()
    return translator.translate(comment, src="th", dest="en").text

In [0]:
# Translate the emoji-free comments to English, one translation call per comment.
# Mapping over the single column avoids the row-wise (axis=1) lambda.
data_comment_lanna_hospital["en"] = data_comment_lanna_hospital["non_emojis"].progress_apply(th2en)

100%|██████████| 46/46 [00:05<00:00,  7.83it/s]


In [0]:
# Preview the first five rows to compare the original, emoji-free and translated text.
data_comment_lanna_hospital.head(n=5)

Unnamed: 0,comment,score,non_emojis,en
0,คุณหมอดูแลดีมากเลยค่ะ ตรวจละเอียด รักษาได้ตรงจ...,5,คุณหมอดูแลดีมากเลยค่ะ ตรวจละเอียด รักษาได้ตรงจ...,You take very good care of check points too. M...
1,คุณหมอและพยาบาลใส่ใจดูแลดีมากคะ​ ประทับใจทุกคร...,4,คุณหมอและพยาบาลใส่ใจดูแลดีมากคะ​ ประทับใจทุกคร...,Doctors and nurses cared very good. Impressed ...
2,โดนแมวที่เลี้ยงไว้กัดคะ ได้ไปทำแผลที่ รพ.นี้ เ...,4,โดนแมวที่เลี้ยงไว้กัดคะ ได้ไปทำแผลที่ รพ.นี้ เ...,A cat that bites you. Do not wound the hospita...
3,มีอาการกรดไหลย้อน ทำให้หายใจไม่ออก และเหนื่อยม...,5,มีอาการกรดไหลย้อน ทำให้หายใจไม่ออก และเหนื่อยม...,Symptoms of GERD Asphyxiate And tired I can no...
4,มีประกันสังคมที่ โรงพยาบาลลานนาเชียงใหม่ค่ะประ...,5,มีประกันสังคมที่ โรงพยาบาลลานนาเชียงใหม่ค่ะประ...,With Social Security Hospital Lanna me very im...


### Save data to CSV file

In [0]:
# Write the frame, including the translated column, to disk without the index column.
translated_csv = "data_lanna_en.csv"
data_comment_lanna_hospital.to_csv(translated_csv, index=False)