In [1]:
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import StaleElementReferenceException, WebDriverException, NoSuchElementException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import time
from tqdm import tqdm
import sqlite3
from sqlite3 import Error
import os
import random
from random import randrange
import pandas as pd
import numpy as np
import datetime
from datetime import timedelta
import xlsxwriter

In [2]:
# Delete multiple words from text
def replaceMultiple(mainString, toBeReplaces, newString):
    """Replace every occurrence of several substrings with one replacement.

    The substrings are processed one after another in the order given, so a
    later substring is matched against the text produced by the earlier
    replacements.

    Args:
        mainString: Text to clean.
        toBeReplaces: Iterable of substrings to replace.
        newString: Replacement inserted for each occurrence.

    Returns:
        The text after all replacements have been applied.
    """
    result = mainString
    for target in toBeReplaces:
        # str.replace leaves the text untouched when the target is absent,
        # so no separate membership test is required.
        result = result.replace(target, newString)
    return result

In [3]:
# paging
def paging(driver, num_pg, wait_sec):
    """Click the pager link that leads to review page ``num_pg + 1``.

    The link is located by the JavaScript call contained in its ``href``
    (``power_review_page('<n>');``) and clicked through JavaScript.

    Args:
        driver: Selenium WebDriver showing the review list.
        num_pg: Number of the page currently displayed.
        wait_sec: Seconds to sleep after the link has been located, and
            additionally before the single retry when it was not found.

    Raises:
        NoSuchElementException: If the link is still missing on the retry.
    """
    target_page = str(num_pg + 1)
    # The href fragment is wrapped in double quotes so it is a valid XPath
    # string literal even though it contains single quotes itself.
    nextHref = '"power_review_page(\'{}\');"'.format(target_page)
    link_xpath = "//a[contains(@href,%s)]" % nextHref

    def locate():
        return driver.find_element_by_xpath(link_xpath)

    try:
        link = locate()
    except NoSuchElementException:
        # Give the page a moment, then try exactly once more.
        time.sleep(wait_sec)
        link = locate()
    time.sleep(wait_sec)

    try:
        driver.execute_script("arguments[0].click();", link)
    except StaleElementReferenceException:
        # The element was replaced in the DOM after lookup; fetch it again.
        driver.execute_script("arguments[0].click();", locate())
        
#     print(str(num_pg) + " pg is done")

In [4]:
# Scraping Reviews
def reviewScraper(driver):
    """Extract cleaned review texts from the page currently shown.

    The text of the element with id ``listPowerReview`` is split on the
    "was this review helpful?" prompt that follows each review. Every piece
    is stripped of newlines, star glyphs, author/view-count labels, digits
    and the "show more" marker. Pieces that end up empty, or that contain
    '미흡' or '불만족', are dropped. Of the remaining pieces only the part
    after the last '*' is kept, and the rating labels are removed from it.

    Args:
        driver: Selenium WebDriver showing the review list.

    Returns:
        List of cleaned review strings, in page order.
    """
    container = driver.find_element_by_id('listPowerReview')
    raw_chunks = container.text.split('이 리뷰가 도움이 되셨나요?')

    # Order matters: replaceMultiple applies these sequentially, so the
    # shorter '작성자 :' is removed before the longer '작성자 : 네이버***'
    # entry is tried.
    word_to_delete = ['\n','★','작성자 :','작성자 : 네이버***','조회수 :','0','1','2','3','4','5','6','7','8','9','...▼ 더보기']
    rating_labels = ['아주만족','만족','보통']

    final = []
    for chunk in raw_chunks:
        cleaned = replaceMultiple(chunk, word_to_delete, "").strip()
        if cleaned == '' or '미흡' in cleaned or '불만족' in cleaned:
            continue
        # Whatever precedes the last '*' (e.g. a masked author name) is discarded.
        body = cleaned.split('*')[-1].strip()
        final.append(replaceMultiple(body, rating_labels, ''))

    return final

In [5]:
def createDatabase(dbpath):
    """Create an empty SQLite database file at ``dbpath`` if none exists.

    Nothing happens when the path already exists. A failure to open the
    database is reported with ``print`` and otherwise ignored.

    Args:
        dbpath: Path of the SQLite database file.
    """
    # Test the same path that is handed to sqlite3.connect; prefixing './'
    # would point somewhere else for absolute paths.
    if os.path.exists(dbpath):
        return

    conn = None
    try:
        conn = sqlite3.connect(dbpath)
    except Error as e:
        print(e)
    finally:
        # conn is still None when connect() itself failed; closing
        # unconditionally would replace the real error with a NameError.
        if conn is not None:
            conn.close()

In [6]:
def createContentsTable(dbpath='reviewDB.sqlite'):
    """Create the ``contents`` table in a SQLite database if it is missing.

    Args:
        dbpath: Path of the SQLite database file. Defaults to
            'reviewDB.sqlite', the path that used to be hard-coded here.
    """
    conn = sqlite3.connect(dbpath)
    try:
        cur = conn.cursor()
        cur.executescript('''
    /* create a table if not exists already */
    CREATE TABLE IF NOT EXISTS contents (
        ID INTEGER PRIMARY KEY,
        URL VARCHAR(225) UNIQUE, -- only one url exist
        CATEGORY VARCHAR(225),
        INPUT_DATE DATE,
        CONTRIBUTOR VARCHAR(225))
    ''')
        conn.commit()
    finally:
        # Always release the connection, even if the script fails.
        conn.close()

In [7]:
def randomNameCreator(num_names):
    """Build a list of random masked names.

    Each name is four random lowercase ASCII letters followed by '****'.

    Args:
        num_names: How many names to generate.

    Returns:
        List of ``num_names`` strings such as 'abcd****'.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    return [
        ''.join(random.choices(alphabet, k=4)) + '****'
        for _ in range(num_names)
    ]

In [8]:
def randomStarCreator(num_stars, min_stars=4, max_stars=5):
    """Build a list of random integer star ratings.

    Args:
        num_stars: How many ratings to generate.
        min_stars: Lowest possible rating (inclusive).
        max_stars: Highest possible rating (inclusive).

    Returns:
        List of ``num_stars`` integers in ``[min_stars, max_stars]``.
    """
    return [random.randint(min_stars, max_stars) for _ in range(num_stars)]

In [9]:
def randomDateCreator(num_dates, start_date, end_date, date_format='%Y-%m-%d %H:%M'):
    """Build a list of random, formatted timestamps inside a time window.

    Each timestamp is ``start_date`` plus a random whole number of seconds
    smaller than the window length, so ``end_date`` itself is never
    produced. Sub-second parts of the window length are ignored.

    Args:
        num_dates: How many timestamps to generate.
        start_date: ``datetime`` marking the start of the window (inclusive).
        end_date: ``datetime`` marking the end of the window (exclusive).
        date_format: ``strftime`` pattern used for the output strings.

    Returns:
        List of ``num_dates`` formatted timestamp strings.

    Raises:
        ValueError: If timestamps are requested and ``end_date`` lies
            before ``start_date``.
    """
    if num_dates <= 0:
        return []

    delta = end_date - start_date
    int_delta = (delta.days * 24 * 60 * 60) + delta.seconds
    if int_delta < 0:
        raise ValueError("end_date must not be earlier than start_date")
    if int_delta == 0:
        # randrange(0) would raise; a window shorter than one second can
        # only ever yield the start timestamp.
        return [start_date.strftime(date_format)] * num_dates

    return [
        (start_date + timedelta(seconds=randrange(int_delta))).strftime(date_format)
        for _ in range(num_dates)
    ]

In [10]:
# main function
baseUrl = 'http://www.bodybogam.com/shop/shopdetail.html?branduid=2127765&ref=naver_open&NaPm=ct=jts5nneo%7Cci=0yi00028inLqLWK6gv09%7Ctr=pla%7Chk=07345664b7e61e570fd6c9af18bfe3de16e3565b#listPowerReview|page=1|list_sort=|list_term=1'
num_last_pg = 59  # number of the last review page to scrape

driver = webdriver.Chrome('./chromedriver')
reviews_by_page = []
try:
    driver.get(baseUrl)
    # Scrape every page from 1 through num_last_pg inclusive.
    for num_pg in tqdm(range(1, num_last_pg + 1)):
        # NOTE(review): the page is read immediately after the click issued
        # by paging(); if the list is refreshed asynchronously this may
        # need an explicit wait - confirm against the site.
        reviews_by_page.append(reviewScraper(driver))
        # Only move on while another page remains; after the last page has
        # been scraped there is no further link to follow.
        if num_pg < num_last_pg:
            paging(driver, num_pg, 0.25)
finally:
    # Shut the browser down even when scraping fails part-way.
    driver.quit()

# Flatten the per-page lists into one list of reviews (nested list comprehension).
total_final = [review for reviews in reviews_by_page for review in reviews]

# One random name, star rating and timestamp per scraped review.
random_name_list = randomNameCreator(len(total_final))
random_stars_list = randomStarCreator(len(total_final))
start_date = datetime.datetime(2019, 3, 1, 10)
end_date = datetime.datetime(2019, 4, 3, 22)
random_dates_list = randomDateCreator(len(total_final), start_date, end_date)

In [11]:
# Column order of the exported sheet. Columns that have no entry in
# review_columns ('상품번호', '옵션', '이미지 파일명') are created empty.
# The '옵션' label previously carried a stray leading space, so it did not
# match the key of the same name.
columns = ['상품번호', '옵션', '제목', '내용(HTML)', '이미지 파일명', '작성자', '작성시간', '평점']

# The scraped review text is used for both the title and the body column.
review_columns = {
    '제목': total_final,
    '내용(HTML)': total_final,
    '작성자': random_name_list,
    '작성시간': random_dates_list,
    '평점': random_stars_list,
}

datas = pd.DataFrame(review_columns, columns=columns)
file_name = 'review_sample.xlsx'
# to_excel also writes the DataFrame index as the first column by default.
datas.to_excel(file_name)