# Step 1: Web Scraper

### Use `selenium` to grab the data
The first step is to scrape the text data from the website. Here we use `selenium` as the automated web driver to grab the data.

----


In [23]:
# write to csv file
import csv
# create the output directories if they are missing
import os
import re
import time

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib
import lxml.html
import numpy as np
import pandas as pd
# web scraper
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager

# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------

# Scrape stage: load the page in headless Chrome, collect every text node that
# sits under a <div>, and build `clause_text` with one cleaned text node per line.

url = 'https://eur.shein.com/category/Shoes-Bags-Accs-sc-00828516.html?ici=eur_tab01navbar09&scici=navbar_WomenHomePage~~tab01navbar09~~9~~webLink~~~~0&srctype=category&userpath=category%3ESHOES-ACCESSORIES'

# to avoid opening browser while using selenium
option = webdriver.ChromeOptions()
option.add_argument('headless')
# NOTE(review): newer Selenium releases expect the driver path to be passed
# through a Service object rather than positionally -- confirm against the
# installed Selenium version before upgrading.
driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)

try:
    driver.get(url)
    # fixed pause after navigation; presumably gives dynamic content time to load
    time.sleep(3)

    # get source code -- type: str
    html_source = driver.page_source
finally:
    # always shut the browser down, even when loading the page fails, so no
    # headless Chrome process is left behind
    driver.quit()

# parse the page source into an element tree
html = lxml.html.fromstring(html_source)

# obtain all the text under the 'div' tags
items = html.xpath("//div//text()")

# strips leading whitespace, trailing whitespace and every newline character;
# a raw string keeps `\s` from being read as an (invalid) string escape
pattern = re.compile(r"^\s+|\s+$|\n")

# Keep a text node only if more than one character remains AFTER cleaning.
# Testing the uncleaned node instead would let whitespace-only nodes through
# as blank lines.
cleaned_items = (pattern.sub("", item) for item in items)
clause_text = "".join(line + "\n" for line in cleaned_items if len(line) > 1)



# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------

#  Initial Filtering and Generate CSV
# (1) Only keep content whose word count is between 2 and 20 (inclusive).
# (2) Ignore content that begins with any character in the ignore string, such as "{", ".", ";" and so on.
# (3) Save the filtered content into one column of the created csv file.


# Filter stage: keep the scraped lines worth classifying and save them as a
# one-column CSV for the model step.

raw_text = clause_text

# a line whose first character appears in this string is dropped
ignore_str = ',.;{}#?/!()@$'

# The content we are going to keep to send to models: one single-element row
# per kept line. A line is kept when it has 2 to 20 words (inclusive) and does
# not start with an ignored character. Any line with at least two words is
# non-empty, so indexing line[0] is safe.
content_list = [
    [line]
    for line in raw_text.split('\n')
    if 1 < len(line.split()) <= 20 and line[0] not in ignore_str
]

header = ['content']

# create a csv file to save the filtered content for later model analysis.
output_path = 'Raw/shein-4.csv'
# make sure the target directory exists; otherwise open() raises FileNotFoundError
os.makedirs(os.path.dirname(output_path), exist_ok=True)

with open(output_path, 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write the data
    writer.writerows(content_list)
        

# -------------------------------------------------------------------------------
# -------------------------------------------------------------------------------

# Step 2: Checking Presence

## Use pretrained presence model to check the presence in the filtered content
#(1) Load the pretrained model and countvectorizer.
#(2) Transform the content using the countvectorizer.
#(3) Use the model for presence prediction on the content.

# Loading the saved model with joblib
# Presence stage: run the pretrained presence classifier over the filtered
# content and save the rows predicted as 0 to the DP folder.

# Loading the saved model with joblib
# NOTE: joblib.load unpickles arbitrary objects -- only load files from a
# trusted source.
presence_model = joblib.load('rf_presence_classifier.joblib')
presence_cv = joblib.load('presence_CountVectorizer.joblib')

# New dataset to predict
presence_pred = pd.read_csv('Raw/shein-4.csv')

# apply the pretrained model to the new content data
if presence_pred.empty:
    # No line survived the filtering step. scikit-learn estimators generally
    # reject zero-sample input, so skip prediction and keep an empty result.
    pre_pred_vec = np.empty(0, dtype=int)
else:
    pre_pred_vec = presence_model.predict(presence_cv.transform(presence_pred['content']))

presence_pred['presence'] = pre_pred_vec.tolist()

# dark pattern content are those where the predicted result equals to 0.
dark = presence_pred.loc[presence_pred['presence']==0]

# make sure the output directory exists before writing
os.makedirs('DP', exist_ok=True)
dark.to_csv('DP/shein-4.csv', index=False, header=True)



Current google-chrome version is 95.0.4638
Get LATEST driver version for 95.0.4638
Driver [/Users/zenglan/.wdm/drivers/chromedriver/mac64/95.0.4638.54/chromedriver] found in cache
