In [1]:
import pandas as pd
import numpy as np

# web scraper

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import lxml.html
import re
import time

# write to csv file

import csv

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib

import tensorflow as tf
from tensorflow import keras
#!pip install h5py pyyaml

from keras.preprocessing.sequence import pad_sequences

In [2]:
# Page to scrape (a commented-out alternative target is kept for reference).
url = 'https://outfithustler.com/collections/women-fashion?gclid=EAIaIQobChMIx_r5nM_o8QIVKYBQBh3fGwWvEAAYAiAAEgJYEvD_BwE&page=1'
#url='https://www.currys.ie/ieen/search-keywords/xx_xx_xx_xx_xx/-wk22_headphones_ie-/xx-criteria.html'

# Run Chrome headless so no browser window opens while scraping.
option = webdriver.ChromeOptions()
option.add_argument('headless')
# NOTE(review): newer Selenium 4 releases expect the driver path to be passed
# through a Service object instead of positionally -- confirm against the
# installed Selenium version before upgrading.
driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)

# Always shut the browser down, even if loading the page raises; otherwise the
# headless Chrome process is left running.
try:
    driver.get(url)
    # Fixed pause after the page load before reading the source.
    time.sleep(1)

    # Rendered page source -- type: str
    html_source = driver.page_source
finally:
    driver.quit()

# Parse the HTML string into an element tree.
html = lxml.html.fromstring(html_source)

# Every text node that sits anywhere below a <div>.
items = html.xpath("//div//text()")

# Strips leading whitespace, trailing whitespace and any newline characters.
# Raw string so that `\s` is not treated as an (invalid) string escape.
pattern = re.compile(r"^\s+|\s+$|\n")

# Collect cleaned lines and join once instead of growing a string in a loop.
cleaned_lines = []
for item in items:
    line = pattern.sub("", item)
    # Test the cleaned text, not the raw node, so whitespace-only nodes do not
    # add blank lines to the result.
    if len(line) > 1:
        cleaned_lines.append(line)

# One cleaned text fragment per line, each terminated by "\n".
clause_text = "".join(line + "\n" for line in cleaned_lines)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Get LATEST driver version for 94.0.4606
Trying to download new driver from https://chromedriver.storage.googleapis.com/94.0.4606.61/chromedriver_mac64.zip
Driver has been saved in cache [/Users/zenglan/.wdm/drivers/chromedriver/mac64/94.0.4606.61]


In [3]:
raw_text = clause_text

# A line whose first character is one of these is skipped as non-content.
ignore_str = ',.;{}?#/'

# Keep lines of 2 to 50 words (inclusive) that do not start with an ignored
# character. Each kept line is wrapped in its own list so that it can be
# written as a single-column CSV row.
content_list = [
    [line]
    for line in raw_text.split('\n')
    if 2 <= len(line.split()) <= 50 and line[0] not in ignore_str
]

In [4]:
header = ['content']

# Save the filtered content to a CSV file for the later model analysis:
# the header row first, followed by one row per kept line.
with open('webscrap01.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)
    writer.writerows([header, *content_list])

## Pre-processing Testing Dataset
-------

In [5]:
# Load the scraped content saved earlier; this is the dataset to predict on.
data = pd.read_csv(filepath_or_buffer='webscrap01.csv')

In [6]:
# Preview the first ten rows.
data.head(n=10)

Unnamed: 0,content
0,Don’t wait until
1,Black Friday!
2,Get Black Friday deals all year round straight...
3,"Never miss a deal, sign up now. 💌"
4,Women's Fashion
5,Best Selling
6,Lowest Price
7,Highest Price
8,"Date, New to Old"
9,Refine By


------
### Filtering

In [7]:
# Characters that mark a row as non-content when they appear first.
ignore_str = [',', '.', ';', '{', '}', '#', '/', '?', '@']

content = data['content']
word_count = content.str.split().str.len()

# Build a single row mask instead of mutating `data` step by step:
#   * content must not be missing,
#   * word count must be between 2 and 45 (inclusive),
#   * the first character must not be one of the ignored characters.
keep = (
    content.notna()
    & word_count.between(2, 45)
    & ~content.str[0].isin(ignore_str)
)

# `.assign` returns a new frame, so lower-casing does not write into a
# filtered slice (avoids SettingWithCopyWarning) and the cell can be re-run.
data = data.loc[keep].assign(content=lambda df: df['content'].str.lower())

data.head(10)

Unnamed: 0,content
0,don’t wait until
1,black friday!
2,get black friday deals all year round straight...
3,"never miss a deal, sign up now. 💌"
4,women's fashion
5,best selling
6,lowest price
7,highest price
8,"date, new to old"
9,refine by


----
### Getting X (model input)

In [8]:
# Text of every remaining row, to be tokenized for the model.
X = data.loc[:, 'content'].values

In [10]:
# Length every sequence is padded (or truncated) to.
maxlen = 50

# Load the saved tokenizer.
# NOTE(review): joblib.load unpickles the file -- only load trusted artifacts.
presence_tokenizer = joblib.load('Presence_Tokenizer.joblib')

# Convert each text to a sequence of token ids, then pad at the end ('post')
# so that every row has exactly `maxlen` entries.
X_pred = pad_sequences(
    presence_tokenizer.texts_to_sequences(X),
    maxlen=maxlen,
    padding='post',
)

X_pred.shape

(165, 50)

------------------
# Check Presence

In [11]:
# Restore the saved model from its .h5 file.
model = keras.models.load_model(filepath='CNN_model.h5')

In [31]:
# Apply the pretrained model to the new content data.
# `predict_classes` was removed from Keras in TensorFlow 2.6, so the class
# labels are derived from `predict` with the same rule it used:
#   * several output units -> index of the largest score,
#   * a single output unit -> 1 when the score is above 0.5, else 0.
pred_prob = model.predict(X_pred)
if pred_prob.shape[-1] > 1:
    pred_vec = pred_prob.argmax(axis=-1)
else:
    pred_vec = (pred_prob > 0.5).astype('int32')
pred_vec = pred_vec.flatten()

# One predicted label per row, assigned positionally.
data['presence'] = pred_vec.tolist()

data



Unnamed: 0,content,presence
0,don’t wait until,1
1,black friday!,1
2,get black friday deals all year round straight...,1
3,"never miss a deal, sign up now. 💌",0
4,women's fashion,1
...,...,...
162,"if you disable all cookies, we will only use t...",1
163,cookie policy,1
164,privacy policy.,1
165,cookies settings,1


In [34]:
# Rows predicted as 0 are the ones treated as dark-pattern content.
dark = data[data['presence'].eq(0)]

In [35]:
# Save the flagged rows with a header row and without the index column.
dark.to_csv('checkingpresence-01.csv', header=True, index=False)