In [31]:
import pandas as pd
import numpy as np

# web scraper

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import lxml.html
import re
import time

# write to csv file

import csv

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib



# -------------------------------- Web Scraper ---------------------------

url = 'https://www.scan.co.uk/shop/music-and-pro-audio'
#url='https://www.currys.ie/ieen/search-keywords/xx_xx_xx_xx_xx/-wk22_headphones_ie-/xx-criteria.html'

# Run Chrome headless so no browser window opens while Selenium drives it.
option = webdriver.ChromeOptions()
option.add_argument('headless')
driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)

# Always shut the browser down, even when loading the page raises; otherwise
# the headless Chrome process is left running after a failed cell.
try:
    driver.get(url)
    # Fixed pause after the page load -- presumably to let client-side scripts
    # finish rendering; TODO confirm, an explicit wait would be more reliable.
    time.sleep(3)

    # Page source as seen by the browser -- type: str
    html_source = driver.page_source
finally:
    driver.quit()

# Parse the markup into an lxml element tree.
html = lxml.html.fromstring(html_source)

# Collect every text node in the document (the XPath is not limited to any tag).
items = html.xpath("//text()")

# Removes leading whitespace, trailing whitespace and every newline in a node.
# Raw string: "\s" inside a normal string literal is an invalid escape sequence.
pattern = re.compile(r"^\s+|\s+$|\n")

# One cleaned text node per line. The length check is applied to the cleaned
# text (not the raw node), so whitespace-only nodes no longer add blank lines.
cleaned_items = (pattern.sub("", item) for item in items)
clause_text = "".join(line + "\n" for line in cleaned_items if len(line) > 1)



# -------------------------------- Scraping Dataset ---------------------------

raw_text = clause_text

# A line whose first character is one of these is treated as noise and dropped.
ignore_str = ',.;{}?#/)!($'

# Content kept for the models. Each entry is a one-element row so it can be
# written straight to the CSV. A line is kept when it has between 2 and 20
# words (inclusive) and does not start with a character from ignore_str.
content_list = [
    [text]
    for text in raw_text.split('\n')
    if 2 <= len(text.split()) <= 20 and text[0] not in ignore_str
]

header = ['content']

# Save the filtered content, header row first, for the later model analysis.
with open('Web_Scrap/3-thr50.csv', 'w', encoding='UTF8', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerows([header, *content_list])
    
    
        
# -------------------------------- Check Presence ---------------------------    


# Load the saved classifier and its fitted vectorizer with joblib.
# NOTE(review): joblib files are pickle-based, so loading one can execute
# arbitrary code -- only load model files that come from a trusted source.
presence_model = joblib.load('bnb_presence_classifier.joblib')
presence_cv = joblib.load('presence_TfidfVectorizer.joblib')

# New dataset to predict
presence_pred = pd.read_csv('Web_Scrap/3-thr50.csv')


# Filter out the disturbing content to be removed
#str_list = ['low to high','high to low','high low','low high','{','ships','ship','®',
#            'limited edition','cart is currently empty','out of stock','believe in',
#            'today\'s deals','customer service','offer available','offers available', 
#            '% off','in stock soon']
#pattern = '|'.join(str_list)

#presence_pred = presence_pred[~presence_pred.content.str.lower().str.contains(pattern)]



# Apply the pretrained model to the new content data.
# When the scrape produced no usable lines the CSV holds only its header;
# scikit-learn rejects zero-sample input with a ValueError, so in that case
# use an empty prediction array and let the rest of the cell run normally.
if presence_pred.empty:
    pre_pred_vec = np.array([], dtype=int)
else:
    pre_pred_vec = presence_model.predict(presence_cv.transform(presence_pred['content']))

# ---------- apply threshold to be 0.8
# pre_pred_vec = (presence_model.predict_proba(presence_cv.transform(presence_pred['content']))[ : , 1] >= 0.8).astype(int)

presence_pred['presence'] = pre_pred_vec.tolist()

# Dark pattern content is the set of rows whose predicted result equals 0.
dark = presence_pred.loc[presence_pred['presence']==0]



# Persist the flagged rows (with header, without the index column).
dark.to_csv('DP/3-thr50-bnb.csv', index = False, header = True)





Current google-chrome version is 95.0.4638
Get LATEST driver version for 95.0.4638
Driver [/Users/zenglan/.wdm/drivers/chromedriver/mac64/95.0.4638.54/chromedriver] found in cache


In [26]:
# Inspect the raw prediction array returned by the presence model.
pre_pred_vec

array([1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1])

In [27]:
# Show the per-row predictions stored in the 'presence' column.
presence_pred['presence'] 

0     1
1     1
2     1
3     0
4     1
5     1
6     1
7     1
8     1
9     1
10    1
11    1
Name: presence, dtype: int64

In [21]:
# Report which scikit-learn release is installed in this kernel.
import sklearn
print(f'The scikit-learn version is {sklearn.__version__}.')


The scikit-learn version is 0.23.2.
