In [1]:
import pandas as pd
import numpy as np

# web scraper

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import lxml.html
import re
import time

# write to csv file

import csv

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib



# -------------------------------- Web Scrapper ---------------------------

url = 'https://www.currys.ie/ieen/tv-and-home-entertainment/televisions/televisions/301_3002_30002_xx_ba00013738-bv00313852%7Cbv00313851/xx-criteria.html'
#url='https://www.currys.ie/ieen/search-keywords/xx_xx_xx_xx_xx/-wk22_headphones_ie-/xx-criteria.html'

# to avoid opening browser while using selenium
option = webdriver.ChromeOptions()
option.add_argument('headless')
driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)

driver.get(url)
time.sleep(3)

# get source code -- type: str
html_source = driver.page_source

# key
html = lxml.html.fromstring(html_source)

# obtain all the text under the 'div' tags
items = html.xpath("//text()")

pattern = re.compile("^\s+|\s+$|\n")

clause_text = ""

for item in items:
    line = re.sub(pattern, "", item)
    if len(item) > 1:
        clause_text += line +"\n"

driver.quit()



# -------------------------------- Scrapping Dataset ---------------------------

raw_text = clause_text

# the beginning character of the content, which is the sign we should ignore the content
ignore_str = ',.;{}?#/)!('

# the content we are going to keep to send to models.
content_list = []

# only keep the content that has words count from 2 to 20 (includes).
for line in raw_text.split('\n'):
    if 1<len(line.split())<21 and line[0] not in ignore_str:
        content_list.append([line])

header = ['content']

# create a csv file to save the filtered content for later model analysis.
with open('Websites/pc01.csv', 'w', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write the data
    writer.writerows(content_list)
    
    
        
# -------------------------------- Check Presence ---------------------------    


# Loading the saved model with joblib
presence_model = joblib.load('bnb_presence_classifier.joblib')
presence_cv = joblib.load('presence_TfidfVectorizer.joblib')

# New dataset to predict
presence_pred = pd.read_csv('Websites/pc01.csv')


# Filter out the disturibing content to be removed
str_list = ['low to high','high to low','high low','low high','{','ships','ship','®','details',
            'limited edition','cart is currently empty','in cart','out of stock','believe in',
            'today\'s deals','customer service','offer available','offers available', 'collect'
            '% off','in stock soon','problem','UTC','javascript','cookie','cookies','disclaimer']
pattern = '|'.join(str_list)

#presence_pred = presence_pred[~presence_pred.content.str.lower().str.contains(pattern)]



# apply the pretrained model to the new content data
pre_pred_vec = presence_model.predict(presence_cv.transform(presence_pred['content']))

presence_pred['presence'] = pre_pred_vec.tolist()

# dark pattern content are those where the predicted result equals to 0.
dark = presence_pred.loc[presence_pred['presence']==0]

dark.to_csv('Websites/pc01-bnb.csv', index = False, header = True)



Current google-chrome version is 95.0.4638
Get LATEST driver version for 95.0.4638
Driver [/Users/zenglan/.wdm/drivers/chromedriver/mac64/95.0.4638.17/chromedriver] found in cache


In [10]:
# Loading the saved model with joblib
cat_model = joblib.load('lr_category_classifier.joblib')
cat_cv = joblib.load('type_CountVectorizer.joblib')

# mapping of the encoded dark pattern categories.
cat_dic = {0:'Activity Notification', 1:'Countdown Timer', 2:'High-demand Message', 
           3:'Limited-time Message', 4:'Low-stock Message'}

# apply the model and the countvectorizer to the detected dark pattern content data
cat_pred_vec = cat_model.predict(cat_cv.transform(dark['content']))


dark['type'] = cat_pred_vec.tolist()

type_list = dark['type'].tolist()

# get the mapping of the category name and encoded category integers
dark['type_name'] = [cat_dic[int(type)] for type in type_list]

# reset the index of the detected dark pattern list on the webpage.
dark = dark.reset_index(drop=True)

dark

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dark['type'] = cat_pred_vec.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dark['type_name'] = [cat_dic[int(type)] for type in type_list]


Unnamed: 0,content,presence,type,type_name
0,Translations and currency conversions are prov...,0,3,Limited-time Message
1,Remaining Time,0,3,Limited-time Message
2,"Oral-B Genius X Limited, Electric Toothbrush w...",0,4,Low-stock Message
3,Ends in 08:03:56,0,1,Countdown Timer
4,Ends in 08:03:56,0,1,Countdown Timer
5,Ends in 08:03:56,0,1,Countdown Timer
6,Ends in 03:28:56,0,1,Countdown Timer
7,Ends in 05:33:56,0,1,Countdown Timer
8,Ends in 06:28:56,0,1,Countdown Timer
9,Ends in 38:56,0,1,Countdown Timer
