In [1]:

# ----------------------------- Load Packages -------------------------------------------

import pandas as pd
import numpy as np

# web scraper

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
import lxml.html
import re
import time

# write to csv file

import csv

# joblib is a set of tools to provide lightweight pipelining in Python. It provides utilities for saving and loading Python objects that make use of NumPy data structures, efficiently.
import joblib


# ----------------------------- Web Scrapper -------------------------------------------


url = 'https://www.currys.ie/ieen/computing/laptops/laptops/dell-inspiron-15-5502-15-6-laptop-intel-core-i5-256-gb-ssd-silver-10217071-pdt.html'
#url='https://www.currys.ie/ieen/search-keywords/xx_xx_xx_xx_xx/-wk22_headphones_ie-/xx-criteria.html'

# to avoid opening browser while using selenium
option = webdriver.ChromeOptions()
option.add_argument('headless')
driver = webdriver.Chrome(ChromeDriverManager().install(),options=option)

driver.get(url)
time.sleep(2)

# get source code -- type: str
html_source = driver.page_source

# key
html = lxml.html.fromstring(html_source)

# obtain all the text under the 'div' tags
items = html.xpath("//div//text()")

pattern = re.compile("^\s+|\s+$|\n")

clause_text = ""

for item in items:
    line = re.sub(pattern, "", item)
    if len(item) > 1:
        clause_text += line +"\n"

driver.quit()


# --------------------------- Generate Web Scrapping Conetnt ------------------------------------

raw_text = clause_text

# the beginning character of the content, which is the sign we should ignore the content
ignore_str = ',.;{}?#/$'

# the content we are going to keep to send to models.
content_list = []

# only keep the content that has words count from 2 to 20 (includes).
for line in raw_text.split('\n'):
    if 1<len(line.split())<21 and line[0] not in ignore_str:
        content_list.append([line])


header = ['content']

# create a csv file to save the filtered content for later model analysis.
with open('Websites/bug.csv', 'a', encoding='UTF8', newline='') as f:
    writer = csv.writer(f)

    # write the header
    writer.writerow(header)

    # write the data
    writer.writerows(content_list)

        
# ---------------------------- Check Presence --------------------------------------

# Loading the saved model with joblib
presence_model = joblib.load('rf_presence_classifier.joblib')
presence_cv = joblib.load('presence_CountVectorizer.joblib')

# New dataset to predict
presence_pred = pd.read_csv('Websites/pcworld.csv')


# Filter out the disturibing content to be removed
str_list = ['low to high','high to low','high low','low high','{','ships','ship','®',
            'limited edition','cart is currently empty','believe in','today\'s deals']
pattern = '|'.join(str_list)

presence_pred = presence_pred[~presence_pred.content.str.lower().str.contains(pattern)]

# apply the pretrained model to the new content data
pre_pred_vec = presence_model.predict(presence_cv.transform(presence_pred['content']))

presence_pred['presence'] = pre_pred_vec.tolist()

# dark pattern content are those where the predicted result equals to 0.
dark = presence_pred.loc[presence_pred['presence']==0]

# save the dark pattern csv detected
dark.to_csv('Websites/pcworld-rf.csv', index = False, header = True)



Current google-chrome version is 94.0.4606
Get LATEST driver version for 94.0.4606
Get LATEST driver version for 94.0.4606
Trying to download new driver from https://chromedriver.storage.googleapis.com/94.0.4606.61/chromedriver_mac64.zip
Driver has been saved in cache [/Users/zenglan/.wdm/drivers/chromedriver/mac64/94.0.4606.61]


FileNotFoundError: [Errno 2] No such file or directory: 'Websites/pcworld.csv'

In [2]:
# Loading the saved model with joblib
cat_model = joblib.load('mnb_category_classifier.joblib')
cat_cv = joblib.load('category_CountVectorizer.joblib')

# mapping of the encoded dark pattern categories.
cat_dic = {0:'Forced Action', 1:'Misdirection', 2:'Obstruction', 3:'Scarcity', 4:'Sneaking',
           5:'Social Proof', 6:'Urgency'}

# apply the model and the countvectorizer to the detected dark pattern content data
cat_pred_vec = cat_model.predict(cat_cv.transform(dark['content']))


dark['category'] = cat_pred_vec.tolist()

category_list = dark['category'].tolist()

# get the mapping of the category name and encoded category integers
dark['category_name'] = [cat_dic[int(category)] for category in category_list]

# reset the index of the detected dark pattern list on the webpage.
dark = dark.reset_index(drop=True)

dark

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dark['category'] = cat_pred_vec.tolist()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dark['category_name'] = [cat_dic[int(category)] for category in category_list]


Unnamed: 0,content,presence,category,category_name
0,Have one to sell?,0,5,Social Proof
1,About this item,0,5,Social Proof
2,Frequently bought together,0,5,Social Proof
3,This item:,0,5,Social Proof
4,In Stock.,0,3,Scarcity
5,In Stock.,0,3,Scarcity
6,In Stock.,0,3,Scarcity
7,Remaining Time,0,6,Urgency
8,This is a modal window.,0,5,Social Proof
9,This item,0,5,Social Proof
