In [1]:
import pandas as pd
import numpy as np

------
### Enriched Dataset Import

In [2]:
# Load the enriched dataset; the preview shows a 'Pattern String' text column
# and an integer 'classification' label. (The original note attributes the
# data to the Princeton article - TODO confirm provenance.)
df = pd.read_csv('enriched_data.csv')
df.head()

Unnamed: 0,Pattern String,classification
0,Ends in 07:42:09,0
1,Ends in 07:37:10,0
2,Ends in 02:27:10,0
3,Ends in 04:17:10,0
4,Ends in 01:57:10,0


In [3]:
# `info()` prints its summary and returns None, so it must not be packed into
# a tuple with `shape` (that displays a stray `None`). Print the summary first,
# then let the shape be the cell's displayed result.
df.info()
df.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7952 entries, 0 to 7951
Data columns (total 2 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   Pattern String  7952 non-null   object
 1   classification  7952 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 124.4+ KB


((7952, 2), None)

------
### Select Dark Patterns in the dataset

In [5]:
# Keep only the rows whose 'classification' label is 0.
# NOTE(review): per the section heading, 0 presumably marks dark patterns - confirm.
dp = df[df['classification'] == 0]
dp

Unnamed: 0,Pattern String,classification
0,Ends in 07:42:09,0
1,Ends in 07:37:10,0
2,Ends in 02:27:10,0
3,Ends in 04:17:10,0
4,Ends in 01:57:10,0
...,...,...
7947,Hurry Up ! Just 2 Items Left in stock,0
7948,Hurry! Only 4 left in stock.,0
7949,ONLY 13 LEFT,0
7950,HURRY! ONLY 14 LEFT IN STOCK.,0


----
### Check the Dark Patterns in the Princeton Dataset

In [6]:
# Load 'dark_patterns.csv' (the Princeton article dataset) and preview the
# first five rows.
df2 = pd.read_csv('dark_patterns.csv')
df2.head()

Unnamed: 0,Pattern String,Comment,Pattern Category,Pattern Type,Where in website?,Deceptive?,Website Page
0,Collin P. from Grandview Missouri just bought ...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://alaindupetit.com/collections/all-suits...
1,"Faith in Glendale, United States purchased a C...",Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bonescoffee.com/products/strawberry-ch...
2,Sharmeen Atif From Karachi just bought Stylish...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://brandsego.com/collections/under-rs-99/...
3,9 people are viewing this.,Product detail,Social Proof,Activity Notification,Product Page,No,https://brightechshop.com/products/ambience-so...
4,5338 people viewed this in the last hour,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bumpboxes.com/


In [7]:
# `info()` prints its summary and returns None, so it must not be packed into
# a tuple with `shape` (that displays a stray `None`). Print the summary first,
# then let the shape be the cell's displayed result.
df2.info()
df2.shape

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1818 entries, 0 to 1817
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Pattern String     1512 non-null   object
 1   Comment            1798 non-null   object
 2   Pattern Category   1818 non-null   object
 3   Pattern Type       1818 non-null   object
 4   Where in website?  1818 non-null   object
 5   Deceptive?         1818 non-null   object
 6   Website Page       1818 non-null   object
dtypes: object(7)
memory usage: 99.5+ KB


((1818, 7), None)

In [8]:
# Restrict the data to the five Pattern Types used to train the model.
types = [
    'Low-stock Message',
    'Activity Notification',
    'Countdown Timer',
    'Limited-time Message',
    'High-demand Message',
]

# Boolean mask: True where the row's 'Pattern Type' is one of the kept types.
df2 = df2.loc[df2['Pattern Type'].isin(types)]

df2.head()

Unnamed: 0,Pattern String,Comment,Pattern Category,Pattern Type,Where in website?,Deceptive?,Website Page
0,Collin P. from Grandview Missouri just bought ...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://alaindupetit.com/collections/all-suits...
1,"Faith in Glendale, United States purchased a C...",Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bonescoffee.com/products/strawberry-ch...
2,Sharmeen Atif From Karachi just bought Stylish...,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://brandsego.com/collections/under-rs-99/...
3,9 people are viewing this.,Product detail,Social Proof,Activity Notification,Product Page,No,https://brightechshop.com/products/ambience-so...
4,5338 people viewed this in the last hour,Periodic popup,Social Proof,Activity Notification,Product Page,No,https://bumpboxes.com/


In [9]:
# Shape of df2 after keeping only the five selected Pattern Types.
df2.shape

(1473, 7)

In [10]:
# Per-column non-null counts after the Pattern Type filter.
df2.info() 

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1473 entries, 0 to 1697
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Pattern String     1228 non-null   object
 1   Comment            1461 non-null   object
 2   Pattern Category   1473 non-null   object
 3   Pattern Type       1473 non-null   object
 4   Where in website?  1473 non-null   object
 5   Deceptive?         1473 non-null   object
 6   Website Page       1473 non-null   object
dtypes: object(7)
memory usage: 92.1+ KB


In [12]:
# Keep only the rows that actually have a 'Pattern String' (drop NaN values),
# then re-check the per-column non-null counts.
df2 = df2[df2["Pattern String"].notna()]
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1228 entries, 0 to 1697
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Pattern String     1228 non-null   object
 1   Comment            1217 non-null   object
 2   Pattern Category   1228 non-null   object
 3   Pattern Type       1228 non-null   object
 4   Where in website?  1228 non-null   object
 5   Deceptive?         1228 non-null   object
 6   Website Page       1228 non-null   object
dtypes: object(7)
memory usage: 76.8+ KB


In [13]:
# Distribution of the remaining rows across Pattern Types.
# The counts are the cell's last expression, so the notebook displays them
# directly; wrapping them in print() is unnecessary.
df2['Pattern Type'].value_counts()

Low-stock Message        631
Activity Notification    313
Countdown Timer          149
Limited-time Message      88
High-demand Message       47
Name: Pattern Type, dtype: int64


In [15]:
# Keep only the text and its Pattern Type label.
columns = ['Pattern String', 'Pattern Type']

# Select by label rather than rebuilding via pd.DataFrame(df2, columns=...):
# plain selection raises KeyError on a missing/misspelled column, whereas the
# constructor form would silently add an all-NaN column.
df2 = df2[columns]

df2

Unnamed: 0,Pattern String,Pattern Type
0,Collin P. from Grandview Missouri just bought ...,Activity Notification
1,"Faith in Glendale, United States purchased a C...",Activity Notification
2,Sharmeen Atif From Karachi just bought Stylish...,Activity Notification
3,9 people are viewing this.,Activity Notification
4,5338 people viewed this in the last hour,Activity Notification
...,...,...
1693,Hurry Up ! Just 2 Items Left in stock,Low-stock Message
1694,Hurry! Only 4 left in stock.,Low-stock Message
1695,ONLY 13 LEFT,Low-stock Message
1696,HURRY! ONLY 14 LEFT IN STOCK.,Low-stock Message


----
## Merge the two datasets

In [17]:
# Left-hand input of the merge: rows of the enriched data with classification == 0.
dp

Unnamed: 0,Pattern String,classification
0,Ends in 07:42:09,0
1,Ends in 07:37:10,0
2,Ends in 02:27:10,0
3,Ends in 04:17:10,0
4,Ends in 01:57:10,0
...,...,...
7947,Hurry Up ! Just 2 Items Left in stock,0
7948,Hurry! Only 4 left in stock.,0
7949,ONLY 13 LEFT,0
7950,HURRY! ONLY 14 LEFT IN STOCK.,0


In [18]:
# Right-hand input of the merge: 'Pattern String' / 'Pattern Type' pairs.
df2

Unnamed: 0,Pattern String,Pattern Type
0,Collin P. from Grandview Missouri just bought ...,Activity Notification
1,"Faith in Glendale, United States purchased a C...",Activity Notification
2,Sharmeen Atif From Karachi just bought Stylish...,Activity Notification
3,9 people are viewing this.,Activity Notification
4,5338 people viewed this in the last hour,Activity Notification
...,...,...
1693,Hurry Up ! Just 2 Items Left in stock,Low-stock Message
1694,Hurry! Only 4 left in stock.,Low-stock Message
1695,ONLY 13 LEFT,Low-stock Message
1696,HURRY! ONLY 14 LEFT IN STOCK.,Low-stock Message


In [19]:
# Full outer join on the text column: every 'Pattern String' from either frame
# is kept, and a row found on only one side gets NaN in the other side's
# column ('classification' or 'Pattern Type').
df_merge = dp.merge(df2, how='outer', on='Pattern String')

df_merge

Unnamed: 0,Pattern String,classification,Pattern Type
0,Ends in 07:42:09,0.0,
1,Ends in 07:37:10,0.0,
2,Ends in 02:27:10,0.0,
3,Ends in 04:17:10,0.0,
4,Ends in 01:57:10,0.0,
...,...,...,...
1461,Get 20% off Sample Doors! ×Offer Ends In 58 : ...,,Countdown Timer
1462,HURRY! £32.99 UK DELIVERY ENDS SOON,,Limited-time Message
1463,40% OFF EVERYTHING* ENTER: 40PLT + FREE SHIPPI...,,Limited-time Message
1464,Limited Time Only · Online Only,,Limited-time Message


In [28]:
# Keep a single row per distinct 'Pattern String' (the first occurrence wins).
# Note: this rebinds `df`, which until this cell held the raw enriched data.
df = df_merge.drop_duplicates(subset=['Pattern String'], keep='first')

df

Unnamed: 0,Pattern String,classification,Pattern Type
0,Ends in 07:42:09,0.0,
1,Ends in 07:37:10,0.0,
2,Ends in 02:27:10,0.0,
3,Ends in 04:17:10,0.0,
4,Ends in 01:57:10,0.0,
...,...,...,...
1461,Get 20% off Sample Doors! ×Offer Ends In 58 : ...,,Countdown Timer
1462,HURRY! £32.99 UK DELIVERY ENDS SOON,,Limited-time Message
1463,40% OFF EVERYTHING* ENTER: 40PLT + FREE SHIPPI...,,Limited-time Message
1464,Limited Time Only · Online Only,,Limited-time Message


In [30]:
# Row count and per-column non-null counts of the de-duplicated frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1177 entries, 0 to 1465
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Pattern String  1177 non-null   object 
 1   classification  1055 non-null   float64
 2   Pattern Type    939 non-null    object 
dtypes: float64(1), object(2)
memory usage: 36.8+ KB


-----
### Separate the labeled and unlabeled data


In [32]:
# Split on whether a row has a 'Pattern Type': rows with one are treated as
# labeled, rows where it is NaN as unlabeled.
has_pattern_type = df["Pattern Type"].notna()

labeled_df = df[has_pattern_type]
un_label_df = df[~has_pattern_type]

un_label_df.shape, labeled_df.shape

((238, 3), (939, 3))

In [33]:
# Write the two subsets to separate CSV files for manual labelling.
# The row index is not written; the header row is (pandas' default).
un_label_df.to_csv('unlabeled_df.csv', index=False)
labeled_df.to_csv('labeled_df.csv', index=False)