# Import Dataset
------

In [14]:
import pandas as pd
import numpy as np

# ---- load the three input tables: normal content (clean_normal.csv),
# ---- dark-pattern content (clean_dp.csv) and the already-labelled
# ---- table from remove_noise.csv
nor_df, dp_df, prev_df = [
    pd.read_csv(csv_name)
    for csv_name in ('clean_normal.csv', 'clean_dp.csv', 'remove_noise.csv')
]

# (rows, columns) of each table, in the order loaded above
nor_df.shape, dp_df.shape, prev_df.shape

((5648, 1), (119, 1), (2241, 2))

In [15]:
# Preview the first five rows of the normal-content table (head() defaults to 5).
nor_df.head()

Unnamed: 0,content
0,Skip to main content
1,Deliver to
2,Select the department you want to search in
3,All Departments
4,Arts & Crafts


In [16]:
# Preview the first five rows of the dark-pattern table (head() defaults to 5).
dp_df.head()

Unnamed: 0,content
0,Ends in 07:42:09
1,Ends in 07:37:10
2,Ends in 02:27:10
3,Ends in 04:17:10
4,Ends in 01:57:10


In [17]:
# Preview the first five rows of the table read from remove_noise.csv
# (head() defaults to 5).
prev_df.head()

Unnamed: 0,Pattern String,classification
0,FREE SHIPPING ON ORDERS OVER $100!,1
1,SOME EXCLUSIONS APPLY - LEARN MORE,1
2,HAVE A QUESTION? - CONTACT US,1
3,WELCOME TO 034MOTORSPORT!,1
4,SHOP AUDISHOP VOLKSWAGENPERFORMANCE SOFTWARE03...,1


### Label the DP (dark pattern) content as Dark Pattern
--- classification = 0

In [18]:
# Every row of the dark-pattern table receives the label 0 (0 = dark pattern).
dp_df = dp_df.assign(classification=0)

dp_df.head()

Unnamed: 0,content,classification
0,Ends in 07:42:09,0
1,Ends in 07:37:10,0
2,Ends in 02:27:10,0
3,Ends in 04:17:10,0
4,Ends in 01:57:10,0


### Label the normal content as Not Dark Pattern
--- classification = 1

In [19]:
# Every row of the normal-content table receives the label 1 (1 = not a dark pattern).
nor_df = nor_df.assign(classification=1)

nor_df.head()

Unnamed: 0,content,classification
0,Skip to main content,1
1,Deliver to,1
2,Select the department you want to search in,1
3,All Departments,1
4,Arts & Crafts,1


### Change header names for merging
-----

In [20]:
# Rename the text column to "Pattern String", the column name used by
# prev_df, so the tables line up when they are concatenated.

dp_df = dp_df.rename({"content": "Pattern String"}, axis="columns")

dp_df.head()

Unnamed: 0,Pattern String,classification
0,Ends in 07:42:09,0
1,Ends in 07:37:10,0
2,Ends in 02:27:10,0
3,Ends in 04:17:10,0
4,Ends in 01:57:10,0


In [21]:
# Rename the text column to "Pattern String", the column name used by
# prev_df, so the tables line up when they are concatenated.
nor_df = nor_df.rename({"content": "Pattern String"}, axis="columns")

nor_df.head()

Unnamed: 0,Pattern String,classification
0,Skip to main content,1
1,Deliver to,1
2,Select the department you want to search in,1
3,All Departments,1
4,Arts & Crafts,1


## Merge the datasets
------

In [22]:
# Stack the three tables row-wise in a fixed order (dark pattern, normal,
# then prev_df) and renumber the rows from 0.
enrich_df = pd.concat(
    objs=[dp_df, nor_df, prev_df],
    ignore_index=True,
)

enrich_df.shape, enrich_df.head()

((8008, 2),
      Pattern String  classification
 0  Ends in 07:42:09               0
 1  Ends in 07:37:10               0
 2  Ends in 02:27:10               0
 3  Ends in 04:17:10               0
 4  Ends in 01:57:10               0)

## Remove duplicate rows
-----

In [23]:
# Duplicate inputs are removed before training the model so that repeated strings are not over-weighted, which would encourage overfitting.

# Keep a single row per distinct "Pattern String".
# drop_duplicates keeps the first occurrence by default, so when the same
# text occurs in several rows, the label of the earliest row is retained.
# The index is rebuilt afterwards: without reset_index the surviving rows
# would keep their old, now non-contiguous labels, and label-based access
# (.loc) would no longer agree with positional access (.iloc).
enrich_df = (
    enrich_df
    .drop_duplicates(subset="Pattern String")
    .reset_index(drop=True)
)

enrich_df.shape

(7952, 2)

In [24]:
# Preview the first five rows of the merged table (head() defaults to 5).
enrich_df.head()

Unnamed: 0,Pattern String,classification
0,Ends in 07:42:09,0
1,Ends in 07:37:10,0
2,Ends in 02:27:10,0
3,Ends in 04:17:10,0
4,Ends in 01:57:10,0


# Output the merged normal and dark-pattern content as a single CSV file
-----

In [25]:
# Write the merged table to enriched_data.csv with a header row and
# without the DataFrame index column.
enrich_df.to_csv('enriched_data.csv', header=True, index=False)