In [2]:
import pandas as pd
import numpy as np

------
### Seperate Dataset Import

In [3]:
# ---- import dataset of the labeled dataset
label = pd.read_csv('labeled_df.csv')
label

Unnamed: 0,Pattern String,classification,Pattern Type
0,Only 2 left,0.0,Low-stock Message
1,Only 3 left,0.0,Low-stock Message
2,9 people are viewing this.,0.0,Activity Notification
3,5338 people viewed this in the last hour,0.0,Activity Notification
4,"Crystal Li in Flushing, United States purchased a",0.0,Activity Notification
...,...,...,...
934,Get 20% off Sample Doors! ×Offer Ends In 58 : ...,,Countdown Timer
935,HURRY! £32.99 UK DELIVERY ENDS SOON,,Limited-time Message
936,40% OFF EVERYTHING* ENTER: 40PLT + FREE SHIPPI...,,Limited-time Message
937,Limited Time Only · Online Only,,Limited-time Message


In [4]:
label.shape, label.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 939 entries, 0 to 938
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Pattern String  939 non-null    object 
 1   classification  817 non-null    float64
 2   Pattern Type    939 non-null    object 
dtypes: float64(1), object(2)
memory usage: 22.1+ KB


((939, 3), None)

In [5]:
# ---- import dataset of the unlabeled dataset
unlabel = pd.read_csv('unlabeled_df.csv')
unlabel

Unnamed: 0,Pattern String,classification,Pattern Type
0,Ends in 07:42:09,0.0,Countdown Timer
1,Ends in 07:37:10,0.0,Countdown Timer
2,Ends in 02:27:10,0.0,Countdown Timer
3,Ends in 04:17:10,0.0,Countdown Timer
4,Ends in 01:57:10,0.0,Countdown Timer
...,...,...,...
113,"5,000+ bought this",0.0,Activity Notification
114,"20,000+ bought this",0.0,Activity Notification
115,"50,000+ bought this",0.0,Activity Notification
116,50+ bought this,0.0,Activity Notification


In [6]:
unlabel.shape, unlabel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 118 entries, 0 to 117
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   Pattern String  118 non-null    object 
 1   classification  118 non-null    float64
 2   Pattern Type    118 non-null    object 
dtypes: float64(1), object(2)
memory usage: 2.9+ KB


((118, 3), None)

------
### Merge Datasets

In [9]:
# Merge two datasets based on the Pattern String column
# Stack the DataFrames on top of each other

df = pd.concat([label, unlabel], axis=0)

df

Unnamed: 0,Pattern String,classification,Pattern Type
0,Only 2 left,0.0,Low-stock Message
1,Only 3 left,0.0,Low-stock Message
2,9 people are viewing this.,0.0,Activity Notification
3,5338 people viewed this in the last hour,0.0,Activity Notification
4,"Crystal Li in Flushing, United States purchased a",0.0,Activity Notification
...,...,...,...
113,"5,000+ bought this",0.0,Activity Notification
114,"20,000+ bought this",0.0,Activity Notification
115,"50,000+ bought this",0.0,Activity Notification
116,50+ bought this,0.0,Activity Notification


In [10]:
# Check the distribution of Pattern Type

print(df['Pattern Type'].value_counts())

Activity Notification    412
Low-stock Message        398
Countdown Timer          140
Limited-time Message      83
High-demand Message       24
Name: Pattern Type, dtype: int64


In [11]:
# Select only 2 columns in the dataset

columns = ['Pattern String','Pattern Type']

df = pd.DataFrame(df, columns = columns)

df

Unnamed: 0,Pattern String,Pattern Type
0,Only 2 left,Low-stock Message
1,Only 3 left,Low-stock Message
2,9 people are viewing this.,Activity Notification
3,5338 people viewed this in the last hour,Activity Notification
4,"Crystal Li in Flushing, United States purchased a",Activity Notification
...,...,...
113,"5,000+ bought this",Activity Notification
114,"20,000+ bought this",Activity Notification
115,"50,000+ bought this",Activity Notification
116,50+ bought this,Activity Notification


In [14]:
# Save as enriched csv file for type classifier training

df.to_csv('enriched_type_df.csv', index = False, header = True)
