# Data Preprocessing

## Import libraries

In [1]:
import numpy as np
import pandas as pd

In [2]:
# pandas display configuration for this notebook

# show floats with 2 decimal places
pd.options.display.precision = 2

# keep wide dataframes on a single block instead of wrapping them
pd.options.display.expand_frame_repr = False

# truncate cell contents longer than 60 characters
pd.options.display.max_colwidth = 60

# show at most 20 rows of any dataframe
pd.options.display.max_rows = 20

## Import Data

In [3]:
# read the merged raw scrape into a dataframe and report its (rows, columns)
df_raw = pd.read_csv(filepath_or_buffer='data/data_raw/merged.csv')
df_raw.shape

(52039, 9)

In [4]:
# preview the first two rows of the raw data
df_raw.head(n=2)

Unnamed: 0,author_url,user_id,date,reading_time,title,subtitle,claps,responses,story_url
0,https://towardsdatascience.com/@henriwoodcock,e9ee1187182a,2021-03-01T13:59:37.090Z,4 min read,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...
1,https://towardsdatascience.com/@destingong,fa1913854e95,2021-03-01T05:00:14.502Z,11 min read,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...


In [5]:
# list the columns with their datatypes
# (every raw column is read as `object`, so the numeric fields need parsing)
df_raw.dtypes

author_url      object
user_id         object
date            object
reading_time    object
title           object
subtitle        object
claps           object
responses       object
story_url       object
dtype: object

In [6]:
# number of distinct (non-missing) values per column
df_raw.nunique(axis=0, dropna=True)

author_url      12090
user_id         12057
date            51985
reading_time       61
title           51762
subtitle        36663
claps            1139
responses          99
story_url       52038
dtype: int64

<div class="alert alert-block alert-info">
<b>NOTES:</b>
- Both author_url and user_id identify the author; keep only user_id and drop author_url.
</div>

In [7]:
# work on a copy of the raw data without the redundant author_url column
df = df_raw.drop('author_url', axis='columns')
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url
0,e9ee1187182a,2021-03-01T13:59:37.090Z,4 min read,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...
1,fa1913854e95,2021-03-01T05:00:14.502Z,11 min read,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...


In [8]:
# show every row whose title already appeared in an earlier row
df[df['title'].duplicated(keep='first')]

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url
1867,6bee41715a0e,2019-08-14T04:23:32.108Z,5 min read,Naive Bayes Explained,-,418,2 responses,https://towardsdatascience.com/naive-bayes-explained-9d2...
3305,dd2051e0a449,2019-03-04T13:35:15.151Z,8 min read,LOGISTIC REGRESSION CLASSIFIER,-,75,3 responses,https://towardsdatascience.com/logistic-regression-class...
3490,d88e5b6154ef,2019-03-12T11:43:12.796Z,9 min read,Is Data Science a science?,-,206,1 response,https://towardsdatascience.com/data-science-and-ai-for-b...
4032,c510ccc9027c,2019-01-06T19:52:31.210Z,8 min read,Know your enemy,How you can create and defend against adversarial attacks,450,3 responses,https://towardsdatascience.com/know-your-enemy-7f7c5038b...
4268,451599b1142a,2019-01-16T14:02:13.827Z,6 min read,Learn Enough Docker to be Useful,Part 2: A Delicious Dozen Docker Terms You Need to Know,6.6K,8 responses,https://towardsdatascience.com/learn-enough-docker-to-be...
...,...,...,...,...,...,...,...,...
50943,120b86134681,2022-12-30T00:45:15.776Z,10 min read,-,"Python Helper Classes for EDA, Feature…",190,1 response,https://towardsdatascience.com/mastering-data-science-wo...
51189,ff8eba68ccde,2021-08-07T17:56:42.568Z,7 min read,Optimization Algorithms for Machine Learning,Chapter-6: Optimization Problems,25,0 responses,https://towardsdatascience.com/optimization-algorithms-f...
51454,d0e73ebdbb0c,2021-08-14T16:48:55.304Z,6 min read,My Google Foobar journey,WALK-THROUGH,3,0 responses,https://towardsdatascience.com/my-google-foobar-journey-...
51508,e101cd051ea3,2021-08-16T21:21:39.531Z,6 min read,How to Handle Missing Data,The fastest multiple imputation method using XGBoost,147,3 responses,https://towardsdatascience.com/how-to-handle-missing-dat...


<div class="alert alert-block alert-info">
<b>NOTES:</b>
- Several titles appear more than once, possibly because authors edited and re-published an initial version. For each set of rows sharing a title, keep only the last one and drop the others.
</div>

In [9]:
# drop rows with duplicated titles, keeping the last occurrence of each title
# (inplace=True mutates df directly rather than creating a new dataframe)
# NOTE(review): the duplicate listing above includes a row whose title is just '-';
# if '-' is a placeholder for a missing title, unrelated posts are collapsed to a
# single row here as well - confirm that this is intended.
df.drop_duplicates(subset='title', keep='last', inplace=True)

# check the output
df.shape

(51762, 8)

In [10]:
# parse the timestamp strings and keep only the calendar date part
df['date'] = pd.to_datetime(df['date']).dt.date

# preview the result
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url
0,e9ee1187182a,2021-03-01,4 min read,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...
1,fa1913854e95,2021-03-01,11 min read,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...


In [11]:
# calendar year of each post, as an integer column
df['year'] = pd.to_datetime(df['date']).dt.year
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year
0,e9ee1187182a,2021-03-01,4 min read,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...,2021
1,fa1913854e95,2021-03-01,11 min read,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...,2021


In [12]:
# calendar month (1-12) of each post, as an integer column
df['month'] = pd.to_datetime(df['date']).dt.month
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month
0,e9ee1187182a,2021-03-01,4 min read,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3
1,fa1913854e95,2021-03-01,11 min read,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...,2021,3


In [13]:
# day of the month of each post, as an integer column
df['day'] = pd.to_datetime(df['date']).dt.day
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day
0,e9ee1187182a,2021-03-01,4 min read,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1
1,fa1913854e95,2021-03-01,11 min read,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1


In [14]:
# extract reading time in minutes and change to integer type
# - take the leading digits instead of slicing off a fixed 9-character suffix,
#   so the result does not depend on the exact length of ' min read'
# - go through str first so the cell can be re-run after the column is already integer
# - a value without leading digits becomes NaN and makes astype fail loudly
df['reading_time'] = (
    df['reading_time']
    .astype(str)
    .str.extract(r'^\s*(\d+)', expand=False)
    .astype('int64')
)
df.head(2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day
0,e9ee1187182a,2021-03-01,4,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1
1,fa1913854e95,2021-03-01,11,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1K,8 responses,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1


In [15]:
# source: https://dorianlazar.medium.com/scraping-medium-with-python-beautiful-soup-3314f898bbf5

def parse_claps(claps_str):
    
    '''
    Function to parse the number of claps.
    A string such as '150' or '1.2K' is converted to an integer; a 'K' in the
    string means the number before it is multiplied by 1000.

    Values that carry no usable count are returned as 0: None, NaN, the empty
    string, and any string longer than 5 characters.
    A value that is already numeric is returned as an integer unchanged, so
    applying the function twice to the same column is harmless.
    
    INPUT:
        claps_str (str) - number of claps as a string
    OUTPUT:
        claps (int) - number of claps as an integer
    '''
    if claps_str is None:
        return 0
    if not isinstance(claps_str, str):
        # Non-string input has no .split(): NaN (the only value not equal to
        # itself) counts as missing, anything else is an already-parsed number.
        if claps_str != claps_str:
            return 0
        return int(claps_str)
    # Empty and over-long strings are treated as unparseable.
    if claps_str == '' or len(claps_str) > 5:
        return 0
    split_claps = claps_str.split('K')
    claps = float(split_claps[0])
    if len(split_claps) == 2:
        # round() instead of int() so floating-point error in the
        # multiplication cannot truncate the result to one below.
        return int(round(claps * 1000))
    return int(claps)

In [16]:
# convert the clap strings (e.g. '1K') to integers
df['claps'] = df['claps'].apply(parse_claps)
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day
0,e9ee1187182a,2021-03-01,4,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1 response,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1
1,fa1913854e95,2021-03-01,11,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1000,8 responses,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1


In [17]:
# keep the number in front of the first space (e.g. '8 responses' -> 8)
df['responses'] = df['responses'].apply(lambda text: int(text.partition(' ')[0]))

# preview the result
df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day
0,e9ee1187182a,2021-03-01,4,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1
1,fa1913854e95,2021-03-01,11,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1000,8,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1


In [18]:
# sample story URL including its '?source=...' query string
test_str = 'https://towardsdatascience.com/customer-segmentation-in-online-retail-1fc707a6f9e6?source=collection_archive---------2-----------------------'
# the article identifier is the last 12 characters in front of the '?'
test_step = test_str.partition('?')[0][-12:]
test_step

'1fc707a6f9e6'

In [19]:
# extract the story id from the story_url information:
# the id is the last 12 characters of the URL once any '?...' query string is removed.
# split('?', 1)[0] gives the same result as slicing up to index('?') when a '?'
# is present, and also works for a URL without a query string, where
# str.index('?') would raise ValueError.
df['story_id'] = df.story_url.apply(lambda url: url.split('?', 1)[0][-12:])

# check the outcome
df.head(2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day,story_id
0,e9ee1187182a,2021-03-01,4,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1,581a9972805f
1,fa1913854e95,2021-03-01,11,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1000,8,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1,7f96042c9809


In [None]:
# drop columns not needed in our analysis
# (currently disabled: 'date' and 'story_url' are kept and still appear in the saved file)
#df.drop(columns=['date', 'story_url'], inplace=True)

In [20]:
# build the text corpus: title, a single space, then the subtitle
df['corpus'] = df.title + (' ' + df.subtitle)

# preview the result
df.head(n=5)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day,story_id,corpus
0,e9ee1187182a,2021-03-01,4,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1,581a9972805f,Stop using numpy.random.seed() How to set random seeds f...
1,fa1913854e95,2021-03-01,11,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1000,8,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1,7f96042c9809,Semi-Automated Exploratory Data Analysis (EDA) in Python...
2,c4539c5f517b,2021-03-01,7,Should You Become a Data Engineer in 2021?,Data Engineering is the new Data Science,172,1,https://towardsdatascience.com/should-you-become-a-data-...,2021,3,1,4db57b6cce35,Should You Become a Data Engineer in 2021? Data Engineer...
3,720e3a4ac60c,2021-03-01,7,Line of Best Fit in Linear Regression,"Correlation Coefficient, Coefficient of determination, M...",415,0,https://towardsdatascience.com/line-of-best-fit-in-linea...,2021,3,1,13658266fbc8,Line of Best Fit in Linear Regression Correlation Coeffi...
4,76c21e75463a,2021-03-01,5,What’s the Difference Between Shallow and Deep Copies in...,copy() vs deepcopy() in Python,130,1,https://towardsdatascience.com/whats-the-difference-betw...,2021,3,1,ceee1e061926,What’s the Difference Between Shallow and Deep Copies in...


In [21]:
# check the structure of the pre-processed dataframe
# (column names and the dtype pandas assigned to each after the parsing steps)
df.dtypes

user_id         object
date            object
reading_time     int64
title           object
subtitle        object
claps            int64
responses        int64
story_url       object
year             int64
month            int64
day              int64
story_id        object
corpus          object
dtype: object

In [22]:
# number of distinct (non-missing) values in each column
df.nunique(axis=0, dropna=True)

user_id         12044
date             1794
reading_time       61
title           51762
subtitle        36477
claps            1136
responses          99
story_url       51762
year                5
month              12
day                31
story_id        51762
corpus          51762
dtype: int64

In [23]:
# count the missing values per column
df.isnull().sum(axis=0)

user_id         0
date            0
reading_time    0
title           0
subtitle        0
claps           0
responses       0
story_url       0
year            0
month           0
day             0
story_id        0
corpus          0
dtype: int64

## Save the Preprocessed Data

In [24]:
# write the preprocessed dataframe to the interim data folder, without the index column
df.to_csv(path_or_buf='data/data_interim/preprocessed_data.csv', index=False)

In [25]:
# read the file back in to confirm it was written as expected
new_df = pd.read_csv(filepath_or_buffer='data/data_interim/preprocessed_data.csv')
new_df.head(n=2)

Unnamed: 0,user_id,date,reading_time,title,subtitle,claps,responses,story_url,year,month,day,story_id,corpus
0,e9ee1187182a,2021-03-01,4,Stop using numpy.random.seed(),How to set random seeds for individual classes in Python,150,1,https://towardsdatascience.com/stop-using-numpy-random-s...,2021,3,1,581a9972805f,Stop using numpy.random.seed() How to set random seeds f...
1,fa1913854e95,2021-03-01,11,Semi-Automated Exploratory Data Analysis (EDA) in Python,Comprehensive Data Exploration Process with…,1000,8,https://towardsdatascience.com/semi-automated-explorator...,2021,3,1,7f96042c9809,Semi-Automated Exploratory Data Analysis (EDA) in Python...
