## Initialize Cleaning

### Import Packages and Raw_data

In [183]:
import pandas as pd
from functools import wraps
import datetime as dt
import numpy as np
import glob

### Import File

In [184]:
# set mode 
# "sample" - onl one file
# "all" - all files in partial

mode = "all"

path = '../../data/partial/' 

if mode == "sample":
    init_df = pd.read_csv(f"{path}articles_partial_1.csv.zip", compression="zip")
    file_name = "sample_file"
elif mode == "all":
        file_name = "all_articles"
        print("All Mode")
        init_df = pd.DataFrame()
        csv_files = glob.glob(os.path.join(path, "*.csv.zip"))
        print(csv_files)
        for f in csv_files:
            
            # read the csv file
            df = pd.read_csv(f, compression="zip")
            
            # print the location and filename
            print('Location:', f)
            print('File Name:', f.split("\\")[-1])
            
            init_df = pd.concat([init_df, df])
else: 
    raise ValueError("No or unknown is set. Please choose a mode 'sample' or 'all' ")


init_df.info()


All Mode
['../../data/partial/articles_partial_7.csv.zip', '../../data/partial/articles_partial_14_1.csv.zip', '../../data/partial/articles_partial_12_1.csv.zip', '../../data/partial/articles_partial_4.csv.zip', '../../data/partial/articles_partial_5.csv.zip', '../../data/partial/articles_partial_13_1.csv.zip', '../../data/partial/articles_partial_9.csv.zip', '../../data/partial/articles_partial_8.csv.zip', '../../data/partial/articles_partial_10_1.csv.zip', '../../data/partial/articles_partial_1.csv.zip', '../../data/partial/articles_partial_3.csv.zip', '../../data/partial/articles_partial_2.csv.zip']


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_7.csv.zip
File Name: ../../data/partial/articles_partial_7.csv.zip
Location: ../../data/partial/articles_partial_14_1.csv.zip
File Name: ../../data/partial/articles_partial_14_1.csv.zip


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_12_1.csv.zip
File Name: ../../data/partial/articles_partial_12_1.csv.zip
Location: ../../data/partial/articles_partial_4.csv.zip
File Name: ../../data/partial/articles_partial_4.csv.zip
Location: ../../data/partial/articles_partial_5.csv.zip
File Name: ../../data/partial/articles_partial_5.csv.zip
Location: ../../data/partial/articles_partial_13_1.csv.zip
File Name: ../../data/partial/articles_partial_13_1.csv.zip


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_9.csv.zip
File Name: ../../data/partial/articles_partial_9.csv.zip


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_8.csv.zip
File Name: ../../data/partial/articles_partial_8.csv.zip


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_10_1.csv.zip
File Name: ../../data/partial/articles_partial_10_1.csv.zip


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_1.csv.zip
File Name: ../../data/partial/articles_partial_1.csv.zip
Location: ../../data/partial/articles_partial_3.csv.zip
File Name: ../../data/partial/articles_partial_3.csv.zip


  df = pd.read_csv(f, compression="zip")


Location: ../../data/partial/articles_partial_2.csv.zip
File Name: ../../data/partial/articles_partial_2.csv.zip
<class 'pandas.core.frame.DataFrame'>
Int64Index: 352526 entries, 0 to 32314
Data columns (total 33 columns):
 #   Column                    Non-Null Count   Dtype  
---  ------                    --------------   -----  
 0   Unnamed: 0                352526 non-null  int64  
 1   id                        352526 non-null  object 
 2   type                      352526 non-null  object 
 3   sectionName               352526 non-null  object 
 4   webPublicationDate        352526 non-null  object 
 5   webTitle                  352526 non-null  object 
 6   isHosted                  352526 non-null  bool   
 7   pillarName                349648 non-null  object 
 8   headline                  352526 non-null  object 
 9   trailText                 352522 non-null  object 
 10  byline                    342768 non-null  object 
 11  wordcount                 352526 non-null  i

### Logging

In [185]:
def log_step(func):
    @wraps(func)
    def wrapper(*args, **kwargs):
        tic = dt.datetime.now()
        result = func(*args, **kwargs)
        time_taken = str(dt.datetime.now() - tic)
        print(f"{func.__name__}:\n shape={result.shape} took {time_taken}s\n")
        return result

    return wrapper

## Pipeline - Functions

### Start Pipeline

In [186]:
@log_step
def init_pipeline(df):
    return df.copy()

In [201]:
def drop_columns(df):
    df = df.drop(columns=[
        "contributorBio",
        "displayHint",
        "allowUgc",
        "membershipAccess"
        ]
        ) 
    
    return df

### Remove duplicates

In [188]:
@log_step
def remove_duplicates(df):
        return df.drop_duplicates()

### Missing Values

In [189]:
@log_step
def missing_values(df):
    df = df.replace("NaN", np.nan)
    
    # String
    df["trailText"] = df["trailText"].fillna("")
    df["bodyText"] = df["bodyText"].fillna("")
    df["byline"] = df["byline"].fillna("unknown")
    df["pillarName"] = df["pillarName"].fillna("None")

    # Numerical 
    df["starRating"] = df["starRating"].fillna(99)
    df["newspaperPageNumber"] = df["newspaperPageNumber"].fillna(0)

    #Boolean
    df["liveBloggingNow"] = df["liveBloggingNow"].fillna(False)
    df["commentable"] = df["commentable"].fillna(False)
    df["legallySensitive"] = df["legallySensitive"].fillna(False)
    df["isLive"] = df["isLive"].fillna(False)
    df["sensitive"] = df["sensitive"].fillna(False)

    # Dates
    df["firstPublicationDate"] = df["firstPublicationDate"].apply(lambda x: x["webPublicationDate"] if x == np.nan else x)

    df["commentCloseDate"] = df["commentCloseDate"].fillna(dt.datetime.strptime("1900-01-01", "%Y-%m-%d"))
    df["newspaperEditionDate"] = df["newspaperEditionDate"].fillna(dt.datetime.strptime("1900-01-01", "%Y-%m-%d"))
    df["scheduledPublicationDate"] = df["scheduledPublicationDate"].fillna(dt.datetime.strptime("1900-01-01", "%Y-%m-%d"))
    

    return df

### Adjust data types

In [190]:
@log_step
def adjust_data_types(df):
       
   # Strings
   df["id"] = df["id"].astype("string")
   df["byline"] = df["byline"].astype("string")
   df["trailText"] = df["trailText"].astype("string")
   df["sectionName"] = df["sectionName"].astype("string")
   df["type"] = df["type"].astype("string")
   df["webTitle"] = df["webTitle"].astype("string")
   df["pillarName"] = df["pillarName"].astype("string")
   df["headline"] = df["headline"].astype("string")
   df["bodyText"] = df["bodyText"].astype("string")
   
   # Numerical
   df["wordcount"] = df["wordcount"].astype("int")


   # Categorical
   df["productionOffice"] = df["productionOffice"].astype("category")
   

   # boolean
   df["isLive"] = df["isLive"].map({"true": True, "false": False})
   df["commentable"] = df["commentable"].map({"true": True, np.nan: False})
   df["liveBloggingNow"] = df["liveBloggingNow"].map({"true": True, np.nan: False})
   df["legallySensitive"] = df["legallySensitive"].map({"true": True, np.nan: False})
   df["isLive"] = df["isLive"].map({"true": True, np.nan: False})
   
   # Datetimes
   df["webPublicationDate"] = pd.to_datetime(df["webPublicationDate"])
   df["firstPublicationDate"] = pd.to_datetime(df["firstPublicationDate"])
   
   
   df["commentCloseDate"] = pd.to_datetime(df["commentCloseDate"])
   df["newspaperEditionDate"] = pd.to_datetime(df["newspaperEditionDate"])
   df["scheduledPublicationDate"] = pd.to_datetime(df["scheduledPublicationDate"])

   
   
   return df

### CSV

In [191]:
@log_step
def create_csv(df):
    df.to_csv(f"../../data/clean/{file_name}_clean.csv.zip", index=False, compression="zip")
    df.info()
    return df

## Run Cleaning

In [202]:
articles = (
init_df
    .pipe(init_pipeline)       
    .pipe(drop_columns)       
    .pipe(remove_duplicates)              
    .pipe(adjust_data_types)    
    .pipe(missing_values)    
    .pipe(create_csv)      
)


init_pipeline:
 shape=(352526, 33) took 0:00:00.505567s

remove_duplicates:
 shape=(352526, 29) took 0:00:02.455911s

adjust_data_types:
 shape=(352526, 29) took 0:00:02.512935s

missing_values:
 shape=(352526, 29) took 0:00:03.224134s

<class 'pandas.core.frame.DataFrame'>
Int64Index: 352526 entries, 0 to 32314
Data columns (total 29 columns):
 #   Column                    Non-Null Count   Dtype              
---  ------                    --------------   -----              
 0   Unnamed: 0                352526 non-null  int64              
 1   id                        352526 non-null  string             
 2   type                      352526 non-null  string             
 3   sectionName               352526 non-null  string             
 4   webPublicationDate        352526 non-null  datetime64[ns, UTC]
 5   webTitle                  352526 non-null  string             
 6   isHosted                  352526 non-null  bool               
 7   pillarName                352526 non

In [200]:
articles.iloc[:,10:].isna().sum()

byline                           0
wordcount                        0
firstPublicationDate            58
lastModified                     0
productionOffice                 0
publication                      0
legallySensitive                 0
isLive                           0
bodyText                         0
commentCloseDate                 0
commentable                      0
newspaperPageNumber              0
newspaperEditionDate             0
sensitive                        0
liveBloggingNow                  0
starRating                       0
membershipAccess            352433
tagWebTitle                      0
tagId                            0
scheduledPublicationDate         0
dtype: int64

In [199]:
articles.sensitive.sample(20)

10570    False
30496    False
375      False
25433    False
11380    False
17875    False
14610    False
17594    False
23677    False
31502    False
9186      True
20955    False
11359    False
10328    False
9470     False
20589    False
26463    False
18880    False
7694      True
26974    False
Name: sensitive, dtype: bool