In [91]:
import pandas as pd
import os
import sys
from ucimlrepo import fetch_ucirepo 

def download_raw_dataset():
    """
    Fetch the UCI Bank Marketing dataset as a single DataFrame.

    Retrieves dataset id 222 through ``fetch_ucirepo`` and places the target
    column(s) to the right of the feature columns.

    Returns:
        pd.DataFrame: Features and target(s) concatenated column-wise.
    """
    # Repository entry for the Bank Marketing dataset (id 222)
    uci_entry = fetch_ucirepo(id=222)
    payload = uci_entry.data

    # Column-wise join: features first, then targets
    return pd.concat([payload.features, payload.targets], axis=1)

In [92]:
# Download the Bank Marketing data as one DataFrame (features plus target)
bank_full = download_raw_dataset()

In [93]:
# Show the downloaded frame (the rendered output elides the middle rows)
bank_full

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,,no


In [95]:
def export_raw_dataset():
    """
    Download the Bank Marketing dataset and save it as a CSV file.

    The file is written to ``data/raw/bank_marketing.csv`` under the parent of
    the current working directory. The target directory is created when it
    does not exist yet, so the export also works on a fresh checkout.

    Returns:
        None. The destination path is printed after the file is written.
    """
    # Download the dataset (features and target combined)
    bank_data = download_raw_dataset()

    # Destination folder: <parent of cwd>/data/raw
    directory = os.path.join(os.path.dirname(os.getcwd()), 'data/raw')

    # Ensure the directory exists; to_csv does not create missing folders
    os.makedirs(directory, exist_ok=True)

    # Save to CSV without the row index
    file_path = os.path.join(directory, 'bank_marketing.csv')
    bank_data.to_csv(file_path, index=False)

    print(f"Dataset saved to '{file_path}'")


In [96]:
# Download the data and write it to data/raw/bank_marketing.csv under the parent of the working directory
export_raw_dataset()

Dataset saved to '/Users/jiaquanlim/UBC_MDS/block_5/001_532_viz-2/DSCI-532_2025_12_bank-marketing/data/raw/bank_marketing.csv'


In [102]:
# Load the raw CSV from ../data/raw as the starting frame for preprocessing
bank_prep = pd.read_csv("../data/raw/bank_marketing.csv")

In [103]:
# Show the re-loaded frame (the rendered output elides the middle rows)
bank_prep

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45206,51,technician,married,tertiary,no,825,no,no,cellular,17,nov,977,3,-1,0,,yes
45207,71,retired,divorced,primary,no,1729,no,no,cellular,17,nov,456,2,-1,0,,yes
45208,72,retired,married,secondary,no,5715,no,no,cellular,17,nov,1127,5,184,3,success,yes
45209,57,blue-collar,married,secondary,no,668,no,no,telephone,17,nov,508,4,-1,0,,no


In [104]:
# Occupations that collapse into the single "Employed" bucket
employed_jobs = [
    "admin.",
    "management",
    "housemaid",
    "entrepreneur",
    "blue-collar",
    "self-employed",
    "technician",
    "services",
]

# Raw job label -> coarse employment status
job_mapping = dict.fromkeys(employed_jobs, "Employed")
job_mapping.update(
    {
        "unemployed": "Unemployed",
        "unknown": "Unknown",
        "student": "Student",
        "retired": "Retired",
    }
)

# Recode the labels, then treat missing jobs as "Unknown"
job_recoded = bank_prep["job"].replace(job_mapping)
bank_prep["job_prep"] = job_recoded.fillna("Unknown")
bank_prep.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y,job_prep
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,Employed
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no,Employed
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no,Employed
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no,Employed
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no,Unknown


In [105]:
# Year assigned to the first row; every later year is derived from the row order.
# NOTE(review): this assumes the rows are in chronological order — confirm that
# the CSV preserves the order of the original dataset.
START_YEAR = 2008

# A new year starts on each row whose month is "jan" while the row directly
# above it is "dec". shift() lines every row up with its predecessor, which
# replaces the explicit Python loop over all rows with vectorised comparisons.
months = bank_prep["month"]
year_rollover = months.eq("jan") & months.shift().eq("dec")

# The running count of rollovers is the number of years elapsed since START_YEAR.
# NOTE(review): a jump from "dec" straight to a month other than "jan" does not
# advance the year — verify that January rows follow December for every year.
bank_prep["year"] = START_YEAR + year_rollover.cumsum()

bank_prep.head()


Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day_of_week,month,duration,campaign,pdays,previous,poutcome,y,job_prep,year
0,58,management,married,tertiary,no,2143,yes,no,,5,may,261,1,-1,0,,no,Employed,2008
1,44,technician,single,secondary,no,29,yes,no,,5,may,151,1,-1,0,,no,Employed,2008
2,33,entrepreneur,married,secondary,no,2,yes,yes,,5,may,76,1,-1,0,,no,Employed,2008
3,47,blue-collar,married,,no,1506,yes,no,,5,may,92,1,-1,0,,no,Employed,2008
4,33,,single,,no,1,no,no,,5,may,198,1,-1,0,,no,Unknown,2008


In [106]:
# Save the preprocessed frame to <parent of cwd>/data/processed/prep_bank_marketing.csv
directory = os.path.join(os.path.dirname(os.getcwd()), 'data/processed')

# Create the output folder when it is missing; to_csv fails on a non-existent directory
os.makedirs(directory, exist_ok=True)

file_path = os.path.join(directory, 'prep_bank_marketing.csv')
bank_prep.to_csv(file_path, index=False)