# Import required dependencies

In [2]:
import os
import pandas as pd
import numpy as np
from google.colab import drive

  from pandas_profiling import ProfileReport


# Mount Google Drive

In [3]:
# Mount Google Drive into the Colab filesystem at /content/drive so that the
# dataset folders under /content/drive/MyDrive can be read and written like local paths.
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Folder layout on the mounted drive: raw auxiliary CSVs live under
# Datasets/auxiliary-data and the transformed outputs go to Datasets/transformed.
drive_path = "/content/drive/MyDrive"
dataset_path = f"{drive_path}/Datasets"
aux_dataset_path = f"{dataset_path}/auxiliary-data"
transformed_dataset_path = f"{dataset_path}/transformed"

# Each dataset keeps the same file name in the input and the output folder.
stock_inp_path, stock_out_path = (
    f"{folder}/sg-stock-prices.csv" for folder in (aux_dataset_path, transformed_dataset_path)
)
coe_inp_path, coe_out_path = (
    f"{folder}/sg-coe-prices.csv" for folder in (aux_dataset_path, transformed_dataset_path)
)

# Transform the auxiliary datasets to get the economic indicators

In [7]:
# Lookup from lower-case English month name to its zero-padded two-digit
# month number as a string ("january" -> "01", ..., "december" -> "12").
month_to_num_map = {
    month_name: f"{position:02d}"
    for position, month_name in enumerate(
        (
            "january",
            "february",
            "march",
            "april",
            "may",
            "june",
            "july",
            "august",
            "september",
            "october",
            "november",
            "december",
        ),
        start=1,
    )
}

def transform_coe_prices(inp_path, out_path, start_month="2021-01", end_month="2023-12"):
    '''
    Transform the original sg-coe-prices.csv into a monthly COE price indicator table and save it as a csv

    The output has exactly the following columns

    date, coe_price_indicator

    Where date is of format "year-month" (e.g. "2021-01") and coe_price_indicator is a number.
    There is one row for every month from start_month to end_month, both inclusive.

    Processing steps:
    1. indicator per input row = (bids / quota) * price
    2. the row indicators are min-max scaled over the whole input file
    3. the scaled indicator is averaged per (date, category) and those averages are summed per date
    4. a month without any input row gets the mean of the monthly values of the same year,
       or 0 when that year has no data at all

    :param inp_path: input csv path; the file must contain the columns
                     year, month (English month name), category, bids, quota and price
    :param out_path: output csv path for storing the new dataframe
    :param start_month: first month of the output, formatted "YYYY-MM"
    :param end_month: last month of the output (inclusive), formatted "YYYY-MM"
    :return: transformed dataframe
    '''
    coe_prices_df = pd.read_csv(inp_path)

    # Month names are matched case-insensitively so "January" and "january" both become "01"
    month_numbers = coe_prices_df["month"].astype(str).str.strip().str.lower().map(month_to_num_map)
    coe_prices_df["date"] = pd.to_datetime(
        coe_prices_df["year"].astype(str) + "-" + month_numbers.astype(str), format="%Y-%m"
    ).dt.strftime("%Y-%m")

    # getting price indicator for each row
    coe_prices_df["coe_price_indicator"] = (coe_prices_df["bids"] / coe_prices_df["quota"]) * coe_prices_df["price"]

    # min-max scale the price indicator values
    indicator_min = coe_prices_df["coe_price_indicator"].min()
    indicator_max = coe_prices_df["coe_price_indicator"].max()
    coe_prices_df["coe_price_indicator"] = (coe_prices_df["coe_price_indicator"] - indicator_min) / (
        indicator_max - indicator_min
    )

    # average per (month, category), then add the category averages up per month
    avg_price_indicator_per_month_per_category = (
        coe_prices_df.groupby(["date", "category"])["coe_price_indicator"].mean().reset_index()
    )
    total_price_indicator_per_month = (
        avg_price_indicator_per_month_per_category.groupby(["date"])["coe_price_indicator"].sum().reset_index()
    )

    # Every month from start_month to end_month inclusive.
    # freq="MS" (month start) is required here: with month-END frequency the end bound
    # (the first day of end_month) lies before that month's end, so end_month itself would be dropped.
    all_months = pd.date_range(
        start=pd.to_datetime(start_month, format="%Y-%m"),
        end=pd.to_datetime(end_month, format="%Y-%m"),
        freq="MS",
    ).strftime("%Y-%m")
    date_df = pd.DataFrame({"date": all_months})

    # Left-merge onto the full month list so months without data show up as NaN
    merged_df = date_df.merge(total_price_indicator_per_month, on="date", how="left")

    # Mean monthly indicator per calendar year, used to fill the months without data
    year_of_known_month = total_price_indicator_per_month["date"].str.split("-").str[0]
    average_price_by_year = total_price_indicator_per_month.groupby(year_of_known_month)["coe_price_indicator"].mean()

    # Missing months take their year's average; years with no data at all fall back to 0
    year_of_row = merged_df["date"].str.split("-").str[0]
    fill_values = year_of_row.map(average_price_by_year).fillna(0)
    merged_df["coe_price_indicator"] = merged_df["coe_price_indicator"].fillna(fill_values)

    # save this df into the output file
    merged_df.to_csv(out_path, index=False)

    # return the final df
    return merged_df

def transform_stock_prices(inp_path, out_path, start_month="2021-01", end_month="2023-12"):
    '''
    Transform the original sg-stock-prices.csv into a monthly stock price indicator table and save it as a csv

    The output has exactly the following columns

    date, stock_price

    Where date is of format "year-month" (e.g. "2021-01") and stock_price is a number.
    There is one row for every month from start_month to end_month, both inclusive.

    Processing steps:
    1. rows dated before the year of start_month are discarded
    2. adjusted_close is min-max scaled over the remaining rows
    3. the scaled prices are summed per (name, date) and those sums are averaged across names per date
    4. a month without any input row gets the mean of the monthly values of the same year,
       or 0 when that year has no data at all

    :param inp_path: input csv path; the file must contain the columns
                     symbol, name, date, open, high, low, close and adjusted_close
    :param out_path: output csv path for storing the new dataframe
    :param start_month: first month of the output, formatted "YYYY-MM"
    :param end_month: last month of the output (inclusive), formatted "YYYY-MM"
    :return: transformed dataframe
    '''
    start_date = pd.to_datetime(start_month, format="%Y-%m")
    end_date = pd.to_datetime(end_month, format="%Y-%m")

    stock_prices_df = pd.read_csv(inp_path)

    stock_prices_df["stock_price"] = stock_prices_df["adjusted_close"]
    stock_prices_df["date"] = pd.to_datetime(stock_prices_df["date"])

    # Keep only rows from the first output year onwards.
    # .copy() makes the filtered frame independent, so the column assignment below
    # does not raise SettingWithCopyWarning.
    stock_prices_df = stock_prices_df[stock_prices_df["date"].dt.year >= start_date.year].copy()

    # Collapse the daily timestamps to "YYYY-MM" month labels
    stock_prices_df["date"] = stock_prices_df["date"].dt.strftime("%Y-%m")

    # drop unnecessary columns
    stock_prices_df = stock_prices_df.drop(columns=["symbol", "open", "high", "low", "close", "adjusted_close"])

    # min-max scale the stock price values
    stock_price_min = stock_prices_df["stock_price"].min()
    stock_price_max = stock_prices_df["stock_price"].max()
    stock_prices_df["stock_price"] = (stock_prices_df["stock_price"] - stock_price_min) / (
        stock_price_max - stock_price_min
    )

    # Total scaled stock price per company per month
    stock_prices_for_company_for_month_df = (
        stock_prices_df.groupby(["name", "date"])["stock_price"].sum().reset_index()
    )

    # Average of the company totals for each month
    avg_stock_price_per_month = (
        stock_prices_for_company_for_month_df.groupby(["date"])["stock_price"].mean().reset_index()
    )

    # Every month from start_month to end_month inclusive.
    # freq="MS" (month start) is required here: with month-END frequency the end bound
    # (the first day of end_month) lies before that month's end, so end_month itself would be dropped.
    all_months = pd.date_range(start=start_date, end=end_date, freq="MS").strftime("%Y-%m")
    date_df = pd.DataFrame({"date": all_months})

    # Left-merge onto the full month list so months without data show up as NaN
    merged_df = date_df.merge(avg_stock_price_per_month, on="date", how="left")

    # Mean monthly value per calendar year, used to fill the months without data
    year_of_known_month = avg_stock_price_per_month["date"].str.split("-").str[0]
    average_price_by_year = avg_stock_price_per_month.groupby(year_of_known_month)["stock_price"].mean()

    # Missing months take their year's average; years with no data at all fall back to 0
    year_of_row = merged_df["date"].str.split("-").str[0]
    fill_values = year_of_row.map(average_price_by_year).fillna(0)
    merged_df["stock_price"] = merged_df["stock_price"].fillna(fill_values)

    # save this df into the output file
    merged_df.to_csv(out_path, index=False)

    # return the final df
    return merged_df




# Save the transformed files to Google Drive

In [10]:
# Run the COE transformation: the function writes the transformed csv to coe_out_path
# and returns the frame, which is kept under a name so later cells can reuse it.
coe_prices_transformed = transform_coe_prices(coe_inp_path, coe_out_path)

# Preview only the first rows instead of displaying the whole frame
coe_prices_transformed.head(10)


Unnamed: 0,date,coe_price_indicator
0,2021-01,0.251816
1,2021-02,0.208422
2,2021-03,0.185832
3,2021-04,0.572866
4,2021-05,0.424839
5,2021-06,0.454947
6,2021-07,0.425407
7,2021-08,0.476939
8,2021-09,0.559383
9,2021-10,0.849635


In [11]:
# Run the stock price transformation: the function writes the transformed csv to stock_out_path
# and returns the frame, which is kept under a name so later cells can reuse it.
stock_prices_transformed = transform_stock_prices(stock_inp_path, stock_out_path)

# Preview only the first rows instead of displaying the whole frame
stock_prices_transformed.head(10)

Unnamed: 0,date,stock_price
0,2021-01,0.665302
1,2021-02,0.713294
2,2021-03,0.830401
3,2021-04,0.801463
4,2021-05,0.727715
5,2021-06,0.866463
6,2021-07,0.816763
7,2021-08,0.887964
8,2021-09,0.888712
9,2021-10,0.98898
