In [65]:
import pandas as pd
import numpy as np
from datetime import datetime
from google.cloud import storage # save the model to GCS

In [122]:
# Data stored in GCP, bucket name: 'gold-price-prediction' van TawabG
gold_dataset = pd.read_csv("gs://gold-price-prediction/gold_dataset.csv") 

In [99]:
def pre_processing(gold_dataset):
    
    gold_dataset = gold_dataset.dropna()
    gold_dataset['date'] = pd.to_datetime(gold_dataset['date'], errors='coerce')
    gold_dataset['date'] = gold_dataset['date'].dt.date
    gold_dataset = gold_dataset.set_index('date')
    
    return gold_dataset

In [181]:
preprocessed_gold_dataset = pre_processing(gold_dataset)
preprocessed_gold_dataset['price'].std()

87.64811045891085

In [191]:
def create_features_dataframe(dataset):
    
    gold_features_df = pd.DataFrame()
    
    average_days_window_closing_price = [5, 30]
    for window in average_days_window_closing_price:
        gold_features_df['Mean__'+str(window)+'_days'] = dataset['price'].rolling(window).mean().shift(periods=1)
        gold_features_df['Std__'+str(window)+'_days'] = dataset['price'].rolling(window).std().shift(periods=1)
        gold_features_df['Max__'+str(window)+'_days'] = dataset['price'].rolling(window).max().shift(periods=1)
        gold_features_df['Min__'+str(window)+'_days'] = dataset['price'].rolling(window).min().shift(periods=1)
        gold_features_df['Min__'+str(window)+'_days'] = dataset['price'].rolling(window).sum().shift(periods=1)
        
    # get day of the week
    dataset['date'] = dataset.index
    dataset['day_of_week'] = dataset['date'].apply(lambda x: x.strftime("%A"))

    # get quarter
    dataset['month'] = dataset['date'].apply(lambda x: x.strftime("%B"))
    
    gold_features_df = pd.concat([gold_features_df, pd.get_dummies(dataset['day_of_week']), pd.get_dummies(dataset['month'])], 1)

    return gold_features_df

In [192]:
features_df = create_features_dataframe(preprocessed_gold_dataset)
features_df.tail()

Unnamed: 0_level_0,Mean__5_days,Std__5_days,Max__5_days,Min__5_days,Mean__30_days,Std__30_days,Max__30_days,Min__30_days,Friday,Monday,...,August,February,January,July,June,March,May,November,October,September
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2020-11-15,1589.20476,3.737047,1594.5946,7946.0238,1614.277527,16.285805,1647.2692,48428.3258,0,0,...,0,0,0,0,0,0,0,1,0,0
2020-11-16,1589.57522,3.656623,1594.5946,7947.8761,1613.091833,16.748098,1647.2692,48392.755,0,1,...,0,0,0,0,0,0,0,1,0,0
2020-11-17,1590.62248,5.656514,1599.8309,7953.1124,1612.2447,16.754918,1647.2692,48367.341,0,0,...,0,0,0,0,0,0,0,1,0,0
2020-11-18,1591.12462,5.026801,1599.8309,7955.6231,1610.9622,17.192327,1647.2692,48328.866,0,0,...,0,0,0,0,0,0,0,1,0,0
2020-11-19,1589.53628,6.606258,1599.8309,7947.6814,1609.663137,17.889077,1647.2692,48289.8941,0,0,...,0,0,0,0,0,0,0,1,0,0
