# Set up environment

In [153]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import math
import warnings
import joblib
warnings.filterwarnings('ignore')
import os


In [154]:
# Mount Google Drive (Colab only) so the project folder under
# "/content/drive/My Drive" is reachable from this runtime.
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


# Get raw data

In [155]:
# Token symbols; the selected one determines the raw file name below.
tokens = ["btc", "eth", "xrp"]
# Split codes, used as output sub-folder names.
# NOTE(review): the digits look like train/test/validation shares in tenths
# ("721" -> 70/20/10) — confirm.
train_test_split = ["721", "622", "811"]

# Project locations on the mounted Google Drive.
prj_path = "/content/drive/My Drive/Report_DBA/final_report"
data_path = os.path.join(prj_path, "data/")
prj_path_opt= os.path.join(prj_path, "optimize_hyperparam/opt_results/")
processed_output_dir = os.path.join(prj_path,  "processing/")

# Make sure the output root exists before any cell writes into it.
os.makedirs(processed_output_dir, exist_ok = True)
# Column(s) of interest; targets[0] is used in the output file names.
targets = ["Close"]

# Maps the integer passed to normalizationMinMax to a dataset name.
data_set = {1: "train", 2: "test", 3: "validation"}
# Index into `tokens` selecting the token to process.
token_id = 0
# Index into `train_test_split` selecting the split code (uncomment one).
n_split = 0 # 721
# n_split = 1 # 622
# n_split = 2 # 811

# Raw input file of the selected token.
raw_data_file=f"{tokens[token_id]}.csv"

In [156]:
# Make the project folder the current working directory.
%cd $prj_path

/content/drive/My Drive/Report_DBA/final_report


In [157]:
# Raw data loader (the source file is a CSV, not an xlsx workbook)
def getData(csv_path=None, columns=None):
  """
  Load the raw price CSV and keep only the requested column(s).

  Params:
      * csv_path -- str | None:   CSV file to read. When omitted, the file of
                                  the currently selected token is used
                                  (`data_path + raw_data_file`).
      * columns -- list | None:   Columns to keep. Defaults to ["Close"].
  Returns:
      * DataFrame indexed by the "Date" column, containing only `columns`
  Raises:
      * KeyError: a requested column is not present in the file
  """
  # Resolve the defaults at call time so that changing the notebook
  # configuration and re-running only this call picks up the new file.
  if csv_path is None:
    csv_path = data_path + raw_data_file
  if columns is None:
    columns = ["Close"]

  df = pd.read_csv(csv_path, index_col="Date")

  # Selecting with a list always yields a DataFrame, even for a single column.
  df_target = df[list(columns)]
  return df_target


# Data pre-processing

In [158]:
# Load the Close-price series of the selected token.
df = getData()

In [159]:
# Preview the first rows of the series.
df.head()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2017-12-01,10975.599609
2017-12-02,11074.599609
2017-12-03,11323.200195
2017-12-04,11657.200195
2017-12-05,11916.700195


In [160]:
# Preview the last rows of the series.
df.tail()

Unnamed: 0_level_0,Close
Date,Unnamed: 1_level_1
2023-05-28,28085.646484
2023-05-29,27745.884766
2023-05-30,27702.349609
2023-05-31,27219.658203
2023-06-01,26925.988281


In [161]:
# Index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2009 entries, 2017-12-01 to 2023-06-01
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Close   2009 non-null   float64
dtypes: float64(1)
memory usage: 31.4+ KB


In [162]:
# Summary statistics of the Close column.
df.describe()

Unnamed: 0,Close
count,2009.0
mean,20615.523264
std,16287.001871
min,3236.761719
25%,8079.862793
50%,11966.407227
75%,30323.722656
max,67566.828125


In [163]:
# Count the missing values in each column.
df.isna().sum()

Close    0
dtype: int64

--> Observation: the `Close` column has no missing data.

## Normalization

In [164]:
def normalizationMinMax(df, data_set_index):
  """
  Min-max scale one split of the data and save the result to disk.

  The scaler is fitted on the train split only and dumped with joblib. The
  test and validation splits load that dump, so every split is scaled with
  the statistics of the train split.

  Params:
      * df -- DataFrame:        Dataframe that needs to be scaled
      * data_set_index -- int:  Key of `data_set` telling whether df is the
                                train (1), test (2) or validation (3) split
  Returns:
      * df_scaled:  df after normalization (same columns as df)
      * scaler:     the scaler that was applied, for future inverse-transform
  Raises:
      * KeyError:           data_set_index is not a key of `data_set`
      * FileNotFoundError:  a non-train split is requested before the train
                            scaler has been saved

  """
  norm_set = data_set[data_set_index]

  # All files of this run live in <processed_output_dir>/<token>/<split>/.
  # Create the whole path: creating only the token folder is not enough,
  # because the files below are written into the split sub-folder.
  processes_folder = os.path.join(processed_output_dir, tokens[token_id], train_test_split[n_split])
  os.makedirs(processes_folder, exist_ok=True)

  train_scaler_file = os.path.join(processes_folder, f"{targets[0]}_train_scalerMinMaxNorm.save")

  if norm_set == "train":
    # Fit on the train split only and persist the fitted scaler.
    scaler = MinMaxScaler()
    scaler.fit(df)
    joblib.dump(scaler, train_scaler_file)
    print(os.path.exists(train_scaler_file))
  else:
    # Reuse the scaler fitted on the train split.
    # joblib.load unpickles the file: only load dumps written by this notebook.
    scaler = joblib.load(train_scaler_file)

  series = scaler.transform(df)
  # NOTE: the index of df is not carried over; df_scaled gets a fresh 0..n-1 index.
  df_scaled = pd.DataFrame(data=series, columns=df.columns)

  out_file = os.path.join(processes_folder, f"{targets[0]}_{norm_set}_scaler.xlsx")
  df_scaled.to_excel(out_file, float_format='%.5f')
  print(out_file)
  return df_scaled, scaler

# Start Process

## Split train + test + validation

In [165]:
n_observations = df.shape[0]

# Take the ratios from the selected split code so that the data written to the
# "<token>/<split>/" folder really matches the folder name.
# Assumes the digits are the train/test/validation shares in tenths
# ("721" -> 0.7 / 0.2 / 0.1) — TODO confirm.
split_code = train_test_split[n_split]
train_ratio = int(split_code[0]) / 10
test_ratio = int(split_code[1]) / 10

train_size = int(n_observations*train_ratio)
test_size = int(n_observations*test_ratio)
# Validation takes whatever is left, so the three parts always cover every row.
val_size = n_observations - train_size - test_size

# Positional split in row order: train first, then test, then validation.
train = df.iloc[:train_size]
test = df.iloc[train_size: train_size + test_size]
# Slice from the end of the test part; df[-val_size:] would return the whole
# frame if val_size were 0.
val = df.iloc[train_size + test_size:]

assert len(val) == val_size
assert len(train) + len(test) + len(val) == n_observations

In [166]:
# Min-max scale every split. The train call must run first: it fits and saves
# the scaler that the test and validation calls load from disk.
train_scaled, scaler = normalizationMinMax(df=train, data_set_index=1)
test_scaled, scaler = normalizationMinMax(df=test, data_set_index=2)
val_scaled, scaler = normalizationMinMax(df=val, data_set_index=3)

True
/content/drive/My Drive/Report_DBA/final_report/processing/btc/721/Close_train_scaler.xlsx
/content/drive/My Drive/Report_DBA/final_report/processing/btc/721/Close_test_scaler.xlsx
/content/drive/My Drive/Report_DBA/final_report/processing/btc/721/Close_validation_scaler.xlsx
