In [58]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, MinMaxScaler


### Export Data

In [59]:
df = pd.read_parquet("./stock_1d.parquet")
df.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,2013-01-02,94.190002,94.790001,93.959999,94.779999,67.895119,3206700.0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,2013-01-03,94.339996,94.93,94.129997,94.669998,67.816322,2704600.0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
2,2013-01-04,94.790001,95.480003,94.540001,95.370003,68.317757,2704900.0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
3,2013-01-07,95.019997,95.730003,94.760002,95.489998,68.403717,2745800.0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
4,2013-01-08,95.169998,95.75,95.099998,95.5,68.410889,2655500.0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902


In [60]:
df1 = pd.read_parquet("./stock_1h.parquet")
df1.head()

Unnamed: 0,index,Open,High,Low,Close,Adj Close,Volume,Symbol,Date,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,2022-01-03 09:30:00-05:00,178.320007,179.089996,176.429993,177.214996,177.214996,384623.0,MMM,,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,2022-01-03 10:30:00-05:00,177.179993,177.199997,176.244995,176.820007,176.820007,158291.0,MMM,,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
2,2022-01-03 11:30:00-05:00,176.759995,176.785004,176.139999,176.529999,176.529999,125963.0,MMM,,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
3,2022-01-03 12:30:00-05:00,176.570007,176.580002,175.839996,176.440002,176.440002,123903.0,MMM,,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
4,2022-01-03 13:30:00-05:00,176.449997,177.259995,176.220001,177.089996,177.089996,116232.0,MMM,,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902


In [61]:
df2 = pd.read_parquet("./stock_1m.parquet")
df2.head()

Unnamed: 0,Date,Open,High,Low,Close,Adj Close,Volume,Symbol,Datetime,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,,88.919998,89.059998,88.919998,88.980003,88.980003,133052.0,MMM,2023-10-16 09:30:00-04:00,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,,89.029999,89.25,89.029999,89.212601,89.212601,4748.0,MMM,2023-10-16 09:31:00-04:00,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
2,,89.260002,89.260002,89.116798,89.165001,89.165001,3267.0,MMM,2023-10-16 09:32:00-04:00,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
3,,89.199997,89.254997,89.135002,89.190002,89.190002,2168.0,MMM,2023-10-16 09:33:00-04:00,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
4,,89.125,89.18,89.044998,89.18,89.18,4947.0,MMM,2023-10-16 09:34:00-04:00,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902


## Data Cleaning ##

In [62]:
print(df.isnull().sum())

Date                         0
Open                        14
High                        14
Low                         14
Close                       14
Adj Close                   14
Volume                      14
Symbol                       0
Security                     0
GICS Sector                  0
GICS Sub-Industry            0
Headquarters Location        0
Date added               27330
CIK                          0
Founded                      0
dtype: int64


In [63]:
print(df1.isnull().sum())

index                          0
Open                          36
High                          36
Low                           36
Close                         36
Adj Close                     36
Volume                        36
Symbol                         0
Date                     1606007
Security                       0
GICS Sector                    0
GICS Sub-Industry              0
Headquarters Location          0
Date added                 32560
CIK                            0
Founded                        0
dtype: int64


In [64]:
print(df2.isnull().sum())

Date                     2733118
Open                           8
High                           8
Low                            8
Close                          8
Adj Close                      8
Volume                         8
Symbol                         0
Datetime                       0
Security                       0
GICS Sector                    0
GICS Sub-Industry              0
Headquarters Location          0
Date added                 55997
CIK                            0
Founded                        0
dtype: int64


In [65]:
# Handle missing values (dropping)
df = df.dropna()
df1 = df1.dropna()
df2 = df2.dropna()

## Standardization ##

In [66]:
numerical_columns_df = df.select_dtypes(include=['float64', 'int64']).columns
numerical_columns_df1 = df1.select_dtypes(include=['float64', 'int64']).columns
numerical_columns_df2 = df2.select_dtypes(include=['float64', 'int64']).columns

if df.empty:
    print("The DataFrame is empty.")
else:
    if df[numerical_columns_df].dropna().empty:
        print("The selected columns for scaling are empty or contain only NaN values.")
    else:
        scaler = StandardScaler()
        df[numerical_columns_df] = scaler.fit_transform(df[numerical_columns_df])
        print("Standardization applied successfully.")
        
if df1.empty:
    print("The DataFrame 1 is empty.")
else:
    if df1[numerical_columns_df1].dropna().empty:
        print("The selected columns for scaling are empty or contain only NaN values.")
    else:
        scaler = StandardScaler()
        df1[numerical_columns_df1] = scaler.fit_transform(df1[numerical_columns_df1])
        print("Standardization applied successfully.")
        
if df1.empty:
    print("The DataFrame 2 is empty.")
else:
    if df2[numerical_columns_df2].dropna().empty:
        print("The selected columns for scaling are empty or contain only NaN values.")
    else:
        scaler = StandardScaler()
        df2[numerical_columns_df2] = scaler.fit_transform(df2[numerical_columns_df2])
        print("Standardization applied successfully.")

Standardization applied successfully.
The DataFrame 1 is empty.
The DataFrame 2 is empty.
