In [1]:
# It imports the Pandas library as "pd" alias and reads a CSV file located at '/content/drive/My Drive/stock_prices.csv' into a DataFrame named "stock_prices_df".
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd

# Define the file path in my Google Drive
file_path = '/content/drive/My Drive/stock_prices.csv'

# Read the CSV file into a DataFrame
stock_prices_df = pd.read_csv(file_path)

In [3]:
# Access the "Close" column of the DataFrame
close_values = stock_prices_df['Close']

# Print the first few values of the "Close" column
print(close_values.head())


print(close_values.describe())

0    2742.0
1     571.0
2    3210.0
3    1550.0
4    3330.0
Name: Close, dtype: float64
count    2.324923e+06
mean     2.594023e+03
std      3.576538e+03
min      1.400000e+01
25%      1.022000e+03
50%      1.811000e+03
75%      3.030000e+03
max      1.095500e+05
Name: Close, dtype: float64


In [4]:
# Define new class intervals based on quartiles or percentiles of Close prices
q1 = stock_prices_df['Close'].quantile(0.25)
q2 = stock_prices_df['Close'].quantile(0.50)
q3 = stock_prices_df['Close'].quantile(0.75)
class_intervals = [(0, q1), (q1, q2), (q2, q3), (q3, stock_prices_df['Close'].max() + 1)]  # Add 1 to include the maximum value

# Define function to assign classes based on target value
def assign_class(value):
    for i, interval in enumerate(class_intervals, start=1):
        if interval[0] <= value < interval[1]:  # Adjust the condition to include values in the third quartile
            return f'Class {i}'
    return 'Outside Range'

# Apply function to assign classes to each data point
stock_prices_df['Class'] = stock_prices_df['Close'].apply(assign_class)

# Display the DataFrame with assigned classes
print(stock_prices_df[['RowId', 'Close', 'Class']])


                 RowId   Close    Class
0        20170104_1301  2742.0  Class 3
1        20170104_1332   571.0  Class 1
2        20170104_1333  3210.0  Class 4
3        20170104_1376  1550.0  Class 2
4        20170104_1377  3330.0  Class 4
...                ...     ...      ...
2332526  20211203_9990   528.0  Class 1
2332527  20211203_9991   794.0  Class 1
2332528  20211203_9993  1645.0  Class 2
2332529  20211203_9994  2389.0  Class 3
2332530  20211203_9997   696.0  Class 1

[2332531 rows x 3 columns]


In [5]:
# Size of the dataset
dataset_size = stock_prices_df.shape
print("Dataset Size:", dataset_size)


Dataset Size: (2332531, 13)


In [6]:
import pandas as pd

# To make my life easier so I don't have to type out a longer variable name
df = stock_prices_df


In [7]:
import pandas as pd

# Convert 'Date' column to datetime format
df['Date'] = pd.to_datetime(df['Date'])


In [8]:

# Check for duplicate records
duplicates = df.duplicated().sum()
print("Number of Duplicate Records:", duplicates)

Number of Duplicate Records: 0


In [9]:
# Check for format inconsistencies in RowId
row_id_format_inconsistencies = df[~df['RowId'].str.match(r'\d{8}_\d{4}$')]
print("Format Inconsistencies in RowId:")
print(row_id_format_inconsistencies)

# Check for format inconsistencies in SecuritiesCode
securities_code_format_inconsistencies = df[~df['SecuritiesCode'].astype(str).str.isdigit()]
print("\nFormat Inconsistencies in SecuritiesCode:")
print(securities_code_format_inconsistencies)

# Check for unexpected values in Open, High, Low, Close
price_columns = ['Open', 'High', 'Low', 'Close']
price_format_inconsistencies = df[~df[price_columns].applymap(lambda x: isinstance(x, (int, float)))].dropna()
print("\nFormat Inconsistencies in Price Columns:")
print(price_format_inconsistencies)

# Check for format inconsistencies in Volume
volume_format_inconsistencies = df[~df['Volume'].astype(str).str.isdigit()]
print("\nFormat Inconsistencies in Volume:")
print(volume_format_inconsistencies)

# Check for unexpected values in AdjustmentFactor
adjustment_factor_format_inconsistencies = df[~df['AdjustmentFactor'].apply(lambda x: isinstance(x, (int, float)))]
print("\nFormat Inconsistencies in AdjustmentFactor:")
print(adjustment_factor_format_inconsistencies)



# Check for unexpected values in Target
Target_inconsistencies = df[~df['Target'].isnull()]
print("\nFormat Inconsistencies in Target:")
print(Target_inconsistencies)



Format Inconsistencies in RowId:
Empty DataFrame
Columns: [RowId, Date, SecuritiesCode, Open, High, Low, Close, Volume, AdjustmentFactor, ExpectedDividend, SupervisionFlag, Target, Class]
Index: []

Format Inconsistencies in SecuritiesCode:
Empty DataFrame
Columns: [RowId, Date, SecuritiesCode, Open, High, Low, Close, Volume, AdjustmentFactor, ExpectedDividend, SupervisionFlag, Target, Class]
Index: []

Format Inconsistencies in Price Columns:
Empty DataFrame
Columns: [RowId, Date, SecuritiesCode, Open, High, Low, Close, Volume, AdjustmentFactor, ExpectedDividend, SupervisionFlag, Target, Class]
Index: []

Format Inconsistencies in Volume:
Empty DataFrame
Columns: [RowId, Date, SecuritiesCode, Open, High, Low, Close, Volume, AdjustmentFactor, ExpectedDividend, SupervisionFlag, Target, Class]
Index: []

Format Inconsistencies in AdjustmentFactor:
Empty DataFrame
Columns: [RowId, Date, SecuritiesCode, Open, High, Low, Close, Volume, AdjustmentFactor, ExpectedDividend, SupervisionFlag, Ta

In [10]:

# Missing Data
missing_data = df.isnull().sum()
print("Missing Data:\n", missing_data)

Missing Data:
 RowId                     0
Date                      0
SecuritiesCode            0
Open                   7608
High                   7608
Low                    7608
Close                  7608
Volume                    0
AdjustmentFactor          0
ExpectedDividend    2313666
SupervisionFlag           0
Target                  238
Class                     0
dtype: int64


In [11]:
df.drop(columns=['ExpectedDividend'], inplace=True)

In [12]:
# Calculate median for each column
median_open = df['Open'].median()
median_high = df['High'].median()
median_low = df['Low'].median()
median_close = df['Close'].median()
median_Target = df['Target'].median()


# Fill missing values with median
df['Open'].fillna(median_open, inplace=True)
df['High'].fillna(median_high, inplace=True)
df['Low'].fillna(median_low, inplace=True)
df['Close'].fillna(median_close, inplace=True)
df['Target'].fillna(median_Target, inplace=True)


In [13]:
# Recheck missingness
print("Missing Data After Imputation:\n", df.isnull().sum())

Missing Data After Imputation:
 RowId               0
Date                0
SecuritiesCode      0
Open                0
High                0
Low                 0
Close               0
Volume              0
AdjustmentFactor    0
SupervisionFlag     0
Target              0
Class               0
dtype: int64


In [14]:
# Define a function to detect outliers based on the IQR method
def detect_outliers(column):
    Q1 = column.quantile(0.25)
    Q3 = column.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return (column < lower_bound) | (column > upper_bound)

# Apply outlier detection
outliers_open = detect_outliers(df['Open']).sum()
outliers_high = detect_outliers(df['High']).sum()
outliers_low = detect_outliers(df['Low']).sum()
outliers_close = detect_outliers(df['Close']).sum()
outliers_volume = detect_outliers(df['Volume']).sum()
outliers_adjustment_factor = detect_outliers(df['AdjustmentFactor']).sum()


outliers_summary = {
    'Open Outliers': outliers_open,
    'High Outliers': outliers_high,
    'Low Outliers': outliers_low,
    'Close Outliers': outliers_close,
    'Volume Outliers': outliers_volume,
    'AdjustmentFactor Outliers': outliers_adjustment_factor

}

outliers_summary

{'Open Outliers': 145183,
 'High Outliers': 145803,
 'Low Outliers': 144944,
 'Close Outliers': 145371,
 'Volume Outliers': 308239,
 'AdjustmentFactor Outliers': 730}

In [15]:
import pandas as pd



# Summary statistics for the 'Volume' column
volume_stats = df['Volume'].describe()

# Summary statistics for the 'AdjustmentFactor' column
adjustment_factor_stats = df['AdjustmentFactor'].describe()

# Display summary statistics
print("Summary Statistics for Volume Column:")
print(volume_stats)
print("\nSummary Statistics for AdjustmentFactor Column:")
print(adjustment_factor_stats)

# Skewness and kurtosis for both columns
volume_skewness = df['Volume'].skew()
volume_kurtosis = df['Volume'].kurtosis()

adjustment_factor_skewness = df['AdjustmentFactor'].skew()
adjustment_factor_kurtosis = df['AdjustmentFactor'].kurtosis()

print("\nSkewness for Volume Column:", volume_skewness)
print("Kurtosis for Volume Column:", volume_kurtosis)

print("\nSkewness for AdjustmentFactor Column:", adjustment_factor_skewness)
print("Kurtosis for AdjustmentFactor Column:", adjustment_factor_kurtosis)


Summary Statistics for Volume Column:
count    2.332531e+06
mean     6.919366e+05
std      3.911256e+06
min      0.000000e+00
25%      3.030000e+04
50%      1.071000e+05
75%      4.021000e+05
max      6.436540e+08
Name: Volume, dtype: float64

Summary Statistics for AdjustmentFactor Column:
count    2.332531e+06
mean     1.000508e+00
std      6.773040e-02
min      1.000000e-01
25%      1.000000e+00
50%      1.000000e+00
75%      1.000000e+00
max      2.000000e+01
Name: AdjustmentFactor, dtype: float64

Skewness for Volume Column: 36.042606002811546
Kurtosis for Volume Column: 2368.1649879155293

Skewness for AdjustmentFactor Column: 122.99565467169053
Kurtosis for AdjustmentFactor Column: 17008.27380321588


In [16]:
# Removing outliers from volume column

#Step 1: Calculate IQR
Q1 = df['Volume'].quantile(0.25)
Q3 = df['Volume'].quantile(0.75)
IQR = Q3 - Q1

# Step 2: Define lower and upper bounds
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Step 3: Filter DataFrame to exclude outliers
df_filtered = df[(df['Volume'] >= lower_bound) & (df['Volume'] <= upper_bound)]




In [17]:
# Summary statistics before outlier removal
print("Summary Statistics for Volume Column (Before):")
print(df['Volume'].describe())

# Summary statistics after outlier removal
print("\nSummary Statistics for Volume Column (After):")
print(df_filtered['Volume'].describe())

Summary Statistics for Volume Column (Before):
count    2.332531e+06
mean     6.919366e+05
std      3.911256e+06
min      0.000000e+00
25%      3.030000e+04
50%      1.071000e+05
75%      4.021000e+05
max      6.436540e+08
Name: Volume, dtype: float64

Summary Statistics for Volume Column (After):
count    2.024292e+06
mean     1.686656e+05
std      2.117761e+05
min      0.000000e+00
25%      2.470000e+04
50%      7.830000e+04
75%      2.249000e+05
max      9.598000e+05
Name: Volume, dtype: float64


In [18]:
import numpy as np

# Log transformation for Volume column
df['Volume_Log'] = np.log(df['Volume'])

# Display the first few rows to show all columns including the log-transformed Volume
print(df.head())

           RowId       Date  SecuritiesCode    Open    High     Low   Close  \
0  20170104_1301 2017-01-04            1301  2734.0  2755.0  2730.0  2742.0   
1  20170104_1332 2017-01-04            1332   568.0   576.0   563.0   571.0   
2  20170104_1333 2017-01-04            1333  3150.0  3210.0  3140.0  3210.0   
3  20170104_1376 2017-01-04            1376  1510.0  1550.0  1510.0  1550.0   
4  20170104_1377 2017-01-04            1377  3270.0  3350.0  3270.0  3330.0   

    Volume  AdjustmentFactor  SupervisionFlag    Target    Class  Volume_Log  
0    31400               1.0            False  0.000730  Class 3   10.354563  
1  2798500               1.0            False  0.012324  Class 1   14.844594  
2   270800               1.0            False  0.006154  Class 4   12.509136  
3    11300               1.0            False  0.011053  Class 2    9.332558  
4   150800               1.0            False  0.003026  Class 4   11.923710  


  result = getattr(ufunc, method)(*inputs, **kwargs)


In [19]:
# Min-Max Scaling (Normalization)
def min_max_scaling(feature):
    min_val = feature.min()
    max_val = feature.max()
    return (feature - min_val) / (max_val - min_val)

# Z-score Normalization (Standardization)
def z_score_normalization(feature):
    mean = feature.mean()
    std_dev = feature.std()
    return (feature - mean) / std_dev

# Apply Min-Max Scaling and Z-score Normalization to the dataset
normalized_df = pd.DataFrame()
for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:
        normalized_df[col + '_minmax'] = min_max_scaling(df[col])
        normalized_df[col + '_zscore'] = z_score_normalization(df[col])

# Display the normalized dataset
print(normalized_df.head())

   SecuritiesCode_minmax  SecuritiesCode_zscore  Open_minmax  Open_zscore  \
0               0.000000              -1.910785     0.024742     0.039769   
1               0.003565              -1.897891     0.005039    -0.566676   
2               0.003680              -1.897475     0.028526     0.156243   
3               0.008625              -1.879589     0.013608    -0.302931   
4               0.008740              -1.879173     0.029617     0.189841   

   High_minmax  High_zscore  Low_minmax  Low_zscore  Close_minmax  \
0     0.024800     0.036263    0.025348    0.048551      0.024905   
1     0.005078    -0.566714    0.005131   -0.565678      0.005085   
2     0.028918     0.162171    0.029173    0.164764      0.029178   
3     0.013893    -0.297187    0.013966   -0.297254      0.014023   
4     0.030185     0.200913    0.030386    0.201612      0.030273   

   Close_zscore  Volume_minmax  Volume_zscore  AdjustmentFactor_minmax  \
0      0.042154       0.000049      -0.168881   

In [20]:
# One-hot encode categorical features
encoded_df = pd.get_dummies(df, columns=['SupervisionFlag'])

# Display the encoded DataFrame
print(encoded_df.head())


           RowId       Date  SecuritiesCode    Open    High     Low   Close  \
0  20170104_1301 2017-01-04            1301  2734.0  2755.0  2730.0  2742.0   
1  20170104_1332 2017-01-04            1332   568.0   576.0   563.0   571.0   
2  20170104_1333 2017-01-04            1333  3150.0  3210.0  3140.0  3210.0   
3  20170104_1376 2017-01-04            1376  1510.0  1550.0  1510.0  1550.0   
4  20170104_1377 2017-01-04            1377  3270.0  3350.0  3270.0  3330.0   

    Volume  AdjustmentFactor    Target    Class  Volume_Log  \
0    31400               1.0  0.000730  Class 3   10.354563   
1  2798500               1.0  0.012324  Class 1   14.844594   
2   270800               1.0  0.006154  Class 4   12.509136   
3    11300               1.0  0.011053  Class 2    9.332558   
4   150800               1.0  0.003026  Class 4   11.923710   

   SupervisionFlag_False  SupervisionFlag_True  
0                   True                 False  
1                   True                 False  

In [21]:
# Drop the specified columns
df = df.drop(columns=['RowId', 'SupervisionFlag'])

In [22]:
import pandas as pd

# Assuming df is your DataFrame
data_types = df.dtypes

# Print data types along with column names
print("Data Types:")
print(data_types)


Data Types:
Date                datetime64[ns]
SecuritiesCode               int64
Open                       float64
High                       float64
Low                        float64
Close                      float64
Volume                       int64
AdjustmentFactor           float64
Target                     float64
Class                       object
Volume_Log                 float64
dtype: object


In [23]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the 'Class' column and store the encoded values in a new column 'Class_encoded'
stock_prices_df['Class_encoded'] = label_encoder.fit_transform(stock_prices_df['Class'])

# Display the DataFrame with encoded classes
print(stock_prices_df[['RowId', 'Close', 'Class', 'Class_encoded']])

# Drop the original 'Class' column
stock_prices_df = stock_prices_df.drop(columns=['Class'])

# Verify column names after dropping 'Class'
print(stock_prices_df.columns)



                 RowId   Close    Class  Class_encoded
0        20170104_1301  2742.0  Class 3              2
1        20170104_1332   571.0  Class 1              0
2        20170104_1333  3210.0  Class 4              3
3        20170104_1376  1550.0  Class 2              1
4        20170104_1377  3330.0  Class 4              3
...                ...     ...      ...            ...
2332526  20211203_9990   528.0  Class 1              0
2332527  20211203_9991   794.0  Class 1              0
2332528  20211203_9993  1645.0  Class 2              1
2332529  20211203_9994  2389.0  Class 3              2
2332530  20211203_9997   696.0  Class 1              0

[2332531 rows x 4 columns]
Index(['RowId', 'Date', 'SecuritiesCode', 'Open', 'High', 'Low', 'Close',
       'Volume', 'AdjustmentFactor', 'SupervisionFlag', 'Target', 'Volume_Log',
       'Class_encoded'],
      dtype='object')


In [24]:
import math

# Calculate the total number of samples
N_total = df.shape[0]

# Specify the proportions for training, validation, and testing sets
train_ratio = 0.8
val_ratio = 0.1  # Splitting the remaining data equally for validation and testing
test_ratio = 0.1

# Calculate the number of samples for each set
N_train = math.floor(train_ratio * N_total)
N_val = math.floor(val_ratio * N_total)
N_test = N_total - N_train - N_val

# Split the dataset into training, validation, and testing sets
X_train = df.iloc[:N_train, :-1]  # Assuming the last column is the target column
y_train = df.iloc[:N_train, -1]   # Assuming the last column is the target column

X_val = df.iloc[N_train:N_train + N_val, :-1]
y_val = df.iloc[N_train:N_train + N_val, -1]

X_test = df.iloc[N_train + N_val:, :-1]
y_test = df.iloc[N_train + N_val:, -1]

# Print the number of samples in the validation set
print("Number of samples in the validation set:", N_val)

Number of samples in the validation set: 233253


In [25]:
N_train

1866024

In [26]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

model = Sequential()
model.add(Dense(12, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

In [27]:
X_train.shape[1]

10

In [28]:
# 1st Hidden layer
X_train.shape[1]*12 + 12

132

In [29]:
# 2nd Hidden layer
12 * 8 + 8

104

In [30]:
# 3rd Hidden layer
8 * 8 + 8

72

In [31]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 12)                132       
                                                                 
 dense_1 (Dense)             (None, 8)                 104       
                                                                 
 dense_2 (Dense)             (None, 8)                 72        
                                                                 
 dense_3 (Dense)             (None, 1)                 9         
                                                                 
Total params: 317 (1.24 KB)
Trainable params: 317 (1.24 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [32]:
from keras.models import Model
from keras.layers import Dense, Input

in_x = Input(shape = X_train.shape[1:])
x = Dense(12, input_dim=X_train.shape[1], activation='relu')(in_x)
x = Dense(8, activation='relu')(x)
x = Dense(8, activation='relu')(x)
out_x = Dense(1, activation='sigmoid')(x)

model = Model(in_x,out_x)

model.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 10)]              0         
                                                                 
 dense_4 (Dense)             (None, 12)                132       
                                                                 
 dense_5 (Dense)             (None, 8)                 104       
                                                                 
 dense_6 (Dense)             (None, 8)                 72        
                                                                 
 dense_7 (Dense)             (None, 1)                 9         
                                                                 
Total params: 317 (1.24 KB)
Trainable params: 317 (1.24 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [33]:
from keras.activations import relu
import keras

in_x = Input(shape = X_train.shape[1:])
x = keras.layers.Normalization(axis=-1)(in_x)
x = Dense(12, input_dim=X_train.shape[1], activation='relu')(in_x)
x = Dense(8)(x)
x = relu(x)
out_x = Dense(1, activation='sigmoid')(x)

model = Model(in_x,out_x)

model.summary()

Model: "model_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_2 (InputLayer)        [(None, 10)]              0         
                                                                 
 dense_8 (Dense)             (None, 12)                132       
                                                                 
 dense_9 (Dense)             (None, 8)                 104       
                                                                 
 tf.nn.relu (TFOpLambda)     (None, 8)                 0         
                                                                 
 dense_10 (Dense)            (None, 1)                 9         
                                                                 
Total params: 245 (980.00 Byte)
Trainable params: 245 (980.00 Byte)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [34]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Drop timestamp columns from X_train and X_Test
X_train_numeric = X_train.drop(columns=['timestamp_column1', 'timestamp_column2', ...], errors='ignore')
X_test_numeric = X_test.drop(columns=['timestamp_column1', 'timestamp_column2', ...], errors='ignore')

# Convert all columns to numeric (excluding the timestamp columns)
X_train_numeric = X_train_numeric.apply(pd.to_numeric, errors='ignore')
X_test_numeric = X_test_numeric.apply(pd.to_numeric, errors='ignore')

# Convert to NumPy array and cast to float
X_train_numeric_array = X_train_numeric.to_numpy().astype("float")
X_test_numeric_array = X_test_numeric.to_numpy().astype("float")

# Train the model
history = model.fit(X_train_numeric_array,
                    y_train.to_numpy().astype("float"),
                    validation_data=(X_test_numeric_array, y_test.to_numpy().astype("float")),
                    epochs=100,
                    batch_size=10)


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100

In [None]:
# Drop timestamp columns from X_train and X_Test
X_train_numeric = X_train.drop(columns=['timestamp_column1', 'timestamp_column2', ...], errors='ignore')
X_test_numeric = X_test.drop(columns=['timestamp_column1', 'timestamp_column2', ...], errors='ignore')

# Convert all columns to numeric (excluding the timestamp columns)
X_train_numeric = X_train_numeric.apply(pd.to_numeric, errors='ignore')
X_test_numeric = X_test_numeric.apply(pd.to_numeric, errors='ignore')

# Convert to NumPy array and cast to float
X_train_numeric_array = X_train_numeric.to_numpy().astype("float")
X_test_numeric_array = X_test_numeric.to_numpy().astype("float")

# Continue training the model from epoch 38 to epoch 100
history = model.fit(X_train_numeric_array,
                    y_train.to_numpy().astype("float"),
                    validation_data=(X_test_numeric_array, y_test.to_numpy().astype("float")),
                    epochs=100,  # Set to the remaining epochs you want to train
                    initial_epoch=36,  # Start from epoch 38
                    batch_size=10)





Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100

In [49]:
# Drop timestamp columns from X_train and X_Test
X_train_numeric = X_train.drop(columns=['timestamp_column1', 'timestamp_column2', ...], errors='ignore')
X_test_numeric = X_test.drop(columns=['timestamp_column1', 'timestamp_column2', ...], errors='ignore')

# Convert all columns to numeric (excluding the timestamp columns)
X_train_numeric = X_train_numeric.apply(pd.to_numeric, errors='ignore')
X_test_numeric = X_test_numeric.apply(pd.to_numeric, errors='ignore')

# Convert to NumPy array and cast to float
X_train_numeric_array = X_train_numeric.to_numpy().astype("float")
X_test_numeric_array = X_test_numeric.to_numpy().astype("float")

# Continue training the model from epoch 38 to epoch 100
history = model.fit(X_train_numeric_array,
                    y_train.to_numpy().astype("float"),
                    validation_data=(X_test_numeric_array, y_test.to_numpy().astype("float")),
                    epochs=100,  # Set to the remaining epochs you want to train
                    initial_epoch=83,
                    batch_size=10)


Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [25]:
from sklearn.preprocessing import LabelEncoder

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Encode the 'Class' column
df['Class_encoded'] = label_encoder.fit_transform(df['Class'])


In [26]:
# Drop the 'Class' column
df = df.drop(columns=['Class'])


In [None]:
# Comparison with my previous model

In [None]:
import numpy as np

# Define the Node class for the Decision Tree
class Node:
    def __init__(self, feature=None, threshold=None, left=None, right=None, value=None):
        self.feature = feature  # Feature index for splitting
        self.threshold = threshold  # Threshold for splitting
        self.left = left  # Left child node
        self.right = right  # Right child node
        self.value = value  # Value for leaf nodes

# Define the Decision Tree class
class DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def fit(self, X, y):
        self.root = self._grow_tree(X, y.astype(int))


    def predict(self, X):
        return np.array([self._predict_row(x, self.root) for x in X])

    def _grow_tree(self, X, y, depth=0):
        n_samples, n_features = X.shape
        n_classes = len(np.unique(y))

        # Stopping criteria
        if (self.max_depth is not None and depth >= self.max_depth) or n_classes == 1 or n_samples < self.min_samples_split:
            value = np.bincount(y).argmax()
            return Node(value=value)

        # Find the best split
        best_gini = float('inf')
        best_feature, best_threshold = None, None
        for feature in range(n_features):
            thresholds = np.unique(X[:, feature])
            for threshold in thresholds:
                left_indices = np.where(X[:, feature] <= threshold)[0]
                right_indices = np.where(X[:, feature] > threshold)[0]

                if len(left_indices) == 0 or len(right_indices) == 0:
                    continue

                gini = self._gini_impurity(y[left_indices], y[right_indices])
                if gini < best_gini:
                    best_gini = gini
                    best_feature = feature
                    best_threshold = threshold

        # Split the dataset
        left_indices = np.where(X[:, best_feature] <= best_threshold)[0]
        right_indices = np.where(X[:, best_feature] > best_threshold)[0]
        left_child = self._grow_tree(X[left_indices], y[left_indices], depth + 1)
        right_child = self._grow_tree(X[right_indices], y[right_indices], depth + 1)

        return Node(feature=best_feature, threshold=best_threshold, left=left_child, right=right_child)

    def _predict_row(self, x, node):
        if node.value is not None:
            return node.value
        if x[node.feature] <= node.threshold:
            return self._predict_row(x, node.left)
        else:
            return self._predict_row(x, node.right)

    def _gini_impurity(self, left_y, right_y):
        n = len(left_y) + len(right_y)
        p_left = len(left_y) / n
        p_right = len(right_y) / n
        return p_left * self._calc_gini(left_y) + p_right * self._calc_gini(right_y)

    def _calc_gini(self, y):
        if len(y) == 0:
            return 0
        p = np.bincount(np.round(y).astype(int)) / len(y)
        return 1 - np.sum(p ** 2)


# Define the size of the test set (e.g., 20% of the total data)
test_size = 0.2

# Split the data into training and test sets
train_data = df.sample(frac=1-test_size, random_state=42)  # Use 80% of the data for training
test_data = df.drop(train_data.index)  # Use the remaining 20% for testing

# Example usage:
# Instantiate and train the model
tree = DecisionTree(max_depth=3)
X_train = train_data.drop(columns=['Close']).values
y_train = train_data['Close'].values
tree.fit(X_train, y_train)

# Make predictions
X_test = test_data.drop(columns=['Close']).values
predictions = tree.predict(X_test)
print(predictions)