# AI Algorithms
> By Sayed Afnan Khazi(01JST21CB036), Venkat Bhaskar(01JST21CB049), G Rutvik(01JST21CB012), Sai Sujith(01JST21CB033)
- This notebook showcases the implementation of Artificial Neural Networks, K-Nearest Neighbors, Decision Trees, Random Forest, Support Vector Machines, and Reinforcement Learning (Q-learning) on a financial dataset containing Alibaba's stock prices for the past many years.

### Importing Libraries

In [1]:
# Install all the requirements
%pip install -r requirements.txt


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip available: [0m[31;49m22.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [2]:
# Importing all the requirements. 
import numpy as np
import pandas as pd
import random
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

2024-05-26 22:11:16.494295: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
final_results_table = pd.DataFrame(columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'])

### Algorithm 1: Artificial Neural Networks

In [4]:
# Load data from a CSV file
df = pd.read_csv('baba_stock_data.csv')

# Ensure 'Date' column is in datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Sorting data by date
df = df.sort_values(by='Date')

# Calculate the daily movement (1 for up, 0 for down)
df['Movement'] = (df['Close'].diff() > 0).astype(int)

# Remove the first row with NaN value due to diff()
df = df.dropna()

# Feature selection
features = ['Open', 'High', 'Low', 'Close', 'Adj Close']
X = df[features]
y = df['Movement']

# training-testing ratios
ratios = [0.4,0.3,0.2,0.1] 

results = {}

for ratio in ratios:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=0)

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize the ANN
    model = Sequential()
    model.add(Dense(units=64, activation='relu', input_dim=X_train.shape[1]))
    model.add(Dense(units=32, activation='relu'))
    model.add(Dense(units=1, activation='sigmoid'))

    # Compile the ANN
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the ANN
    model.fit(X_train, y_train, epochs=50, batch_size=10, verbose=1)

    # Predictions
    y_pred = model.predict(X_test)
    y_pred = (y_pred > 0.5).astype(int)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for split {ratio}:\n", cm)

    # Accuracy, Precision, Recall, F1-score
    accuracy_q = accuracy_score(y_test, y_pred)
    precision_q = precision_score(y_test, y_pred)
    recall_q = recall_score(y_test, y_pred)
    f1_q = f1_score(y_test, y_pred)

    results[ratio] = {'precision': precision_q, 'accuracy': accuracy_q, 'recall': recall_q, 'f1': f1_q}

# Detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get the most recent data point and format it as a DataFrame
last_data_point = df[features].iloc[-1].to_frame().T

# Standardize the last data point
last_data_point_scaled = scaler.transform(last_data_point)

# Predict the movement for the next day
predicted_movement = model.predict(last_data_point_scaled)
predicted_movement = (predicted_movement > 0.5).astype(int)

# Interpret the result
movement_label = "Up" if predicted_movement[0][0] == 1 else "Down"
print(f'The predicted movement for the next trading day is: {movement_label}')

# Print results
print("Results:")
print("Ratio Precision Accuracy Recall F1")
for ratio, metrics in results.items():
    print(f"{ratio:}\t{metrics['precision']:.4f}\t{metrics['accuracy']:.4f}\t{metrics['recall']:.4f}\t{metrics['f1']:.4f}")
print("Average F1",round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Accuracy",round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Precision",round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Recall",round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4))

final_results_table.loc[len(final_results_table)] = ['ANN', 
                                                     round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4)
                                                    ]

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6281 - loss: 0.6826
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5915 - loss: 0.6886
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5789 - loss: 0.6840
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5810 - loss: 0.6788
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6472 - loss: 0.6647
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6446 - loss: 0.6531
Epoch 7/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6355 - loss: 0.6621
Epoch 8/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.6295 - loss: 0.6529
Epoch 9/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - accuracy: 0.4868 - loss: 0.6951
Epoch 2/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5766 - loss: 0.6877
Epoch 3/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5450 - loss: 0.6882
Epoch 4/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5505 - loss: 0.6839
Epoch 5/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5948 - loss: 0.6762
Epoch 6/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5843 - loss: 0.6759
Epoch 7/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5886 - loss: 0.6683
Epoch 8/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.5771 - loss: 0.6687
Epoch 9/50
[1m42/42[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - accuracy: 0.5673 - loss: 0.6886
Epoch 2/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6207 - loss: 0.6807
Epoch 3/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6180 - loss: 0.6750
Epoch 4/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6449 - loss: 0.6490
Epoch 5/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6505 - loss: 0.6445
Epoch 6/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6493 - loss: 0.6353
Epoch 7/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6607 - loss: 0.6248
Epoch 8/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6729 - loss: 0.6140
Epoch 9/50
[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.4879 - loss: 0.6989
Epoch 2/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5838 - loss: 0.6794
Epoch 3/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5682 - loss: 0.6885
Epoch 4/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5598 - loss: 0.6854
Epoch 5/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.5996 - loss: 0.6750
Epoch 6/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6117 - loss: 0.6649
Epoch 7/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6463 - loss: 0.6540
Epoch 8/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.6065 - loss: 0.6605
Epoch 9/50
[1m54/54[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

### Algorithm 2: K-Nearest Neighbors

In [5]:

# Load data from a CSV file
df = pd.read_csv('baba_stock_data.csv')

# Ensure 'Date' column is in datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Sorting data by date
df = df.sort_values(by='Date')

# Calculate the daily movement (1 for up, 0 for down)
df['Movement'] = (df['Close'].diff() > 0).astype(int)

# Remove the first row with NaN value due to diff()
df = df.dropna()

# Feature selection
features = ['Open', 'High', 'Low', 'Close', 'Adj Close']
X = df[features]
y = df['Movement']

# training-testing ratios
ratios = [0.4,0.3,0.2,0.1] 

results = {}

for ratio in ratios:

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=0)

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and train the KNN classifier
    knn = KNeighborsClassifier(n_neighbors=5)
    knn.fit(X_train, y_train)

    # Make predictions
    y_pred = knn.predict(X_test)

    # Evaluate the model
    print(f"Confusion Matrix for ratio {ratio}:")
    print(confusion_matrix(y_test, y_pred))

    # Calculate metrics
    precision_q = precision_score(y_test, y_pred)
    accuracy_q = accuracy_score(y_test, y_pred)
    recall_q = recall_score(y_test, y_pred)
    f1_q = f1_score(y_test, y_pred)
    
    results[ratio] = {'precision': precision_q, 'accuracy': accuracy_q, 'recall': recall_q, 'f1': f1_q}

# Print results
print("Results:")
print("Ratio Precision Accuracy Recall F1")
for ratio, metrics in results.items():
    print(f"{ratio:}\t{metrics['precision']:.4f}\t{metrics['accuracy']:.4f}\t{metrics['recall']:.4f}\t{metrics['f1']:.4f}")
print("Average F1",round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Accuracy",round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Precision",round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Recall",round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4))

final_results_table.loc[len(final_results_table)] = ['KNN', 
                                                     round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4)
                                                    ]

Confusion Matrix for ratio 0.4:
[[84 33]
 [66 54]]
Confusion Matrix for ratio 0.3:
[[59 31]
 [45 43]]
Confusion Matrix for ratio 0.2:
[[44 20]
 [31 24]]
Confusion Matrix for ratio 0.1:
[[24  6]
 [15 15]]
Results:
Ratio Precision Accuracy Recall F1
0.4	0.6207	0.5823	0.4500	0.5217
0.3	0.5811	0.5730	0.4886	0.5309
0.2	0.5455	0.5714	0.4364	0.4848
0.1	0.7143	0.6500	0.5000	0.5882
Average F1 0.5314
Average Accuracy 0.5942
Average Precision 0.6154
Average Recall 0.4688


### Algorithm 3: Decision Trees

In [6]:
# Load data from a CSV file
df = pd.read_csv('baba_stock_data.csv')

# Ensure 'Date' column is in datetime format (if not already)
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Sorting data by date
df = df.sort_values(by='Date')

# Calculate the daily movement (1 for up, 0 for down)
df['Movement'] = (df['Close'].diff() > 0).astype(int)

# Remove the first row with NaN value due to diff()
df = df.dropna()

# Feature selection
features = ['Open', 'High', 'Low', 'Close', 'Adj Close']
X = df[features]
y = df['Movement']

# training-testing ratios
ratios = [0.4,0.3,0.2,0.1] 

results = {}

for ratio in ratios:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=0)


    # Model training
    classifier = DecisionTreeClassifier(criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)

    # Predictions
    y_pred = classifier.predict(X_test)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for ratio {ratio}:\n", cm)

    # Calculate metrics
    precision_q = precision_score(y_test, y_pred)
    accuracy_q = accuracy_score(y_test, y_pred)
    recall_q = recall_score(y_test, y_pred)
    f1_q = f1_score(y_test, y_pred)
    
    results[ratio] = {'precision': precision_q, 'accuracy': accuracy_q, 'recall': recall_q, 'f1': f1_q}


# Detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get the most recent data point and format it as a DataFrame
last_data_point = df[features].iloc[-1].to_frame().T

# Predict the movement for the next day
predicted_movement = classifier.predict(last_data_point)

# Interpret the result
movement_label = "Up" if predicted_movement[0] == 1 else "Down"
print(f'The predicted movement for the next trading day is: {movement_label}')

# Print results
print("Results:")
print("Ratio Precision Accuracy Recall F1")
for ratio, metrics in results.items():
    print(f"{ratio:}\t{metrics['precision']:.4f}\t{metrics['accuracy']:.4f}\t{metrics['recall']:.4f}\t{metrics['f1']:.4f}")
print("Average F1",round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Accuracy",round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Precision",round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Recall",round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4))

final_results_table.loc[len(final_results_table)] = ['Decision Tree', 
                                                     round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4)
                                                    ]

Confusion Matrix for ratio 0.4:
 [[79 38]
 [54 66]]
Confusion Matrix for ratio 0.3:
 [[61 29]
 [42 46]]
Confusion Matrix for ratio 0.2:
 [[43 21]
 [25 30]]
Confusion Matrix for ratio 0.1:
 [[23  7]
 [11 19]]

Classification Report:
               precision    recall  f1-score   support

           0       0.68      0.77      0.72        30
           1       0.73      0.63      0.68        30

    accuracy                           0.70        60
   macro avg       0.70      0.70      0.70        60
weighted avg       0.70      0.70      0.70        60

The predicted movement for the next trading day is: Up
Results:
Ratio Precision Accuracy Recall F1
0.4	0.6346	0.6118	0.5500	0.5893
0.3	0.6133	0.6011	0.5227	0.5644
0.2	0.5882	0.6134	0.5455	0.5660
0.1	0.7308	0.7000	0.6333	0.6786
Average F1 0.5996
Average Accuracy 0.6316
Average Precision 0.6417
Average Recall 0.5629


### Algorithm 4: Random Forest

In [7]:
# Load data from a CSV file
df = pd.read_csv('baba_stock_data.csv')

# Ensure 'Date' column is in datetime format (if not already)
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Sorting data by date
df = df.sort_values(by='Date')

# Calculate the daily movement (1 for up, 0 for down)
df['Movement'] = (df['Close'].diff() > 0).astype(int)

# Remove the first row with NaN value due to diff()
df = df.dropna()

# Feature selection
features = ['Open', 'High', 'Low', 'Close', 'Adj Close']
X = df[features]
y = df['Movement']

# training-testing ratios
ratios = [0.4,0.3,0.2,0.1] 

results = {}

for ratio in ratios:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=ratio, random_state=0)

    # Model training
    classifier = RandomForestClassifier(n_estimators=100, criterion='entropy', random_state=0)
    classifier.fit(X_train, y_train)

    # Predictions
    y_pred = classifier.predict(X_test)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for ratio {ratio}:\n", cm)

    # Calculate metrics
    precision_q = precision_score(y_test, y_pred)
    accuracy_q = accuracy_score(y_test, y_pred)
    recall_q = recall_score(y_test, y_pred)
    f1_q = f1_score(y_test, y_pred)
    
    results[ratio] = {'precision': precision_q, 'accuracy': accuracy_q, 'recall': recall_q, 'f1': f1_q}


# Detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get the most recent data point and format it as a DataFrame
last_data_point = df[features].iloc[-1].to_frame().T

# Predict the movement for the next day
predicted_movement = classifier.predict(last_data_point)

# Interpret the result
movement_label = "Up" if predicted_movement[0] == 1 else "Down"
print(f'The predicted movement for the next trading day is: {movement_label}')

# Print results
print("Results:")
print("Ratio Precision Accuracy Recall F1")
for ratio, metrics in results.items():
    print(f"{ratio:}\t{metrics['precision']:.4f}\t{metrics['accuracy']:.4f}\t{metrics['recall']:.4f}\t{metrics['f1']:.4f}")
print("Average F1",round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Accuracy",round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Precision",round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Recall",round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4))

final_results_table.loc[len(final_results_table)] = ['Random Forest', 
                                                     round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4)
                                                    ]

Confusion Matrix for ratio 0.4:
 [[85 32]
 [60 60]]
Confusion Matrix for ratio 0.3:
 [[62 28]
 [41 47]]
Confusion Matrix for ratio 0.2:
 [[40 24]
 [27 28]]
Confusion Matrix for ratio 0.1:
 [[25  5]
 [13 17]]

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.83      0.74        30
           1       0.77      0.57      0.65        30

    accuracy                           0.70        60
   macro avg       0.72      0.70      0.69        60
weighted avg       0.72      0.70      0.69        60

The predicted movement for the next trading day is: Up
Results:
Ratio Precision Accuracy Recall F1
0.4	0.6522	0.6118	0.5000	0.5660
0.3	0.6267	0.6124	0.5341	0.5767
0.2	0.5385	0.5714	0.5091	0.5234
0.1	0.7727	0.7000	0.5667	0.6538
Average F1 0.58
Average Accuracy 0.6239
Average Precision 0.6475
Average Recall 0.5275


### Algorithm 5: Support Vector Machines

In [8]:
# Load data from a CSV file
df = pd.read_csv('baba_stock_data.csv')

# Ensure ’Date’ column is in datetime format
df['Date'] = pd.to_datetime(df['Date'], format='%d-%m-%Y')

# Sorting data by date
df = df.sort_values(by='Date')

# Calculate the daily movement (1 for up, 0 for down)
df['Movement'] = (df['Close'].diff() > 0).astype(int)

# Remove the first row with NaN value due to diff()
df = df.dropna()

# Feature selection
features = ['Open', 'High', 'Low', 'Close', 'Adj Close']
X = df[features]
y = df['Movement']

# training-testing ratios
ratios = [0.4,0.3,0.2,0.1] 

results = {}

for ratio in ratios:
    X_train, X_test, y_train, y_test =train_test_split(X,y,test_size=ratio,random_state=0)


    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Initialize and train the SVM classifier
    svm_classifier = SVC(kernel='linear', random_state=0)
    svm_classifier.fit(X_train, y_train)

    # Predictions
    y_pred = svm_classifier.predict(X_test)

    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    print(f"Confusion Matrix for ratio {ratio}:\n", cm)

    # Calculate metrics
    precision_q = precision_score(y_test, y_pred)
    accuracy_q = accuracy_score(y_test, y_pred)
    recall_q = recall_score(y_test, y_pred)
    f1_q = f1_score(y_test, y_pred)
    
    results[ratio] = {'precision': precision_q, 'accuracy': accuracy_q, 'recall': recall_q, 'f1': f1_q}


# Detailed classification report
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Get the most recent data point and format it as a DataFrame
last_data_point = df[features].iloc[-1].to_frame().T

# Standardize the last data point
last_data_point_scaled = scaler.transform(last_data_point)

# Predict the movement for the next day
predicted_movement = svm_classifier.predict(last_data_point_scaled)

# Interpret the result
movement_label = "Up" if predicted_movement[0] == 1 else "Down"
print(f'The predicted movement for the next trading day is:{movement_label}')

# Print results
print("Results:")
print("Ratio Precision Accuracy Recall F1")
for ratio, metrics in results.items():
    print(f"{ratio:}\t{metrics['precision']:.4f}\t{metrics['accuracy']:.4f}\t{metrics['recall']:.4f}\t{metrics['f1']:.4f}")
print("Average F1",round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Accuracy",round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Precision",round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Recall",round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4))

final_results_table.loc[len(final_results_table)] = ['SVM',
                                                     round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4)
                                                    ]

Confusion Matrix for ratio 0.4:
 [[110   7]
 [ 85  35]]
Confusion Matrix for ratio 0.3:
 [[85  5]
 [62 26]]
Confusion Matrix for ratio 0.2:
 [[60  4]
 [33 22]]
Confusion Matrix for ratio 0.1:
 [[27  3]
 [14 16]]

Classification Report:
               precision    recall  f1-score   support

           0       0.66      0.90      0.76        30
           1       0.84      0.53      0.65        30

    accuracy                           0.72        60
   macro avg       0.75      0.72      0.71        60
weighted avg       0.75      0.72      0.71        60

The predicted movement for the next trading day is:Down
Results:
Ratio Precision Accuracy Recall F1
0.4	0.8333	0.6118	0.2917	0.4321
0.3	0.8387	0.6236	0.2955	0.4370
0.2	0.8462	0.6891	0.4000	0.5432
0.1	0.8421	0.7167	0.5333	0.6531
Average F1 0.5163
Average Accuracy 0.6603
Average Precision 0.8401
Average Recall 0.3801


### Algorithm 6: Reinforcement Learning (Q-learning)

In [9]:
# Seed for reproducibility
random.seed(42)

# Loading our stock dataset
file_path = 'baba_stock_data.csv'
data = pd.read_csv(file_path)

# Convert Date to datetime to sort by date
data['Date'] = pd.to_datetime(data['Date'],format='%d-%m-%Y')
data = data.sort_values('Date')

# Feature engineering - creating a new target variable `Profitable` (0/1) based on if the stock price increased or decreased on that day
data['Profitable'] = (data['Close'] > data['Open']).astype(int)

# Select relevant features (excluding 'Adj Close')
X = data[['Open', 'High', 'Low', 'Close', 'Volume']]
y = data['Profitable']
dates = data['Date']

# Reinforcement Learning (Q-learning)
def q_learning_train(X, y, episodes=1000, learning_rate=0.1, discount_factor=0.95, epsilon=0.1):
    n_actions = 2  # Buy or Sell (0 or 1)
    n_states = X.shape[0]
    
    # Initialize Q-table with zeros
    Q = np.zeros((n_states, n_actions))
    
    for _ in range(episodes):
        state = random.randint(0, n_states - 1)
        while True:
            if random.uniform(0, 1) < epsilon: # epsilon is our exploration rate
                action = random.randint(0, n_actions - 1)  # Explore
            else:
                action = np.argmax(Q[state, :])  # Exploit
            
            reward = y.iloc[state] if action == 1 else -y.iloc[state]
            
            next_state = (state + 1) % n_states
            Q[state, action] = Q[state, action] + learning_rate * (reward + discount_factor * np.max(Q[next_state, :]) - Q[state, action])
            
            state = next_state
            if state == 0:
                break
    
    return Q

def q_learning_predict(Q, X):
    '''Predicts the actions for each state in X (test data) using the Q-table.'''
    y_pred = []
    for state in range(X.shape[0]):
        action = np.argmax(Q[state, :])
        y_pred.append(action)
    return np.array(y_pred)

def print_confusion_matrix(cm, title):
    print(f"{title}")
    print(cm,end='\n\n')


# training-testing ratios
ratios = [0.4,0.3,0.2,0.1] 

results = {}

for ratio in ratios:

    # Split the dataset
    X_train, X_test, y_train, y_test, dates_train, dates_test = train_test_split(X, y, dates, test_size=ratio, random_state=42)

    # Standardize the features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    # Train Q-learning model
    Q = q_learning_train(pd.DataFrame(X_train), y_train)
    y_pred_q = q_learning_predict(Q, pd.DataFrame(X_test))

    # Print confusion matrix
    cm = confusion_matrix(y_test, y_pred_q)
    print_confusion_matrix(cm, f"Confusion Matrix for ratio {ratio}")

    # Calculate metrics
    precision_q = precision_score(y_test, y_pred_q)
    accuracy_q = accuracy_score(y_test, y_pred_q)
    recall_q = recall_score(y_test, y_pred_q)
    f1_q = f1_score(y_test, y_pred_q)
    
    results[ratio] = {'precision': precision_q, 'accuracy': accuracy_q, 'recall': recall_q, 'f1': f1_q}

# Print results
print("Results:")
print("Ratio Precision Accuracy Recall F1")
for ratio, metrics in results.items():
    print(f"{ratio:}\t{metrics['precision']:.4f}\t{metrics['accuracy']:.4f}\t{metrics['recall']:.4f}\t{metrics['f1']:.4f}")
print("Average F1",round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Accuracy",round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Precision",round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4))
print("Average Recall",round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4))

final_results_table.loc[len(final_results_table)] = ['RL/Q-Learning', 
                                                     round(sum([ results[ratio]['accuracy'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['precision'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['recall'] for ratio in results.keys() ])/len(results.keys()),4),
                                                     round(sum([ results[ratio]['f1'] for ratio in results.keys() ])/len(results.keys()),4)
                                                    ]

Confusion Matrix for ratio 0.4
[[42 59]
 [69 67]]

Confusion Matrix for ratio 0.3
[[29 47]
 [40 62]]

Confusion Matrix for ratio 0.2
[[17 32]
 [28 42]]

Confusion Matrix for ratio 0.1
[[11 10]
 [12 27]]

Results:
Ratio Precision Accuracy Recall F1
0.4	0.5317	0.4599	0.4926	0.5115
0.3	0.5688	0.5112	0.6078	0.5877
0.2	0.5676	0.4958	0.6000	0.5833
0.1	0.7297	0.6333	0.6923	0.7105
Average F1 0.5982
Average Accuracy 0.5251
Average Precision 0.5995
Average Recall 0.5982


### Our final comparison table tabulating our results

In [10]:
final_results_table

Unnamed: 0,Model,Accuracy,Precision,Recall,F1 Score
0,ANN,0.7743,0.7835,0.7456,0.7637
1,KNN,0.5942,0.6154,0.4688,0.5314
2,Decision Tree,0.6316,0.6417,0.5629,0.5996
3,Random Forest,0.6239,0.6475,0.5275,0.58
4,SVM,0.6603,0.8401,0.3801,0.5163
5,RL/Q-Learning,0.5251,0.5995,0.5982,0.5982
