In [None]:
# !pip install yfinance
# !pip install xgboost

In [None]:
from pandas_datareader import data as pdr
import yfinance as yf

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.preprocessing import LabelEncoder

from xgboost import XGBClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC

In [None]:
# Loading the stock price data into a pandas DataFrame called df
yf.pdr_override()

In [None]:
# downloading Johnson & Johnson (JNJ) data
df = yf.download("JNJ", start="2015-01-01", end="2023-08-31")
df

[*********************100%%**********************]  1 of 1 completed


Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02,105.050003,105.550003,104.129997,104.519997,82.477386,5753600
2015-01-05,104.480003,104.730003,103.680000,103.790001,81.901344,8079300
2015-01-06,104.339996,104.989998,102.940002,103.279999,81.498886,7428000
2015-01-07,103.910004,105.830002,103.809998,105.559998,83.298050,7931700
2015-01-08,106.059998,106.489998,105.750000,106.389999,83.953011,9916000
...,...,...,...,...,...,...
2023-08-24,164.029999,166.669998,163.860001,165.089996,163.899994,70755100
2023-08-25,164.300003,167.779999,164.059998,166.250000,166.250000,18185500
2023-08-28,165.000000,166.210007,163.169998,164.289993,164.289993,18458000
2023-08-29,164.869995,165.679993,162.770004,164.309998,164.309998,12208600


In [None]:
# viewing the top of df
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-01-02,105.050003,105.550003,104.129997,104.519997,82.477386,5753600
2015-01-05,104.480003,104.730003,103.68,103.790001,81.901344,8079300
2015-01-06,104.339996,104.989998,102.940002,103.279999,81.498886,7428000
2015-01-07,103.910004,105.830002,103.809998,105.559998,83.29805,7931700
2015-01-08,106.059998,106.489998,105.75,106.389999,83.953011,9916000


In [None]:
# view the bottom of the data
df.tail()

Unnamed: 0_level_0,Open,High,Low,Close,Adj Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2023-08-24,164.029999,166.669998,163.860001,165.089996,163.899994,70755100
2023-08-25,164.300003,167.779999,164.059998,166.25,166.25,18185500
2023-08-28,165.0,166.210007,163.169998,164.289993,164.289993,18458000
2023-08-29,164.869995,165.679993,162.770004,164.309998,164.309998,12208600
2023-08-30,165.110001,165.830002,163.679993,163.729996,163.729996,9939100


In [None]:
# shape of df
df.shape

(2180, 6)

In [None]:
# description of df
df.describe()

Unnamed: 0,Open,High,Low,Close,Adj Close,Volume
count,2180.0,2180.0,2180.0,2180.0,2180.0,2180.0
mean,138.819693,139.812344,137.792858,138.835656,124.857064,7809908.0
std,23.81433,24.019585,23.624159,23.818831,28.961496,6381050.0
min,90.699997,91.879997,81.790001,90.730003,73.178062,2114900.0
25%,122.514997,123.217501,121.637501,122.624998,103.216759,5558175.0
50%,138.389999,139.550003,137.395004,138.474998,121.32021,6719850.0
75%,161.23,162.374996,159.972504,161.227501,153.917004,8422850.0
max,185.100006,186.690002,184.179993,186.009995,178.456894,151319500.0


In [None]:
# Pre-processing and cleaning the data

# Dropping rows with NaN values
df.dropna(inplace=True)

In [None]:
# summay of columns with respect to null values
df.isnull().sum()

Open         0
High         0
Low          0
Close        0
Adj Close    0
Volume       0
dtype: int64

In [None]:

# Strategy 1: If next day's close price > today's close price, signal is 'buy' (1), else 'sell' (-1)
df['Signal_1'] = np.where(df['Close'].shift(-1) > df['Close'], 1, -1)

# Strategy 2: Use 50-day MA and 200-day MA to generate signals
short_window = 50
long_window = 200
df['Short_MA'] = df['Close'].rolling(window=short_window, min_periods=1).mean()
df['Long_MA'] = df['Close'].rolling(window=long_window, min_periods=1).mean()
df['Signal_2'] = np.where(df['Short_MA'] > df['Long_MA'], 1, -1)

In [None]:
# Defining Features and Labels

# Using all Columns as features for Strategy 1
features_1 = ['Open', 'High', 'Low', 'Close', 'Volume']
X_1 = df[features_1]

# Using moving averages as features for Strategy 2
features_2 = ['Short_MA', 'Long_MA']
X_2 = df[features_2]
y_1 = df['Signal_1']
y_2 = df['Signal_2']


In [None]:
# Converting the labels to 0 and 1 using LabelEncoder for both strategies
label_encoder_1 = LabelEncoder()
label_encoder_2 = LabelEncoder()
y_1 = label_encoder_1.fit_transform(y_1)
y_2 = label_encoder_2.fit_transform(y_2)


In [None]:
# Splitting data into training and test datasets (80/20 ratio)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(X_1, y_1, test_size=0.2, random_state=42)
X_train_2, X_test_2, y_train_2, y_test_2 = train_test_split(X_2, y_2, test_size=0.2, random_state=42)


In [None]:
# Classifiers
classifiers = {
    'KNN': KNeighborsClassifier(),
    'RF': RandomForestClassifier(random_state=42),
    'GB': GradientBoostingClassifier(random_state=42),
    'SVM': SVC(random_state=42),
    'XGB': XGBClassifier(random_state=42)
}

# Dictionary to store evaluation metrics for each strategy
metrics = {
    'Accuracy': [],
    'Precision': [],
    'Recall': [],
    'F1-Score': []
}


In [None]:

# Evaluate classifiers for Strategy 1
for name, clf in classifiers.items():
    clf.fit(X_train_1, y_train_1)
    predictions = clf.predict(X_test_1)
    accuracy = accuracy_score(y_test_1, predictions)
    precision = precision_score(y_test_1, predictions)
    recall = recall_score(y_test_1, predictions)
    f1 = f1_score(y_test_1, predictions)

    metrics['Accuracy'].append(accuracy)
    metrics['Precision'].append(precision)
    metrics['Recall'].append(recall)
    metrics['F1-Score'].append(f1)

# Evaluate classifiers for Strategy 2
for name, clf in classifiers.items():
    clf.fit(X_train_2, y_train_2)
    predictions = clf.predict(X_test_2)
    accuracy = accuracy_score(y_test_2, predictions)
    precision = precision_score(y_test_2, predictions)
    recall = recall_score(y_test_2, predictions)
    f1 = f1_score(y_test_2, predictions)

    metrics['Accuracy'].append(accuracy)
    metrics['Precision'].append(precision)
    metrics['Recall'].append(recall)
    metrics['F1-Score'].append(f1)

# Print the evaluation metrics for both strategies and classifiers
for metric, values in metrics.items():
    print(f'{metric} for Strategy 1:', values[:len(classifiers)])
    print(f'{metric} for Strategy 2:', values[len(classifiers):])
    print('')


Accuracy for Strategy 1: [0.5137614678899083, 0.5114678899082569, 0.5, 0.4701834862385321, 0.4908256880733945]
Accuracy for Strategy 2: [0.9862385321100917, 0.9770642201834863, 0.9747706422018348, 0.9151376146788991, 0.9770642201834863]

Precision for Strategy 1: [0.4875, 0.48325358851674644, 0.4774436090225564, 0.47113163972286376, 0.46226415094339623]
Precision for Strategy 2: [0.982078853046595, 0.9783393501805054, 0.9748201438848921, 0.9576923076923077, 0.9818181818181818]

Recall for Strategy 1: [0.5679611650485437, 0.49029126213592233, 0.616504854368932, 0.9902912621359223, 0.47572815533980584]
Recall for Strategy 2: [0.9963636363636363, 0.9854545454545455, 0.9854545454545455, 0.9054545454545454, 0.9818181818181818]

F1-Score for Strategy 1: [0.5246636771300448, 0.48674698795180726, 0.5381355932203389, 0.6384976525821596, 0.46889952153110054]
F1-Score for Strategy 2: [0.9891696750902528, 0.9818840579710145, 0.9801084990958409, 0.930841121495327, 0.9818181818181818]




Fine Tuning the Random Forest and Gradient Boosting classifiers

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

# Defining the parameter grid for Random Forest
rf_param_grid = {
    'n_estimators': [5, 10, 50],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Defining the parameter grid for Gradient Boosting
gb_param_grid = {
    'n_estimators': [5, 10, 50],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 4, 5],
    'min_samples_split': [2, 5, 10]
}

In [None]:
# For Strategy 1

# Performing grid search for Random Forest Classifier
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train_1, y_train_1)

# Performing grid search for Gradient Boosting Classifier
gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_param_grid, cv=5)
gb_grid_search.fit(X_train_1, y_train_1)


# Print best parameters and their corresponding accuracy scores
print("Random Forest Best Parameters:", rf_grid_search.best_params_)
print("Random Forest Accuracy:", rf_grid_search.best_score_)

print("Gradient Boosting Best Parameters:", gb_grid_search.best_params_)
print("Gradient Boosting Accuracy:", gb_grid_search.best_score_)

Random Forest Best Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 50}
Random Forest Accuracy: 0.5384332905180648
Gradient Boosting Best Parameters: {'learning_rate': 0.2, 'max_depth': 5, 'min_samples_split': 2, 'n_estimators': 5}
Gradient Boosting Accuracy: 0.5378569311332873


In [None]:
# For Strategy 2

# Performing grid search for Random Forest Classifier
rf_grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_param_grid, cv=5)
rf_grid_search.fit(X_train_2, y_train_2)

# Performing grid search for Gradient Boosting Classifier
gb_grid_search = GridSearchCV(GradientBoostingClassifier(random_state=42), gb_param_grid, cv=5)
gb_grid_search.fit(X_train_2, y_train_2)

# Print best parameters and their corresponding accuracy scores
print("Random Forest Best Parameters:", rf_grid_search.best_params_)
print("Random Forest Accuracy:", rf_grid_search.best_score_)

print("Gradient Boosting Best Parameters:", gb_grid_search.best_params_)
print("Gradient Boosting Accuracy:", gb_grid_search.best_score_)


NameError: ignored