# DSO 530 Project: Final Prediction
Group 49: Jessica Bratahani, Pin Hsuan Chang, Suhan Ho, Sheena Huang, Yunchi Lee

In [1]:
import itertools
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

## Training and Test Data

In [2]:
# Load the labelled training data and discard every row that has a missing
# value in any column (dropna removes NaN rows; it does not de-duplicate).
df_train = pd.read_csv('option_train.csv', index_col=0).dropna()

# Model inputs are the columns S, K, tau and r; the regression target is Value.
X_train = df_train.loc[:, ['S', 'K', 'tau', 'r']]
y_train = df_train.loc[:, 'Value']

In [3]:
# Load the unlabelled test set; the first CSV column becomes the row index.
df_test=pd.read_csv('option_test_nolabel.csv',index_col=0)

In [4]:
# Preview the first rows of the test features.
df_test.head()

Unnamed: 0,S,K,tau,r
1,1409.28,1325,0.126027,0.0115
2,1505.97,1100,0.315068,0.011
3,1409.57,1450,0.19726,0.0116
4,1407.81,1250,0.10137,0.0116
5,1494.5,1300,0.194521,0.011


## Regression - Best Model: Random Forest Regressor

In [5]:
# Standardise the four input features. The scaler is fitted on the training
# rows only and the same fitted scaler is then applied to the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(df_test[['S', 'K', 'tau', 'r']])

In [6]:
def kfolds_cv_regression(regressor, kfolds, x_train, y_train):
    """Cross-validate a regressor and return its mean R squared and mean MSE.

    Both metrics are collected in a single cross-validation pass, so the
    model is fitted once per fold rather than once per fold per metric.

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn style regressor.
    kfolds : int
        Number of folds. Rows are shuffled with a fixed seed (random_state=1),
        so repeated calls use the same splits.
    x_train : array-like
        Feature matrix.
    y_train : array-like
        Regression target.

    Returns
    -------
    tuple
        (mean R squared across folds, mean MSE across folds). MSE is returned
        as a positive number.
    """
    kfolds_regression = KFold(n_splits=kfolds, random_state=1, shuffle=True)
    cv_results = cross_validate(
        regressor,
        x_train,
        y_train,
        cv=kfolds_regression,
        # R squared is requested explicitly instead of relying on the
        # estimator's default score method.
        scoring=('r2', 'neg_mean_squared_error'),
    )
    mean_r2 = np.mean(cv_results['test_r2'])
    # scikit-learn reports MSE negated (higher is better); flip the sign back.
    mean_mse = -np.mean(cv_results['test_neg_mean_squared_error'])
    return mean_r2, mean_mse

In [7]:
# Estimate out-of-sample performance of the random forest with 5-fold CV.
KFolds = 5
regressor = RandomForestRegressor(random_state=0, oob_score=True)
mean_r2, mean_mse = kfolds_cv_regression(regressor, KFolds, X_train_scaled, y_train)
print(
    'Model:', type(regressor).__name__,
    'KFolds:', KFolds,
    ' Mean R Squared:', mean_r2,
    ' Mean MSE:', mean_mse,
)

Model: RandomForestRegressor KFolds: 5  Mean R Squared: 0.9965151303907671  Mean MSE: 54.397492430605475


In [8]:
# Refit the same random-forest configuration on all (scaled) training rows.
# fit() returns the estimator itself, so construction and fitting are chained.
final_regressor = RandomForestRegressor(random_state=0, oob_score=True).fit(
    X_train_scaled, y_train
)

# Predict the target for the scaled test features.
y_pred = final_regressor.predict(X_test_scaled)

In [9]:
# Collect the regression predictions in a one-column frame whose rows are
# labelled 1..n instead of pandas' default 0..n-1.
df_prediction = pd.DataFrame(
    {'Value': y_pred},
    index=range(1, len(y_pred) + 1),
)

## Classification - Best Model: Random Forest Classifier

In [10]:
# Inspect the training frame; the BS column holds the 'Under'/'Over' labels.
df_train.head()

Unnamed: 0,Value,S,K,tau,r,BS
1,348.5,1394.46,1050,0.128767,0.0116,Under
2,149.375,1432.25,1400,0.679452,0.0113,Under
3,294.5,1478.9,1225,0.443836,0.0112,Under
4,3.375,1369.89,1500,0.117808,0.0119,Over
5,84.0,1366.42,1350,0.29863,0.0119,Under


In [11]:
# Encode the BS label as an integer class target: 'Under' -> 0, 'Over' -> 1.
# assign() returns a new frame with the extra column appended.
df_train = df_train.assign(BS_class=df_train['BS'].map({'Under': 0, 'Over': 1}))
y_train_BS = df_train['BS_class']

In [12]:
# 10-fold stratified cross-validation of the random-forest classifier.
# The fixed random_state makes the shuffled folds reproducible.
kfolds = StratifiedKFold(n_splits=10, shuffle=True, random_state=1)
clf_rf = RandomForestClassifier(random_state=1, n_estimators=200)

# cross_val_score fits a fresh clone of the classifier on each fold's training
# part and scores it on the held-out part, replacing the manual fold loop.
accuracies_rd = cross_val_score(
    clf_rf, X_train, y_train_BS, cv=kfolds, scoring='accuracy'
).tolist()

# Mean and standard deviation of the per-fold accuracies.
mean_accuracy_rf = np.mean(accuracies_rd)
std_accuracy_rf = np.std(accuracies_rd)

print(f"Random Forest mean accuracy: {mean_accuracy_rf:.4f}")
print(f"Random Forest standard deviation of accuracy: {std_accuracy_rf:.4f}")

Random Forest mean accuracy: 0.9366
Random Forest standard deviation of accuracy: 0.0096


In [13]:
# Fit the final classifier on all training rows (unscaled features).
# fit() returns the estimator itself, so construction and fitting are chained.
final_classifier = RandomForestClassifier(random_state=1, n_estimators=200).fit(
    X_train, y_train_BS
)

# Predict the class of every test row from the same four feature columns.
X_test = df_test[['S', 'K', 'tau', 'r']]
y_pred_BS = final_classifier.predict(X_test)

In [14]:
# Store the predicted class codes next to the predicted values; the codes use
# the same encoding as BS_class (0 = 'Under', 1 = 'Over').
df_prediction['BS'] = y_pred_BS.tolist()

In [15]:
# Spot-check a few randomly chosen rows of the combined predictions.
df_prediction.sample(5)

Unnamed: 0,Value,BS
494,50.96,1
101,97.50375,0
18,45.57625,0
246,25.13125,0
60,28.88,0


In [16]:
# Share of each predicted class among the test predictions.
df_prediction['BS'].value_counts(normalize=True)

BS
0    0.788
1    0.212
Name: proportion, dtype: float64

In [17]:
# Share of each class among the training labels, as a reference point for the
# predicted class split.
df_train['BS_class'].value_counts(normalize=True)

BS_class
0    0.7736
1    0.2264
Name: proportion, dtype: float64

In [18]:
# Save the predictions to .csv for submission; index=False leaves the row
# index out, so the file contains only the Value and BS columns.
df_prediction.to_csv('group_49_prediction.csv',index = False)