In [None]:
!pip install vecstack

from vecstack import stacking
import pandas as pd
import numpy as np


from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings("ignore")

# Mount Google Drive (google.colab is only importable inside a Colab runtime) so
# that the CSV files under '/gdrive/My Drive' used by the later cells are reachable.
from google.colab import drive
drive.mount('/gdrive')
#Change current working directory to gdrive
%cd /gdrive



Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [None]:
# Locations of the preprocessed train / test CSVs on the mounted Drive
trainfile = r'/gdrive/My Drive/Preprocess_Train_Assignment4.csv'
testfile = r'/gdrive/My Drive/Preprocess_Test_Assignment4.csv'

# Read both files into DataFrames (train first, then test)
trainData, testData = pd.read_csv(trainfile), pd.read_csv(testfile)

# Show (rows, columns) of each frame, one per line, as a quick sanity check
print(trainData.shape, testData.shape, sep="\n")



(137, 44)
(100000, 43)


In [None]:
# Split the training frame into predictors and target.
X_train = trainData.drop(columns=["revenue"])  # every column except the target
y_train = trainData["revenue"]                 # the value the models are fitted to predict

# Print the feature-matrix shape next to the (still unprocessed) test-frame shape
print(X_train.shape, testData.shape, sep="\n")


(137, 43)
(100000, 43)


In [None]:
# Decision Tree: fit on the training data, bring the test frame into the same
# feature layout as X_train, predict, and export the predictions.
clf = DecisionTreeRegressor()
clf.fit(X_train, y_train)

# Feature columns (and their order) the model was fitted on
feature_columns = X_train.columns

# Extract test data
# (Assuming 'revenue' is not present in the test set)
# Work on a copy: the column assignments below would otherwise be written
# straight into testData (plain assignment only aliases the same DataFrame),
# leaving the loaded frame silently modified for every other cell.
X_test = testData.copy()

# Expand the date column into numeric year / month / day features
X_test['Open Date'] = pd.to_datetime(X_test['Open Date'])
X_test['year'] = X_test['Open Date'].dt.year
X_test['month'] = X_test['Open Date'].dt.month
X_test['day'] = X_test['Open Date'].dt.day

# Drop the original date column now that its parts have been extracted
X_test = X_test.drop(['Open Date'], axis=1)

# One-hot encode all non-numeric columns
X_test = pd.get_dummies(X_test, columns=X_test.select_dtypes(include=['object']).columns)

# Training columns that the encoded test frame does not have (kept for inspection)
missing_columns = set(feature_columns) - set(X_test.columns)

# Align X_test with the training layout in a single step: columns missing from
# the test frame are created and filled with 0, columns unknown to the model
# are dropped, and the order is made identical to X_train.
X_test = X_test.reindex(columns=feature_columns, fill_value=0)

# Make predictions on the test set
clf_predict_Test = clf.predict(X_test)

# Save predictions
df_DT = pd.DataFrame()
df_DT['Prediction'] = clf_predict_Test

# Export the DataFrame to a CSV file
# (index=False is not passed, so the row index is written as the first column)
export_csv = df_DT.to_csv(r'/gdrive/My Drive/DT_Test17.csv')





In [None]:
# Random Forest: fit on the training data, report the training error, then
# predict on the aligned test features and export the predictions.
rfc = RandomForestRegressor()
rfc.fit(X_train, y_train)
rfc_predict_Train = rfc.predict(X_train)

# mean_squared_error returns the MSE; take the square root so the number
# matches its "RMSE" label and is expressed in the same units as the target.
train_rmse = np.sqrt(mean_squared_error(y_train, rfc_predict_Train))
print("RMSE (training) for Random Forest: {:.6f}".format(train_rmse))

# Make predictions on the test set
rfc_predict_Test = rfc.predict(X_test)

# Save predictions
df_RF = pd.DataFrame()
df_RF['Prediction'] = rfc_predict_Test

# Export the DataFrame to a CSV file (without the row index)
export_csv = df_RF.to_csv(r'/gdrive/My Drive/RF_Test18.csv', index=False)



RMSE (training) for Random Forest: 1020363853565.604126


In [None]:
# Write the Random Forest test predictions to Drive; the row index is kept as
# the first CSV column because index=False is not passed.
df_rfc = pd.DataFrame({'Predictions': rfc_predict_Test})
export_csv = df_rfc.to_csv(r'/gdrive/My Drive/RF_Test19.csv')

In [None]:
#Gradient Boosting Regressor================================================================================
# Fit on the training data, report the training error, then predict on the
# aligned test features and export the predictions.

abc = GradientBoostingRegressor()
abc.fit(X_train, y_train)
abc_predict_Train = abc.predict(X_train)

# mean_squared_error returns the MSE; take the square root so the number
# matches its "RMSE" label and is expressed in the same units as the target.
train_rmse = np.sqrt(mean_squared_error(y_train, abc_predict_Train))
print("RMSE (training) for Gradient Boosting: {:.6f}".format(train_rmse))

# Make predictions on the test set
abc_predict_Test = abc.predict(X_test)

# Save predictions
df_ABC = pd.DataFrame()
df_ABC['Predictions'] = abc_predict_Test

# Export the DataFrame to a CSV file (without the row index)
export_csv = df_ABC.to_csv(r'/gdrive/My Drive/ABC_Test17.csv', index=False)



RMSE (training) for Gradient Boosting: 211059290151.413116


In [None]:
# Write the Gradient Boosting test predictions to Drive; the row index is kept
# as the first CSV column because index=False is not passed.
df_abc = pd.DataFrame({'Predictions': abc_predict_Test})
export_csv = df_abc.to_csv(r'/gdrive/My Drive/GB_Test17.csv')

In [None]:
# STACKING MODELS =====================================================================
# Banner printed above the vecstack log
print("___________________________________________________________________________________________\nEnsemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier\n")

# Base learners whose predictions become the stacked features (order = column order)
models = [
    GradientBoostingRegressor(),
    RandomForestRegressor(),
    DecisionTreeRegressor(),
]

# Run vecstack over 4 shuffled folds with a fixed seed; S_Train / S_Test hold the
# base-model predictions for the training and test rows respectively and are
# consumed by the meta-model cell that follows.
S_Train, S_Test = stacking(
    models, X_train, y_train, X_test,
    regression=True,
    needs_proba=False,
    mode='oof_pred_bag',
    n_folds=4,
    shuffle=True,
    random_state=42,
    save_dir=None,
    verbose=2,
)


___________________________________________________________________________________________
Ensemble Methods Predictions using GradientBoosting, RandomForest and Decision Tree Classifier

task:         [regression]
metric:       [mean_absolute_error]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [GradientBoostingRegressor]
    fold  0:  [1727571.80164451]
    fold  1:  [1878983.07222684]
    fold  2:  [1670193.52098828]
    fold  3:  [1867496.39775227]
    ----
    MEAN:     [1786061.19815298] + [89599.79705703]
    FULL:     [1785634.26825146]

model  1:     [RandomForestRegressor]
    fold  0:  [1617860.18714286]
    fold  1:  [1643741.91705882]
    fold  2:  [1695464.07823529]
    fold  3:  [2019925.07235294]
    ----
    MEAN:     [1744247.81369748] + [161596.00179977]
    FULL:     [1743325.27627737]

model  2:     [DecisionTreeRegressor]
    fold  0:  [1982701.68571429]
    fold  1:  [2370514.73529412]
    fold  2:  [2094164.91176471]
    fold  3:  [2062006.735294

In [None]:
#STACKING - CONSTRUCT A GRADIENT BOOSTING MODEL==============================
# Meta-model: a Gradient Boosting regressor fitted on the stacked features
# (fit() returns the estimator itself, so construction and fitting are chained).
model = GradientBoostingRegressor().fit(S_Train, y_train)

# Predictions of the meta-model on the stacked train and test features
y_pred_train, y_pred_test = model.predict(S_Train), model.predict(S_Test)



In [None]:
# Training error of the stacked model (the Gradient Boosting meta-model fitted
# on S_Train). mean_squared_error returns the MSE, so take the square root to
# report a true RMSE in the units of the target.
stacking_train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
print("RMSE (training) for Stacking: {:.6f}".format(stacking_train_rmse))
# No test-set RMSE is reported: no y_test is defined in the cells above.

RMSE (training) for Decision Tree:399110582712.695801


In [None]:
#Save predictions
df_stacking=pd.DataFrame()
df_stacking['Prediction']=y_pred_test
# Export the stacking predictions themselves; previously df_rfc (the Random
# Forest predictions) was written to this file by mistake.
export_csv = df_stacking.to_csv(r'/gdrive/My Drive/Stacking_Test17.csv')

In [None]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import pandas as pd
from sklearn.model_selection import train_test_split

# Tune a scikit-learn StackingRegressor (GB + RF + DT base models, linear
# meta-model) with an exhaustive grid search, then predict on X_test and export.

# Define the base models; the tuple names are the prefixes used in param_grid
models = [
    ('gb', GradientBoostingRegressor()),
    ('rf', RandomForestRegressor()),
    ('dt', DecisionTreeRegressor())
]

# Define the meta-model
meta_model = LinearRegression()

# Create the stacked model
stacked_model = StackingRegressor(estimators=models, final_estimator=meta_model)

# Define parameters for grid search ('<estimator name>__<parameter>')
param_grid = {
    'gb__n_estimators': [50, 100, 200],
    'gb__learning_rate': [0.01, 0.1, 0.2],
    'gb__max_depth': [3, 5, 7],
    'rf__n_estimators': [50, 100, 200],
    'rf__max_depth': [None, 10, 20],
    'rf__min_samples_split': [2, 5, 10],
    'dt__max_depth': [None, 5, 10],
    'dt__min_samples_split': [2, 5, 10]
}

# Perform hyperparameter tuning using GridSearchCV.
# The grid has 3**8 = 6561 combinations, each evaluated on 4 folds (26244 fits
# of the stacked model), so the candidates are evaluated in parallel on all
# available cores (n_jobs=-1). "neg_mean_squared_error" is the built-in scorer
# equivalent to make_scorer(mean_squared_error, greater_is_better=False).
grid_search = GridSearchCV(
    stacked_model,
    param_grid,
    scoring="neg_mean_squared_error",
    cv=4,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Print the best parameters and best score
# (the score is the NEGATED mean squared error, so values closer to 0 are better)
print(f"Best Parameters: {grid_search.best_params_}")
print(f"Best Score: {grid_search.best_score_}")

# Make predictions on the test set (uses the best estimator refit on all training data)
stacked_predict_Test = grid_search.predict(X_test)

# Save predictions
df_stacked = pd.DataFrame({'Prediction': stacked_predict_Test})

# Export the DataFrame to a CSV file (without the row index)
export_csv = df_stacked.to_csv(r'/gdrive/My Drive/Stacked_Model_Predictions.csv', index=False)

