# Import Libraries/Data Cleaning

In [4]:
# System utilities
from pathlib import Path
import warnings

# Data manipulation and analysis
import pandas as pd
import numpy as np

# Deep Learning
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint

# Traditional ML
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.metrics import (
    classification_report, 
    confusion_matrix, 
    accuracy_score, 
    roc_curve, 
    auc
)

# XGBoost
import xgboost as xgb

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns




In [None]:
# Load the collected dataset into a DataFrame
# NOTE: 'loremipsum' is a placeholder path -- replace it with the real CSV file
df = pd.read_csv(filepath_or_buffer='loremipsum')

In [None]:
# Inspect the dtype pandas inferred for every column.
# The models below need purely numerical input, so any non-numeric column
# (e.g. dtype `object`) listed here has to be encoded first.
df.dtypes

### To clean up the data, we want to ensure that all of the collected data is numerical.
- If a column has only 2 distinct values, use Label Encoding
- If a column has more than 2 distinct values, use OneHotEncoding
  - OHE replaces the column with a set of new columns (one per category) that contain only 0 or 1

In [None]:
#Label Encoding
# 'colname' is a placeholder for the two-category column to encode

le = LabelEncoder()

# Learn the categories first, then overwrite the column with their integer codes
# (0 and 1 when the column holds exactly two distinct values)
le.fit(df['colname'])
df['colname'] = le.transform(df['colname'])

In [None]:
#One Hot Encoding
# The 'colname' entries are placeholders for the categorical columns to encode

ohe_cols = ['colname', 'colname']

ohe = OneHotEncoder()

# Label the encoded columns with the encoder's feature names (e.g. 'color_red')
# instead of the default 0..k integers, and reuse df's index so pd.concat lines
# the rows up instead of padding mismatched labels with NaN.
ohencoded_df = pd.DataFrame(
    ohe.fit_transform(df[ohe_cols]).toarray(),
    columns = ohe.get_feature_names_out(),
    index = df.index
)

# Swap the original categorical columns for their encoded versions so that df
# contains only numerical data and every column name is a string
# (scikit-learn rejects frames whose column names mix ints and strings).
# NOTE: this cell consumes the source columns, so it can only be run once per load.
df = pd.concat([df.drop(columns = ohe_cols), ohencoded_df], axis = 1)

# Preparation for Models/Neural Network
- Set up 'X' (features) and 'y' (target)
- If needed, utilize scaling/encoding
- Split the data into training and test sets

In [None]:
# Feature matrix: everything in df except the column being predicted
# ('colname' and df are placeholders for the actual data that will be used)
X = df.drop('colname', axis = 1)
X.head()

In [None]:
# Target vector: the single column the models will learn to predict
# ('colname' and df are placeholders)
y = df.loc[:, 'colname']
y.head()

In [None]:
# Split features and target into training and test partitions.
# random_state is fixed so the same rows land in each partition on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state = 60
)

# Preview the first five rows of the training partition
display(X_train.head(), y_train.head())

In [None]:
#Standard Scaler
# Learn the scaling statistics from the training set only, then apply the same
# transformation to both partitions so no test information reaches the scaler.
scale = StandardScaler()
scale.fit(X_train)

X_train_scaled = scale.transform(X_train)
X_test_scaled = scale.transform(X_test)

# Preview the first five scaled rows of each partition
display(X_train_scaled[:5], X_test_scaled[:5])

# Gradient Boost + Neural Network

In [None]:
# Gradient-boosted tree regressor trained with a squared-error objective.
# random_state is fixed so the row/column subsampling is repeatable.
xgbr = xgb.XGBRegressor(
    n_estimators = 100,
    learning_rate = 0.1,
    max_depth = 6,
    subsample = 0.8,
    colsample_bytree = 0.8,
    objective = "reg:squarederror",
    random_state = 60,
)

# Fit on the scaled training features
xgbr.fit(X = X_train_scaled, y = y_train)

In [None]:
# Out-of-fold predictions for the training rows: each row is predicted by a clone
# of xgbr that was fitted without that row. In-sample predictions (calling
# xgbr.predict on the very rows it was fitted on) are over-optimistic and would
# teach the neural network to trust this feature more than it can on unseen data.
xgb_train_pred = cross_val_predict(xgbr, X_train_scaled, y_train, cv = 5)

# The test rows were never used to fit xgbr, so the fitted model predicts them directly.
xgb_test_pred = xgbr.predict(X_test_scaled)


In [None]:
# Append the XGBoost prediction as one extra feature column for the neural network.
# Both matrices must be built from the *scaled* features so the network receives
# test inputs on the same scale as the inputs it was trained on.
X_train_nn = np.column_stack((X_train_scaled, xgb_train_pred))
X_test_nn = np.column_stack((X_test_scaled, xgb_test_pred))

In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation and batch shuffling
# (and therefore the reported MSE) are repeatable between runs.
keras.utils.set_random_seed(60)

# Small fully connected regressor whose inputs are the scaled features plus the
# XGBoost prediction column. The input width is declared with an explicit Input
# object rather than the legacy `input_dim` argument on the first Dense layer.
nn_model = Sequential([
    keras.Input(shape=(X_train_nn.shape[1],)),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(1),  # single linear output for regression
])

nn_model.compile(optimizer='adam', loss='mean_squared_error')

# Keep the History object so the per-epoch train/validation losses can be inspected
# or plotted afterwards.
# NOTE: the test set is passed here only to monitor val_loss. If it is ever used to
# drive EarlyStopping or checkpoint selection, hold out a separate validation split
# so the final test MSE stays unbiased.
history = nn_model.fit(
    X_train_nn,
    y_train,
    epochs=50,
    batch_size=32,
    validation_data=(X_test_nn, y_test),
)


In [None]:
# Predict the test partition with the stacked network
y_pred_nn = nn_model.predict(X_test_nn)

# Mean squared error between the true targets and the network's predictions
mse_nn = mean_squared_error(y_true = y_test, y_pred = y_pred_nn)
mse_nn