In [1]:
import tensorflow as tf
from tensorflow import keras
import pandas as pd
import import_ipynb
from data_prep import data_pipeline

In [2]:
print(f"TensorFlow Version: {tf.__version__}")

TensorFlow Version: 2.18.0


In [3]:
data_filepath = "./data/data.csv"

In [4]:
df = pd.read_csv(data_filepath)

In [5]:
X_train, X_test, y_train, y_test = data_pipeline(df)

In [6]:
model = tf.keras.models.Sequential([
    tf.keras.layers.Dense(64, input_dim=4, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.BatchNormalization(),
    tf.keras.layers.Dense(1)
])

model.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae']
)

model.fit(X_train, y_train, epochs=100, batch_size=1, verbose=1)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 304us/step - loss: 11307.0098 - mae: 70.1478
Epoch 2/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 296us/step - loss: 3819.4836 - mae: 52.3504
Epoch 3/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 296us/step - loss: 3782.8611 - mae: 51.8402
Epoch 4/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 295us/step - loss: 3709.3777 - mae: 51.2058
Epoch 5/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 296us/step - loss: 3657.0771 - mae: 50.8403
Epoch 6/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 300us/step - loss: 3585.5996 - mae: 50.0616
Epoch 7/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 302us/step - loss: 3567.9258 - mae: 49.7031
Epoch 8/100
[1m38438/38438[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 302us/step - loss: 3496.1821 - mae: 48.

<keras.src.callbacks.history.History at 0x16a4d0150>

In [17]:
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error

model_svr = SVR(kernel='rbf', C=1, epsilon=0.1)
model_svr.fit(X_train, y_train)
y_pred_svr = model_svr.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_svr)
print(f'Mean Absolute Error (SVR): {mae}')

Mean Absolute Error (SVR): 47.55833895434376


In [21]:
from sklearn.neighbors import KNeighborsRegressor

model_knn = KNeighborsRegressor(n_neighbors=5)
model_knn.fit(X_train, y_train)
y_pred_knn = model_knn.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_knn)
print(f'Mean Absolute Error (KNN): {mae}')

Mean Absolute Error (KNN): 47.603629178440436


In [19]:
from sklearn.linear_model import ElasticNet

model_en = ElasticNet(alpha=0.1, l1_ratio=0.5)
model_en.fit(X_train, y_train)
y_pred_en = model_en.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_en)
print(f'Mean Absolute Error (ElasticNet): {mae}')

Mean Absolute Error (ElasticNet): 47.66374508467676


In [40]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

# Define the parameter grid
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

# Instantiate the model
rf = RandomForestRegressor()

# Set up GridSearchCV
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=3, scoring='neg_mean_absolute_error')

# Fit the model
grid_search.fit(X_train, y_train)

# Get the best parameters and best score
best_params = grid_search.best_params_
best_score = grid_search.best_score_

print(f"Best Parameters: {best_params}")
print(f"Best Score: {best_score}")

KeyboardInterrupt: 

In [45]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import joblib

# Define the parameter grid (optimized for speed)
param_dist = {
    'n_estimators': np.arange(100, 400, 100),
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2'],
    'bootstrap': [True]
}

# Instantiate the model
rf = RandomForestRegressor()

# Set up RandomizedSearchCV with verbose output
random_search = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=param_dist, 
    n_iter=100,  # Faster search
    cv=3, 
    scoring='neg_mean_absolute_error', 
    random_state=42, 
    n_jobs=-1,  # Use all CPU cores
    verbose=2  # Print each iteration
)

# Fit with checkpointing
try:
    random_search.fit(X_train, y_train)
    joblib.dump(random_search, "random_search_checkpoint.pkl")  # Save progress
    print("\nSearch completed!")
except KeyboardInterrupt:
    print("\nRandomizedSearch stopped manually. Saving progress...")
    joblib.dump(random_search, "random_search_checkpoint.pkl")  # Save on interrupt

# Load previous checkpoint if needed
try:
    random_search = joblib.load("random_search_checkpoint.pkl")
    print(f"\nBest Parameters so far: {random_search.best_params_}")
    print(f"Best Score so far: {random_search.best_score_}")
except:
    print("No checkpoint found or RandomizedSearch did not complete any iterations.")


Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  10.6s
[CV] END bootstrap=True, max_depth=20, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  10.7s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  11.7s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  11.8s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=200; total time=  11.9s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=4, min_samples_split=5, n_estimators=300; total time=  14.1s
[CV] END bootstrap=True, max_depth=None, max_features=sqrt, min_samples_leaf=4, min_samples_split=5, n_estimators



[CV] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=2, min_samples_split=5, n_estimators=200; total time=  10.2s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  10.4s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  10.1s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=1, min_samples_split=2, n_estimators=300; total time=  10.3s
[CV] END bootstrap=True, max_depth=10, max_features=log2, min_samples_leaf=4, min_samples_split=10, n_estimators=100; total time=   3.3s
[CV] END bootstrap=True, max_depth=None, max_features=log2, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   5.1s
[CV] END bootstrap=True, max_depth=20, max_features=log2, min_samples_leaf=4, min_samples_split=2, n_estimators=300; total time=  13.7s
[CV] END bootstrap=True, max_depth=20, max_fe

In [46]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Train with optimal parameters
model_rf = RandomForestRegressor(
    n_estimators=300,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=20,
    bootstrap=True
)

model_rf.fit(X_train, y_train)
y_pred_rf = model_rf.predict(X_test)
mae = mean_absolute_error(y_test, y_pred_rf)
print(f'Mean Absolute Error (Random Forest): {mae}')


Mean Absolute Error (Random Forest): 41.479121244863684


In [47]:
joblib.dump(model_rf, 'random_forest_model.pkl')

['random_forest_model.pkl']

In [48]:
loaded_model = joblib.load('random_forest_model.pkl')
y_pred = loaded_model.predict(X_test)

In [49]:
mae = mean_absolute_error(y_test, y_pred)
mae

41.479121244863684