In [71]:
import pandas as pd
import numpy as np

# Pre-processing.
from sklearn.preprocessing import StandardScaler

# Dummy.
from sklearn.dummy import DummyRegressor

# Models.
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# Metrics.
from sklearn.metrics import make_scorer
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Evaluating.
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV


# Step 1: Load data


In [72]:
# Read the three pre-split partitions (training / test / validation) from parquet.
training_data, test_data, validation_data = map(
    pd.read_parquet,
    (
        './assets/training_set_v2.parquet',
        './assets/test_set_v2.parquet',
        './assets/validation_set_v2.parquet',
    ),
)


# Step 2: Standardize data


In [73]:
# Feature frames: drop the target ('events') together with 'ItemKey' and
# 'RWB_EFFECTIVE_DATE' so that y never leaks into X.
columns_to_drop = ['events', 'ItemKey', 'RWB_EFFECTIVE_DATE']
X_train, X_test, X_val = (
    frame.drop(columns=columns_to_drop, axis=1)
    for frame in (training_data, test_data, validation_data)
)

In [74]:
# Scale every column to mean=0 / std=1. The scaler learns its statistics from
# the training split only and is then applied unchanged to all three splits.
scaler = StandardScaler().fit(X_train)
X_train_std, X_test_std, X_val_std = (
    scaler.transform(split) for split in (X_train, X_test, X_val)
)

In [75]:
# Targets stay on their original scale (no standardization of y).
y_train, y_test, y_val = (
    frame['events'] for frame in (training_data, test_data, validation_data)
)


# Step 3: Instantiate dummy regressors


In [76]:
# Naive baselines that ignore X and always predict a constant derived from y_train.
dummy_regressor_mean, dummy_regressor_median = (
    DummyRegressor(strategy=strategy) for strategy in ('mean', 'median')
)
# Constant prediction at the 25th percentile of the training target.
dummy_regressor_quantile = DummyRegressor(quantile=0.25, strategy='quantile')


# Step 4: Define models


In [77]:
# Every candidate is keyed by the display name used in the results table.
# Insertion order is the row order of that table.
models = {
    'Dummy Mean': dummy_regressor_mean,
    'Dummy Median': dummy_regressor_median,
    'Dummy Quantile': dummy_regressor_quantile,
    'Linear Regression': LinearRegression(),
    'Lasso Regression': Lasso(),
    'Ridge Regression': Ridge(),
    'Elastic Net Regression': ElasticNet(),
    # The tree-based learners draw random numbers while fitting; without a
    # fixed seed their rows in the results table change on every run.
    'Decision Tree Regression': DecisionTreeRegressor(random_state=42),
    'Random Forest Regression': RandomForestRegressor(random_state=42),
    'Gradient Boosting Regression': GradientBoostingRegressor(random_state=42)
}


# Step 5: Evaluate each model


In [78]:
# One row per model: [name, MSE, RMSE, MAE], all measured on the test split.
test_results = []
for model_name, model in models.items():
    # Fit on the standardized training data, then score the held-out test data.
    predictions_test = model.fit(X_train_std, y_train).predict(X_test_std)

    mse_test = mean_squared_error(y_test, predictions_test)
    rmse_test, mae_test = (
        np.sqrt(mse_test),
        mean_absolute_error(y_test, predictions_test),
    )

    test_results.append([model_name, mse_test, rmse_test, mae_test])

    # if model_name in ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression', 'Decision Tree Regression', 'Random Forest Regression', 'Gradient Boosting Regression']:
    #     if hasattr(model, 'coef_'):
    #         feature_importances = model.coef_
    #         sorted_indices = np.argsort(np.abs(feature_importances))[::-1][:5]
    #     elif hasattr(model, 'feature_importances_'):
    #         feature_importances = model.feature_importances_
    #         sorted_indices = np.argsort(feature_importances)[::-1][:5]
    #     else:
    #         sorted_indices = None

    #     if sorted_indices is not None:
    #         print(f'Feature importances for {model_name}:')
    #         for idx in sorted_indices:
    #             feature_name = X_train.columns[idx]
    #             importance = feature_importances[idx]
    #             print(f'{feature_name}: {importance}')


# Step 6: Load results into a DataFrame


In [79]:
def bold_below_threshold(val, threshold=1.41):
    """Return the CSS that emboldens a cell whose value is at or below a cut-off.

    Written as a per-cell callback for the pandas Styler (it is passed to
    ``applymap`` in this notebook), so it must keep working when called with
    ``val`` alone.

    :param val: Cell value to test.
    :param threshold: Inclusive upper bound for bolding. Defaults to 1.41,
        the cut-off that used to be hard-coded in the body.
    :return: ``'font-weight: bold'`` when ``val <= threshold``, otherwise ``''``.
    """
    return 'font-weight: bold' if val <= threshold else ''

# Tabulate the per-model test metrics collected in test_results.
metric_columns = ['Model', 'Test MSE', 'Test RMSE', 'Test MAE']
test_metrics_df = pd.DataFrame(test_results, columns=metric_columns)

# Display: index by model name, two decimals, bold every cell that passes
# bold_below_threshold.
# NOTE(review): newer pandas releases reportedly rename Styler.applymap to
# Styler.map - confirm against the installed version before switching.
(
    test_metrics_df
    .set_index('Model')
    .style
    .format(precision=2)
    .applymap(bold_below_threshold)
)
# Not as good as Dummy Quantile, but superior to Dummy Median.
# Almost in between the two.
# MAEs much closer to Quantile vs. MSEs.

Unnamed: 0_level_0,Test MSE,Test RMSE,Test MAE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Dummy Mean,3.68,1.92,1.81
Dummy Median,2.38,1.54,1.4
Dummy Quantile,1.1,1.05,0.78
Linear Regression,7261170719898101.0,85212503.31,563581.62
Lasso Regression,3.68,1.92,1.81
Ridge Regression,2.81,1.67,1.44
Elastic Net Regression,3.66,1.91,1.8
Decision Tree Regression,6.08,2.47,1.14
Random Forest Regression,2.37,1.54,1.12
Gradient Boosting Regression,2.42,1.55,1.34


In [80]:
# # Include 5 fold cross val
# test_results2 = []
# for model_name, model in models.items():
#     cv_scores = cross_val_score(model, X_train_std, training_data['events'], scoring='neg_mean_squared_error', cv=5)
#     mse_cv = -np.mean(cv_scores)
#     rmse_cv = np.sqrt(mse_cv)
#     mae_cv = np.mean(cross_val_score(model, X_train_std, training_data['events'], scoring='neg_mean_absolute_error', cv=5))

#     model.fit(X_train_std, training_data['events'])
#     predictions_test = model.predict(X_test_std)
#     mse_test = mean_squared_error(test_data['events'], predictions_test)
#     rmse_test = np.sqrt(mse_test)
#     mae_test = mean_absolute_error(test_data['events'], predictions_test)

#     # if model_name in ['Linear Regression', 'Lasso Regression', 'Ridge Regression', 'Elastic Net Regression', 'Decision Tree Regression', 'Random Forest Regression', 'Gradient Boosting Regression']:
#     #     if hasattr(model, 'coef_'):  
#     #         feature_importances = model.coef_
#     #         sorted_indices = np.argsort(np.abs(feature_importances))[::-1][:5]  
#     #     elif hasattr(model, 'feature_importances_'):  
#     #         feature_importances = model.feature_importances_
#     #         sorted_indices = np.argsort(feature_importances)[::-1][:5]
#     #     else:
#     #         sorted_indices = None

#     #     if sorted_indices is not None:
#     #         print(f'Feature importances for {model_name}:')
#     #         for idx in sorted_indices:
#     #             feature_name = X_train.columns[idx]
#     #             importance = feature_importances[idx]
#     #             print(f'{feature_name}: {importance}')

#     test_results2.append([model_name, mse_test, rmse_test, mae_test, mse_cv, rmse_cv, mae_cv])

# test_metrics2_df = pd.DataFrame(test_results2, columns=['Model', 'Test MSE', 'Test RMSE', 'Test MAE', 'CV MSE', 'CV RMSE', 'CV MAE'])
# test_metrics2_df


___
# Hyperparameter tuning



# Original


In [81]:
# # Define hyperparameter grid for grid search
# param_grid = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

In [82]:
# # Define hyperparameter distributions for random search
# param_dist = {
#     'n_estimators': [50, 100, 200],
#     'max_depth': [None, 10, 20],
#     'min_samples_split': [2, 5, 10],
#     'min_samples_leaf': [1, 2, 4]
# }

In [83]:
# Split data into training and validation sets
# RJ 08/12: Cannot split data that has already been standardized: the scaler was fit on
# rows that would end up in the validation split, which is data leakage.
# X_train, X_val, y_train, y_val = train_test_split(
#     X_train_std,
#     training_data['events'],
#     test_size=0.2,
#     random_state=42
# )

In [84]:
# # Initialize Random Forest model.
# rf_model = RandomForestRegressor(random_state=42)

In [85]:
# # Random Search
# random_search = RandomizedSearchCV(
#     rf_model,
#     param_distributions=param_dist,
#     n_iter=10,
#     scoring='neg_mean_squared_error',
#     cv=5,
#     random_state=42
# )
# random_search.fit(X_train, y_train)
# best_params_random = random_search.best_params_
# best_rf_random = random_search.best_estimator_

In [86]:
# # Grid Search
# grid_search = GridSearchCV(
#     rf_model,
#     param_grid,
#     scoring='neg_mean_squared_error',
#     cv=5
# )
# grid_search.fit(X_train, y_train)
# best_params_grid = grid_search.best_params_
# best_rf_grid = grid_search.best_estimator_

In [87]:
# # Evaluate best models on validation data
# y_val_pred_grid = best_rf_grid.predict(X_val)
# mse_val_grid = mean_squared_error(y_val, y_val_pred_grid)
#
# y_val_pred_random = best_rf_random.predict(X_val)
# mse_val_random = mean_squared_error(y_val, y_val_pred_random)
#
# print(f'Grid Search - Best Hyperparameters: {best_params_grid}, Validation MSE: {mse_val_grid}')
# print(f'Random Search - Best Hyperparameters: {best_params_random}, Validation MSE: {mse_val_random}')


# New


In [88]:
# List of selected features
# selected_features = [
#     'Days Since Creation',
#     'Days Since Last Logon',
#     'BIOSReleaseAge',
#     'LastBootAge',
#     'avg_software_age',
#     'FreeSpace_GB',
#     'num_installed_programs',
#     'Outlookx86_addin_filesize',
#     'Outlookx64_addin_filesize',
#     'Excelx86_addin_filesize',
#     'PowerPointx86_addin_filesize',
#     'Wordx64_addin_filesize',
#     'has_cap_iq_add',
#     'has_factset_add',
#     'InstallAge',
#     'num_users',
#     'num_updates',
#     'Total RAM'
# ]

# Active feature subset (12 columns) used to build the *_trimmed frames.
# The list order becomes the column order of those frames.
selected_features = [
    'Days Since Creation',
    'avg_software_age',
    'FreeSpace_GB',
    'Outlookx86_addin_filesize',
    'Wordx64_addin_filesize',
    'Days Since Last Logon',
    'num_installed_programs',
    'Outlookx64_addin_filesize',
    'InstallAge',
    'LastBootAge',
    'Excelx86_addin_filesize',
    'has_cap_iq_add'
]

In [89]:
# Restrict each (unscaled) feature frame to the selected columns.
X_train_trimmed, X_test_trimmed, X_val_trimmed = (
    features[selected_features] for features in (X_train, X_test, X_val)
)

In [90]:
# Sanity check: rows differ per split, column count must equal len(selected_features).
for trimmed in (X_train_trimmed, X_test_trimmed, X_val_trimmed):
    print(trimmed.shape)

(78592, 12)
(45722, 12)
(45723, 12)


In [91]:
# def rmse_scorer(y_true, y_pred):
#     """Scores by Root Mean Square Error for cross_val_score()"""
#     mse = mean_squared_error(y_true, y_pred)
#     return np.sqrt(mse)


# Define the custom scoring function
def weighted_mae_fun(y_true, y_pred):
    """Weighted mean absolute error (WMAE), usable with make_scorer().

    Every absolute error is weighted by the *true* number of events:

    * ``y_true == 0`` -> weight 0.5
    * ``y_true == 1`` -> weight 1
    * any other value (2 or more events for count data) -> weight 3

    Both inputs are converted to NumPy arrays before subtracting so that the
    comparison is strictly positional. Two pandas Series with different
    indexes would otherwise be aligned on their labels, which turns the
    errors into NaN and the score into a silent 0.0.

    :param y_true: Array-like of observed values.
    :param y_pred: Array-like of predictions, same length as ``y_true``.
    :return: ``sum(weight * |y_true - y_pred|) / sum(weight)``; lower is
        better. Empty input yields NaN (0 / 0).
    """
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    errors = np.abs(y_true - y_pred)
    sample_weights = np.where(y_true == 0, 0.5, np.where(y_true == 1, 1, 3))

    # The ratio is already a scalar, so no further averaging is needed.
    return np.sum(sample_weights * errors) / np.sum(sample_weights)


#### RF: RandomizedSearchCV


In [92]:
# Candidate values sampled by RandomizedSearchCV on a RandomForestRegressor,
# used to narrow down a promising region of the hyperparameter space.
n_estimators = np.linspace(start=200, stop=600, num=5).astype(int).tolist()  # trees in the forest
max_features = ['sqrt', 'log2']  # features considered at every split

# Maximum number of levels per tree; None is included as an extra candidate.
max_depth = [*np.linspace(50, 300, num=6).astype(int).tolist(), None]

min_samples_split = [2, 3, 5, 10, 20, 40]  # samples required to split a node
min_samples_leaf = [1, 3, 5, 10, 20, 40, 60, 80]  # samples required at a leaf
bootstrap = [True, False]  # whether each tree is trained on a bootstrap sample

# Assemble the search space and echo it for the record.
random_grid = dict(
    n_estimators=n_estimators,
    max_features=max_features,
    max_depth=max_depth,
    min_samples_split=min_samples_split,
    min_samples_leaf=min_samples_leaf,
    bootstrap=bootstrap,
)
for name, candidates in random_grid.items():
    print(name, candidates)

n_estimators [200, 300, 400, 500, 600]
max_features ['sqrt', 'log2']
max_depth [50, 100, 150, 200, 250, 300, None]
min_samples_split [2, 3, 5, 10, 20, 40]
min_samples_leaf [1, 3, 5, 10, 20, 40, 60, 80]
bootstrap [True, False]


In [93]:
def evaluate_wmae(model, x, y):
    """
    Score a fitted model with weighted_mae_fun.

    The model predicts from ``x`` and those predictions are compared with
    ``y``. The score is printed to two decimals and also returned.

    :param model: Fitted estimator exposing ``predict``.
    :param x: Features to predict from.
    :param y: True target values matching ``x``.
    :return: The weighted mean absolute error (lower is better).
    """
    score = weighted_mae_fun(y, model.predict(x))
    print('Weighted Mean Absolute Error: {:0.2f}.'.format(score))
    return score

In [94]:
# Baseline: a small untuned forest (10 trees, Poisson criterion) scored on the
# validation split. No random_state is set, so this number moves between runs.
base_rf = RandomForestRegressor(criterion='poisson', n_estimators=10).fit(
    X_train_trimmed, y_train
)
# Despite the name, this holds a WMAE (an error: lower is better).
base_accuracy = evaluate_wmae(base_rf, X_val_trimmed, y_val)

Weighted Mean Absolute Error: 1.45.


In [None]:
# Base model to tune. It is seeded as well as the search: with only the search
# seeded, the same candidates are drawn but each forest is grown differently,
# so the CV scores and the winning parameters change from run to run.
rf = RandomForestRegressor(criterion='poisson', random_state=42)
# Random search of parameters, using 3-fold cross validation.
rf_random = RandomizedSearchCV(
    estimator=rf,
    param_distributions=random_grid,
    n_iter=100, # Search across n_iter different random combinations.
    cv=3,
    # WMAE is an error, so greater_is_better=False makes the search minimize it.
    scoring=make_scorer(weighted_mae_fun, greater_is_better=False),
    random_state=42,
    n_jobs=-1, # Use all available cores.
    verbose=2,
)
rf_random.fit(X_train_trimmed, y_train) # Fit the random search model
rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits
[CV] END bootstrap=False, max_depth=50, max_features=sqrt, min_samples_leaf=60, min_samples_split=5, n_estimators=300; total time= 1.7min
[CV] END bootstrap=False, max_depth=150, max_features=sqrt, min_samples_leaf=20, min_samples_split=2, n_estimators=400; total time= 3.4min
[CV] END bootstrap=False, max_depth=None, max_features=sqrt, min_samples_leaf=5, min_samples_split=2, n_estimators=400; total time= 3.8min




In [26]:
# Score the winner of the random search on the validation split (WMAE, lower is better).
best_random = rf_random.best_estimator_
random_accuracy = evaluate_wmae(model=best_random, x=X_val_trimmed, y=y_val)

Weighted Mean Absolute Error: 1.12.


In [None]:
# WMAE: 1.12 from n_iter=100:
# {'n_estimators': 400,
#  'min_samples_split': 2,
#  'min_samples_leaf': 5,
#  'max_features': 'log2',
#  'max_depth': 250,
#  'bootstrap': False}


#### RF: GridSearchCV


In [95]:
# Start the grid from a shallow copy of the random-search winner, so later
# edits to param_grid never touch rf_random.best_params_.
param_grid = dict(rf_random.best_params_)
param_grid

{'n_estimators': 400,
 'min_samples_split': 2,
 'min_samples_leaf': 5,
 'max_features': 'log2',
 'max_depth': 250,
 'bootstrap': False}

In [96]:
# RJ 08/12: This took 8 mins. to run
# param_grid_rf = {
#     'n_estimators': [100, 200, 300],
#     'max_depth': [5, 10, 20],
#     'min_samples_split': [2, 5, 10]
# }

In [97]:
def build_params(begin, end, amount, var, grid=None):
    """
    Build the list of candidate values around one tuned hyperparameter.

    The current value of ``var`` is the centre of the list; ``begin`` values
    are generated below it and ``end - 1`` values above it, each ``amount``
    apart. Candidates that scikit-learn would reject are never returned:
    'min_samples_split' must be greater than 1, and 'n_estimators',
    'min_samples_leaf' and 'max_depth' must be greater than 0.

    :param begin: Number of steps to generate below the current value.
    :param end: One more than the number of steps to generate above the current value.
    :param amount: By how much to shift for each step.
    :param var: Which variable within the grid to build values for.
    :param grid: Mapping of variable name to its current scalar value.
        Defaults to the module-level ``param_grid``.
    :return: A list of new values to enter into a dictionary that is passed to grid search.
    :raises TypeError: If the current value is already a list or tuple, i.e. the
        grid was expanded before and must be reset to scalars first.
    """
    source = param_grid if grid is None else grid
    current = source[var]

    if isinstance(current, (list, tuple)):
        raise TypeError(
            f"{var!r} already holds a list of candidates ({current!r}); "
            "reset the grid to the scalar best parameters before expanding it again."
        )

    if var == 'max_depth' and current is None:
        return [None]
    # Checked before the generic `== 1` case, which would otherwise hand back
    # the invalid value 1 for min_samples_split.
    if var == 'min_samples_split' and current < 2:
        return [2, 3, 4]
    if current == 1:
        return [1, 2, 3]
    if var == 'max_depth' and current < 30:
        return [10, 20, 30, 40, 50]

    candidates = [current - (amount * step) for step in range(begin, 0, -1)]
    candidates.append(current)
    candidates.extend(current + (amount * step) for step in range(1, end))

    # Smallest value accepted for each known variable; unknown variables are
    # returned unfiltered.
    minimums = {
        'n_estimators': 1,
        'min_samples_split': 2,
        'min_samples_leaf': 1,
        'max_depth': 1,
    }
    lowest = minimums.get(var)
    if lowest is None:
        return candidates
    return [value for value in candidates if value >= lowest]

# Take what RandomizedSearchCV found and expand the space around those variables to pass to GridSearchCV.
# build_params() reads the *scalar* best values out of param_grid, and the update below
# replaces them with lists. Re-seed from the search result first so that this cell can be
# re-run without feeding those lists back into build_params().
param_grid = rf_random.best_params_.copy()
param_grid.update({
    'n_estimators': build_params(1, 2, 50, var='n_estimators'),
    'min_samples_split': build_params(0, 3, 3, var='min_samples_split'),
    'min_samples_leaf': build_params(0, 3, 3, var='min_samples_leaf'),
    'max_features': [param_grid['max_features']],
    'max_depth': build_params(1, 2, 10, var='max_depth'),
    'bootstrap': [param_grid['bootstrap']]
})

# param_grid['n_estimators'].extend([int(param_grid['n_estimators'][-1] * 1.5)]) # Add a relatively large value.
# param_grid['max_depth'].append(None)
param_grid

{'n_estimators': [350, 400, 450],
 'min_samples_split': [2, 5, 8],
 'min_samples_leaf': [5, 8, 11],
 'max_features': ['log2'],
 'max_depth': [240, 250, 260],
 'bootstrap': [False]}

In [70]:
# 81 candidates x 3 folds = 243 fits. The original note recorded roughly 8 minutes
# at n_estimators=400.
# The estimator is seeded so that repeated runs score every candidate on identically
# grown forests; otherwise the ranking of near-equal candidates can flip between runs.
base_rf_regressor = RandomForestRegressor(criterion='poisson', random_state=42)
grid_search_rf = GridSearchCV(
    base_rf_regressor,
    param_grid,
    # WMAE is an error, so greater_is_better=False makes the search minimize it.
    scoring=make_scorer(weighted_mae_fun, greater_is_better=False),
    cv=3,
    n_jobs=-1,
    verbose=2
)
grid_search_rf.fit(X_train_trimmed, y_train)
best_rf_model_tuned = grid_search_rf.best_estimator_
best_rf_model_tuned

Fitting 3 folds for each of 81 candidates, totalling 243 fits


KeyboardInterrupt: 

In [69]:
# WMAE of the grid-search winner on the validation split (an error: lower is better).
gridsearch_accuracy = evaluate_wmae(
    model=best_rf_model_tuned,
    x=X_val_trimmed,
    y=y_val,
)

Weighted Mean Absolute Error: 1.12.



#### GB


In [None]:
# param_grid_gb = {
#     'n_estimators': [100, 200, 300],
#     'learning_rate': [0.01, 0.1, 0.2],
#     'max_depth': [3, 4, 5]
# }

In [None]:
# base_gbr_regressor = GradientBoostingRegressor()
# grid_search_gb = GridSearchCV(
#     base_gbr_regressor,
#     param_grid_gb,
#     scoring=make_scorer(weighted_mae_fun),
#     cv=3,
#     n_jobs=-1
# )
# grid_search_gb.fit(X_train_trimmed, y_train)
# best_gb_model_tuned = grid_search_gb.best_estimator_
# best_gb_model_tuned


#### Results


In [54]:
# Validation-split predictions from the tuned random forest.
predictions_rf_tuned = best_rf_model_tuned.predict(X=X_val_trimmed)
# predictions_gb_tuned = best_gb_model_tuned.predict(X_val_trimmed)

In [55]:
# RF: unweighted error metrics of the tuned forest on the validation split.
# NOTE(review): the saved output of this cell is a sample-count mismatch
# (45722 vs 45723), presumably from stale kernel state - confirm by
# re-running from a fresh kernel.
mse_rf_tuned, mae_rf_tuned = (
    metric(y_val, predictions_rf_tuned)
    for metric in (mean_squared_error, mean_absolute_error)
)
rmse_rf_tuned = np.sqrt(mse_rf_tuned)

ValueError: Found input variables with inconsistent numbers of samples: [45722, 45723]

In [None]:
# GB
# mse_gb_tuned = mean_squared_error(y_val, predictions_gb_tuned)
# rmse_gb_tuned = np.sqrt(mse_gb_tuned)
# mae_gb_tuned = mean_absolute_error(y_val, predictions_gb_tuned)

In [None]:
# Report the tuned forest's validation RMSE and MAE, one per line.
print(
    "Random Forest Regression (Tuned) on Trimmed Dataset:",
    f"RMSE: {rmse_rf_tuned}",
    f"MAE: {mae_rf_tuned}",
    sep="\n",
)

In [None]:
# print("\nGradient Boosting Regression (Tuned) on Trimmed Dataset:")
# print(f"RMSE: {rmse_gb_tuned}")
# print(f"MAE: {mae_gb_tuned}")