In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
!pip install category_encoders

Installing collected packages: category_encoders
Successfully installed category_encoders-2.6.2


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from category_encoders import TargetEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score, mean_absolute_error
import joblib

In [None]:
# Change the file path to your input file
file_path = '/content/drive/My Drive/6301.501/df1WithTimeBin.csv'
# column names required from input file
columns_required = ['Day1 of the Week', 'Time Bin', 'Zip Code', 'Division', 'Sector', 'Incident_Score']
df = pd.read_csv(file_path, usecols=columns_required)
# columns from df that are required to be considered as features
columns_considered = ['Day1 of the Week', 'Time Bin', 'Zip Code', 'Division', 'Sector']

X = df[columns_considered]
y = df['Incident_Score']

In [None]:
print(y.head(),"\n", X.head())

In [None]:
# pipeline for 4 encoding types is created. comment the encoding types not required
encodings = {
    'one_hot': ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), columns_considered)]),
    'label': 'label',
    'target': 'target',
    'frequency':'frequency'
}

#Onehotencoding (ignore this section)


In [None]:
# preprocessor = ColumnTransformer(transformers=[('cat', OneHotEncoder(handle_unknown='ignore'), Colums_considered)])

# X_transformed = preprocessor.fit_transform(df[Colums_considered])

# X_transformed_df = pd.DataFrame(X_transformed.toarray(), columns=preprocessor.named_transformers_['cat'].get_feature_names_out(Colums_considered))

# X_train, X_test, y_train, y_test = train_test_split(X_transformed_df, y, test_size=0.2, random_state=42)


#Label Encoding (ignore this section)

In [None]:
# from sklearn.preprocessing import LabelEncoder

# label_encoders = {col: LabelEncoder() for col in Colums_considered}

# # Fit and transform the data to label encoding
# for col, encoder in label_encoders.items():
#     df[col] = encoder.fit_transform(df[col])

# X = df[Colums_considered]
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# print(X_train.head())
# print(y_test.head())
# print(y_train.head())

#Target Encoding (ignore this section)

In [None]:
# from category_encoders import TargetEncoder
# from sklearn.compose import ColumnTransformer

# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# target_encoder = TargetEncoder(cols=Colums_considered)

# # Fit the encoder on the training data and transform both the training and test data
# X_train_encoded = target_encoder.fit_transform(X_train, y_train)
# X_test_encoded = target_encoder.transform(X_test)

# X_train = X_train_encoded
# X_test = X_test_encoded

# print(X_train.head())

#Frequency/Count Encoding (ignore this section)

In [None]:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# def frequency_encoding(df, column):
#     freq = df[column].value_counts(normalize=True)
#     df[column] = df[column].map(freq)
#     return df

# # Apply frequency encoding to the specified columns
# for col in Colums_considered:
#     X_train = frequency_encoding(X_train, col)
#     X_test = frequency_encoding(X_test, col)

#pipeline

In [None]:
# pipeline for five regression models is created. comment the ones that are not required

pipelines = {
    'rf':make_pipeline(RandomForestRegressor(random_state=1234)),
    'gb':make_pipeline(GradientBoostingRegressor(random_state=1234)),
    # 'ridge':make_pipeline(Ridge(random_state=1234)),
    # 'lasso':make_pipeline(Lasso(random_state=1234)),
    # 'enet':make_pipeline(ElasticNet(random_state=1234))

}

hypergrid = {
    'rf':{
        'randomforestregressor__min_samples_split':[50],
        'randomforestregressor__min_samples_leaf':[50]
    # #     # 'randomforestregressor__min_samples_split':[2,4,6],
    # #     # 'randomforestregressor__min_samples_leaf':[1,2,3]
    },
    'gb': {
        'gradientboostingregressor__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    # 'ridge':{
    #   'ridge__alpha':[0.001,0.005,0.01,0.05,0.1]
    # },
    # 'lasso':{
    #   'lasso__alpha':[0.001,0.005,0.01,0.05,0.1,0.5,0.99]
    # },
    # 'enet':{
    #     'elasticnet__alpha' : [0.001,0.005,0.01,0.05,0.1,0.5,0.99]
    # }
}

#Testing regression scores with kaggle datasets (ignore this section)

In [None]:
# file_path = '/content/drive/My Drive/6301.501/dataset_with_safety_scores.csv'
# columns_required = ['Zip-Time-Percentage', 'Zipcode-Day-Percentage', 'Zipcode-Percentage', 'Incident_Score']
# df = pd.read_csv(file_path, usecols=columns_required)

# columns_considered = ['Zip-Time-Percentage', 'Zipcode-Day-Percentage', 'Zipcode-Percentage']
# # Columns_considered = ['Day1 of the Week', 'Time Bin', 'Zip Code']
# X = df[columns_considered]
# y = df['Incident_Score']

In [None]:
results = pd.DataFrame()

In [None]:

def save_results(results, y_test, y_pred, model_name, encoding_name):
    results[f'{model_name}_{encoding_name}_y_pred'] = y_pred
    results[f'{model_name}_{encoding_name}_y_test'] = y_test


def frequency_encoding(df, column):
    freq = df[column].value_counts(normalize=True)
    df[column] = df[column].map(freq)
    return df


# Loop through the encoding methods and models
for encoding_name, encoder in encodings.items():

    # Copy the original X data for each encoding iteration
    X_encoded = X.copy()
    # Encode the features
    if encoder == 'label':
        X_encoded = X.copy()
        label_encoders = {col: LabelEncoder() for col in columns_considered}
        for col, label_encoder in label_encoders.items():
            X_encoded[col] = label_encoder.fit_transform(X[col])
    elif encoder == 'target':
        X_encoded = TargetEncoder(cols=columns_considered).fit_transform(X, y)
    elif encoder == 'frequency':
        for col in columns_considered:
            X_encoded = frequency_encoding(X_encoded, col)
    else:
        X_encoded = encoder.fit_transform(X)
        X_encoded = pd.DataFrame(X_encoded.toarray(), columns=encoder.named_transformers_['cat'].get_feature_names_out(columns_considered))

    # Split the data
    X_train, X_test_encoded, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)
    X_train_original, X_test_original, _, y_test_original = train_test_split(X, y, test_size=0.2, random_state=42)

    if 'y_test' not in results.columns:
        results = pd.DataFrame(X_test_original, columns=columns_considered)
        results['y_test'] = y_test
        results['y_test_original'] = y_test_original

    # Loop through the models
    for model_name, pipeline in pipelines.items():
        model = GridSearchCV(pipeline, hypergrid[model_name], cv=10, n_jobs=-1)

        # Fit the model
        print(f'Starting training for {model_name} with {encoding_name} encoding...')
        model.fit(X_train, y_train)
        print(f'{model_name} has been successfully fit with {encoding_name} encoding.')

        # Predict the results
        y_pred = model.predict(X_test_encoded)

        # Save the results to a CSV file
        # save_results(results, y_test, y_pred, model_name, encoding_name)
        # Print the results
        print(f'{model_name} scores with {encoding_name} encoding - R2: {r2_score(y_test, y_pred)}  MAE: {mean_absolute_error(y_test, y_pred)}')
        # temp_results = pd.DataFrame()
        results[f'{model_name}_{encoding_name}_y_pred'] = y_pred
        results[f'{model_name}_{encoding_name}_y_test'] = y_test

# Save the final results to a CSV file
results.to_csv('/content/drive/My Drive/6301.501/results.csv', index=False)

Starting training for rf with label encoding...
rf has been successfully fit with label encoding.
rf scores with label encoding - R2: 0.012336007504987756  MAE: 18.411957455284842
Starting training for gb with label encoding...
gb has been successfully fit with label encoding.
gb scores with label encoding - R2: 0.010191230675212215  MAE: 18.481882964834718
Starting training for rf with target encoding...
rf has been successfully fit with target encoding.
rf scores with target encoding - R2: 0.012496432373384936  MAE: 18.41017037569183
Starting training for gb with target encoding...
gb has been successfully fit with target encoding.
gb scores with target encoding - R2: 0.012713185834549834  MAE: 18.4236266924323
Starting training for rf with frequency encoding...
rf has been successfully fit with frequency encoding.
rf scores with frequency encoding - R2: 0.012313031656951967  MAE: 18.412178722217615
Starting training for gb with frequency encoding...
gb has been successfully fit with