In [1]:
import pandas as pd
import json
import numpy as np

import joblib

pd.set_option('display.max_colwidth',  None)

with open('result.json') as f:
    data = json.load(f)

df = pd.DataFrame(data)

df.fillna(0, inplace=True)
df = df.apply(lambda series: pd.to_numeric(series, errors='coerce'))
df = df.dropna()

In [17]:
y = df.iloc[:, -4]
y

0      24.400000
1      27.685325
2      31.228669
3      35.812133
4      27.685325
         ...    
361    66.393321
365    60.134188
366    60.134188
367    60.134188
368    60.134188
Name: AHEW, Length: 361, dtype: float64

In [18]:
X = df.iloc[:, -3:]
y = df.iloc[:, -4]

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


ss = StandardScaler().fit(X_train)

X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

In [11]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
import numpy as np

# Create a RandomForestRegressor object
rf_model = RandomForestRegressor()

# Train the model
rf_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = rf_model.predict(X_train)

# Evaluate the model on the training set
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)

print("Training set:")
print("RMSE:", train_rmse)
print("R2 Score:", train_r2)
print("MAE:", train_mae)

# Make predictions on the test set
y_test_pred = rf_model.predict(X_test)

# Evaluate the model on the test set
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Test set:")
print("RMSE:", test_rmse)
print("R2 Score:", test_r2)
print("MAE:", test_mae)


Training set:
RMSE: 2.2792847318597316
R2 Score: 0.940926518973459
MAE: 1.19375
Test set:
RMSE: 4.450642557887814
R2 Score: 0.7755994897959183
MAE: 2.3068493150684937


In [8]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.ensemble import GradientBoostingRegressor
import numpy as np

# Create a RandomForestRegressor object
gb_model = GradientBoostingRegressor()

# Train the model
gb_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = gb_model.predict(X_train)

# Evaluate the model on the training set
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)

print("Training set:")
print("RMSE:", train_rmse)
print("R2 Score:", train_r2)
print("MAE:", train_mae)

# Make predictions on the test set
y_test_pred = gb_model.predict(X_test)

# Evaluate the model on the test set
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Test set:")
print("RMSE:", test_rmse)
print("R2 Score:", test_r2)
print("MAE:", test_mae)

Training set:
RMSE: 2.8377778315055435
R2 Score: 0.908430217195037
MAE: 1.678718462032739
Test set:
RMSE: 4.763459971250913
R2 Score: 0.7429465439330458
MAE: 2.7814522616610975


In [12]:
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LinearRegression
import numpy as np

# Create a RandomForestRegressor object
l_model = LinearRegression()

# Train the model
l_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = l_model.predict(X_train)

# Evaluate the model on the training set
train_rmse = np.sqrt(mean_squared_error(y_train, y_train_pred))
train_r2 = r2_score(y_train, y_train_pred)
train_mae = mean_absolute_error(y_train, y_train_pred)

print("Training set:")
print("RMSE:", train_rmse)
print("R2 Score:", train_r2)
print("MAE:", train_mae)

# Make predictions on the test set
y_test_pred = l_model.predict(X_test)

# Evaluate the model on the test set
test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
test_r2 = r2_score(y_test, y_test_pred)
test_mae = mean_absolute_error(y_test, y_test_pred)

print("Test set:")
print("RMSE:", test_rmse)
print("R2 Score:", test_r2)
print("MAE:", test_mae)

Training set:
RMSE: 8.014682990769607
R2 Score: 0.26958766482543506
MAE: 6.582505660000943
Test set:
RMSE: 7.371890809336129
R2 Score: 0.3843467236302225
MAE: 6.267692867796305


# Models

In [19]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Define the models
models = {
    "linear": LinearRegression(),
    "ridge": Ridge(),
    "lasso": Lasso(),
    "elasticNet": ElasticNet(),
    "svr": SVR(),
    "decisionTree": DecisionTreeRegressor(),
    "knn": KNeighborsRegressor(),
    "mlp": MLPRegressor(max_iter=1000),  # increased max_iter for convergence 
    "polynomial": make_pipeline(PolynomialFeatures(2), LinearRegression()),
    "randomForest": RandomForestRegressor(),
    "gradientBoosting": GradientBoostingRegressor()
}

# Train and evaluate each model
results = {}  # store the test scores
for name, model in models.items():
    model.fit(X_train, y_train)
    train_score = model.score(X_train, y_train)
    test_score = model.score(X_test, y_test)
    results[name] = test_score

# Sort the results and print the best three models
sorted_results = sorted(results.items(), key=lambda item: item[1], reverse=True)
for name, test_score in sorted_results[:3]:
    print(f"Model: {name}")
    print(f"Test score: {test_score}\n")





Model: svr
Test score: -0.03304942287594881

Model: decisionTree
Test score: -0.037659346717045405

Model: elasticNet
Test score: -0.11699903361792319



In [15]:
from sklearn.model_selection import GridSearchCV

# Define the models
models = {
    "randomForest": {
        "model": RandomForestRegressor(),
        "params": {
            'n_estimators': [100, 200, 300],
            'max_depth': [None, 5, 10],
            'min_samples_split': [2, 5, 10],
        }
    },
    "gradientBoosting": {
        "model": GradientBoostingRegressor(),
        "params": {
            'n_estimators': [100, 200, 300],
            'learning_rate': [0.01, 0.1, 0.2],
            'max_depth': [3, 5, 7],
        }
    },
    "mlp": {
        "model": MLPRegressor(max_iter=1000),
        "params": {
            'hidden_layer_sizes': [(50,), (100,)],
            'activation': ['relu', 'tanh'],
            'learning_rate': ['constant', 'invscaling'],
        }
    },
}

# Train and evaluate each model
best_models = []
for name, model in models.items():
    grid_search = GridSearchCV(model["model"], model["params"], cv=5)
    grid_search.fit(X_train, y_train)
    best_models.append((name, grid_search.best_score_, grid_search.best_estimator_))

# Sort the results and print the best three models
best_models.sort(key=lambda x: x[1], reverse=True)
for name, score, model in best_models[:3]:
    print(f"Model: {name}")
    print(f"Best score: {score}")
    print(f"Best parameters: {model.get_params()}\n")




Model: randomForest
Best score: 0.6144004308340165
Best parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'criterion': 'squared_error', 'max_depth': 5, 'max_features': 1.0, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_jobs': None, 'oob_score': False, 'random_state': None, 'verbose': 0, 'warm_start': False}

Model: gradientBoosting
Best score: 0.596568425613024
Best parameters: {'alpha': 0.9, 'ccp_alpha': 0.0, 'criterion': 'friedman_mse', 'init': None, 'learning_rate': 0.01, 'loss': 'squared_error', 'max_depth': 3, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'n_estimators': 300, 'n_iter_no_change': None, 'random_state': None, 'subsample': 1.0, 'tol': 0.0001, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}

Model: mlp
Best score: 0

