In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor

from sklearn.model_selection import GridSearchCV

In [2]:
df = pd.read_csv('mars-weather.csv')

# Drop
df = df.drop('wind_speed', axis=1)
df = df.drop('atmo_opacity', axis=1)
df = df.dropna()

In [3]:
df['terrestrial_date'] = pd.to_datetime(df['terrestrial_date'])
# Extract year, month and day
df['terrestrial_date_year'] = df['terrestrial_date'].dt.year
df['terrestrial_date_month'] = df['terrestrial_date'].dt.month
df['terrestrial_date_day'] = df['terrestrial_date'].dt.day
# Drop the original 'terrestrial_date' column
df = df.drop(['terrestrial_date'], axis=1)

# Convert 'month' from categorical to numerical
le = LabelEncoder()
df['month'] = le.fit_transform(df['month'])

features = df.drop(['max_temp'], axis=1)
# Define Target
target = df['max_temp']

In [4]:
df

Unnamed: 0,id,sol,ls,month,min_temp,max_temp,pressure,terrestrial_date_year,terrestrial_date_month,terrestrial_date_day
0,1895,1977,135,7,-77.0,-10.0,727.0,2018,2,27
1,1893,1976,135,7,-77.0,-10.0,728.0,2018,2,26
2,1894,1975,134,7,-76.0,-16.0,729.0,2018,2,25
3,1892,1974,134,7,-77.0,-13.0,729.0,2018,2,24
4,1889,1973,133,7,-78.0,-18.0,730.0,2018,2,23
...,...,...,...,...,...,...,...,...,...,...
1887,46,14,157,8,-74.0,-16.0,740.0,2012,8,20
1888,35,13,157,8,-74.0,-15.0,732.0,2012,8,19
1889,24,12,156,8,-76.0,-18.0,741.0,2012,8,18
1890,13,11,156,8,-76.0,-11.0,740.0,2012,8,17


In [5]:
# scaler = MinMaxScaler()
# features = scaler.fit_transform(features)

# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2)

In [6]:
# Random Forest and Decision Tree Models Hyperparameters tuning with GridSearchCV
random_forest_model = RandomForestRegressor()
decision_tree_model = DecisionTreeRegressor()

# Grid of hyperparameters for RandomForestRegressor
rf_param_grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 10, 15],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5, 10, 15],
}

# Apply GridSearchCV
rf_grid_search = GridSearchCV(random_forest_model, rf_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
rf_grid_search.fit(X_train, y_train)

# Get the optimal hyperparameters
rf_best_params = rf_grid_search.best_params_
print(f"Best parameters for Random Forest: {rf_best_params}")


# Grid of hyperparameters for DecisionTreeRegressor
dt_param_grid = {
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 5, 10, 15],
}

# Apply GridSearchCV
dt_grid_search = GridSearchCV(decision_tree_model, dt_param_grid, cv=5, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
dt_grid_search.fit(X_train, y_train)

# Get the optimal hyperparameters
dt_best_params = dt_grid_search.best_params_
print(f"Best parameters for Decision Tree: {dt_best_params}")

Fitting 5 folds for each of 180 candidates, totalling 900 fits
Best parameters for Random Forest: {'max_depth': 15, 'min_samples_leaf': 2, 'min_samples_split': 10, 'n_estimators': 100}
Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best parameters for Decision Tree: {'max_depth': 10, 'min_samples_leaf': 10, 'min_samples_split': 5}


In [7]:
# Random Forest and Decision Tree Models
random_forest_model = RandomForestRegressor(max_depth=10, min_samples_leaf=1, min_samples_split=2, n_estimators=500)
decision_tree_model = DecisionTreeRegressor(max_depth=10, min_samples_leaf=5, min_samples_split=15)

# Fit the model
random_forest_model.fit(X_train, y_train)
decision_tree_model.fit(X_train, y_train)

# Predict
rf_predictions_train = random_forest_model.predict(X_train)
rf_predictions_test = random_forest_model.predict(X_test)
dt_predictions_train = decision_tree_model.predict(X_train)
dt_predictions_test = decision_tree_model.predict(X_test)

# Evaluate
print("\nRandom Forest Regression Train:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_train, rf_predictions_train))
print("Mean Squared Error (MSE):", mean_squared_error(y_train, rf_predictions_train))
print("R2 Score:", r2_score(y_train, rf_predictions_train))
print("\nRandom Forest Regression Test:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, rf_predictions_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, rf_predictions_test))
print("R2 Score:", r2_score(y_test, rf_predictions_test))

print("\nDecision Tree Regression Train:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_train, dt_predictions_train))
print("Mean Squared Error (MSE):", mean_squared_error(y_train, dt_predictions_train))
print("R2 Score:", r2_score(y_train, dt_predictions_train))
print("\nDecision Tree Regression Test:")
print("Mean Absolute Error (MAE):", mean_absolute_error(y_test, dt_predictions_test))
print("Mean Squared Error (MSE):", mean_squared_error(y_test, dt_predictions_test))
print("R2 Score:", r2_score(y_test, dt_predictions_test))


Random Forest Regression Train:
Mean Absolute Error (MAE): 1.2698918626583102
Mean Squared Error (MSE): 2.6897748369734766
R2 Score: 0.9767048574642097

Random Forest Regression Test:
Mean Absolute Error (MAE): 2.3742112874244756
Mean Squared Error (MSE): 9.394280522624454
R2 Score: 0.9143372774933072

Decision Tree Regression Train:
Mean Absolute Error (MAE): 1.6788211864001092
Mean Squared Error (MSE): 4.9006496993063156
R2 Score: 0.9575572900400195

Decision Tree Regression Test:
Mean Absolute Error (MAE): 2.5898303939032905
Mean Squared Error (MSE): 10.708362867012971
R2 Score: 0.9023546811734295


In [8]:
# Get feature importances
importances = random_forest_model.feature_importances_

f_importances = pd.Series(importances, df.drop(['max_temp'], axis=1).columns)
f_importances = f_importances.sort_values(ascending=False)

print("Feature importance ranking by Random Forest Model:")
print(f_importances)

Feature importance ranking by Random Forest Model:
ls                        0.849736
pressure                  0.045580
month                     0.033178
sol                       0.020981
id                        0.018660
terrestrial_date_day      0.014389
min_temp                  0.012219
terrestrial_date_month    0.004405
terrestrial_date_year     0.000851
dtype: float64


In [9]:
# Get feature importances
importances = decision_tree_model.feature_importances_

f_importances = pd.Series(importances, df.drop(['max_temp'], axis=1).columns)
f_importances = f_importances.sort_values(ascending=False)

print("Feature importance ranking by Decision Tree Model:")
print(f_importances)

Feature importance ranking by Decision Tree Model:
ls                        0.857569
pressure                  0.059444
month                     0.033474
sol                       0.022941
id                        0.013870
terrestrial_date_day      0.008498
min_temp                  0.002815
terrestrial_date_month    0.001390
terrestrial_date_year     0.000000
dtype: float64
