In [2]:
import pandas as pd  # type: ignore
import matplotlib.pyplot as plt  # type: ignore
import numpy as np  # type: ignore

In [8]:
df = pd.read_csv('./../processed_data/filtered_data_with_classes.csv')
print(df.head(1))

   temperature_celsius  condition_text  wind_kph  wind_degree  pressure_mb  \
0              0.46875               0       0.0     0.016854     0.470588   

   precip_mm  humidity  cloud  feels_like_celsius  visibility_km  uv_index  \
0   0.229592       1.0   0.75            0.371257       0.308176       0.0   

   gust_kph  air_quality_us-epa-index  sunrise_num  sunset_num  
0  0.187697                       0.0     0.302469    0.527638  


In [9]:
df.head(5)

Unnamed: 0,temperature_celsius,condition_text,wind_kph,wind_degree,pressure_mb,precip_mm,humidity,cloud,feels_like_celsius,visibility_km,uv_index,gust_kph,air_quality_us-epa-index,sunrise_num,sunset_num
0,0.46875,0,0.0,0.016854,0.470588,0.229592,1.0,0.75,0.371257,0.308176,0.0,0.187697,0.0,0.302469,0.527638
1,0.375,1,0.045126,0.410112,0.676471,0.0,0.875,0.0,0.311377,0.308176,0.0,0.165615,0.0,0.308642,0.517588
2,0.5,2,0.00722,0.859551,0.588235,0.0,1.0,0.5,0.391218,0.308176,0.0,0.227129,0.2,0.320988,0.492462
3,0.4375,1,0.0,0.168539,0.647059,0.0,0.875,0.0,0.351297,0.308176,0.0,0.119874,0.4,0.395062,0.39196
4,0.4375,2,0.0,0.22191,0.441176,0.0,0.71875,0.25,0.351297,0.308176,0.0,0.119874,0.0,0.376543,0.537688


In [12]:
from sklearn.model_selection import train_test_split # type: ignore
from sklearn.ensemble import RandomForestClassifier  # type: ignore
from sklearn.metrics import accuracy_score  # type: ignore
from sklearn.preprocessing import LabelEncoder  # type: ignore

In [13]:
# Encoding categorical variables (if 'condition_text' is categorical)
label_encoder = LabelEncoder()
df['condition_text'] = label_encoder.fit_transform(df['condition_text'])

# Separating features and target variable
X = df.drop('condition_text', axis=1)  # Features
y = df['condition_text']  # Target

# Step 2: Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42)

In [14]:
# Step 3: Create and train the model
model = RandomForestClassifier(n_estimators=100, random_state=42, )
model.fit(X_train, y_train)

# Step 4: Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

Accuracy: 0.72


In [15]:
params = model.get_params()
print(params)

{'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 'sqrt', 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'monotonic_cst': None, 'n_estimators': 100, 'n_jobs': None, 'oob_score': False, 'random_state': 42, 'verbose': 0, 'warm_start': False}


In [17]:
from sklearn.model_selection import GridSearchCV # type: ignore

# the parameter grid
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8]
}

# Initialize the GridSearchCV object
grid_search = GridSearchCV(estimator=RandomForestClassifier(
    random_state=42), param_grid=param_grid, cv=5, n_jobs=-1, verbose=3, scoring='accuracy')

# Fit the model
grid_search.fit(X_train, y_train)

# Best parameters
print("Best parameters:", grid_search.best_params_)

# Evaluate the best model
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Improved Accuracy: {accuracy:.4f}')

Fitting 5 folds for each of 192 candidates, totalling 960 fits
Best parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Improved Accuracy: 0.69


In [18]:
# Accessing the cv_results_
cv_results = grid_search.cv_results_
# Sorting the combinations based on mean test score in descending order
sorted_indices = cv_results['mean_test_score'].argsort()[::-1]

# Iterating through the top 5 combinations and printing their details
for rank, index in enumerate(sorted_indices[:5], start=1):
    print(f"Rank {rank}: Combination {index + 1}")
    print(f"Parameters: {cv_results['params'][index]}")
    print(f"Mean Test Score (Accuracy): {cv_results['mean_test_score'][index]:.4f}\n")

Rank 1: Combination 105
Parameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Mean Test Score (Accuracy): 0.7155

Rank 2: Combination 9
Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Mean Test Score (Accuracy): 0.7155

Rank 3: Combination 153
Parameters: {'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Mean Test Score (Accuracy): 0.7155

Rank 4: Combination 57
Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 200}
Mean Test Score (Accuracy): 0.7129

Rank 5: Combination 53
Parameters: {'max_depth': 10, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Mean Test Score (Accuracy): 0.7129



In [None]:
model = RandomForestClassifier(n_estimators=200, max_depth=20, min_samples_split=10,
                               min_samples_leaf=1, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')