In [97]:
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder
from tqdm import tqdm


In [74]:
# Read the combined dataset from the project-relative data folder.
combined_data = pd.read_csv(filepath_or_buffer="./data/combined_data.csv")

In [75]:
# Rows without a Grand Final placing receive a synthetic one: 27 plus that row's
# semifinal points.
# NOTE(review): the fill value grows with "Semifinal Points"; if a larger place
# means a worse result, stronger semifinal scores end up with worse placings —
# confirm this is intended.
missing_place_fill = combined_data["Semifinal Points"] + 27
combined_data["Grand Final Place"] = combined_data["Grand Final Place"].fillna(missing_place_fill)

In [None]:
# Peek at the first 15 column labels of the loaded frame.
combined_data.columns[0:15]

In [None]:
# Columns removed before modelling; everything else is kept as a model input.
# 'Country Name' and 'Language' are one-hot encoded separately further down.
non_feature_columns = [
    'Country Name',
    'Country Code',
    'Year',
    'Grand Final Place',
    "Song",
    "Artist",
    "Grand Final Points",
    "Semifinal",
    "Semifinal Points",
    "Semifinal Place",
    "Language",
    "index",
    "Use of insecticide-treated bed nets (% of under-5 population)",
]
numeric_features = combined_data.drop(columns=non_feature_columns)

# Regression target.
target = combined_data['Grand Final Place']

# Random 10-row preview of the remaining inputs (unseeded, so it varies per run).
numeric_features.sample(10)

In [78]:
# The two categorical columns that get one-hot encoded.
categorical_features = combined_data.loc[:, ['Country Name', 'Language']]

In [79]:
# Dense one-hot encoding of the categorical columns; the encoder is fitted on the
# full dataset.
encoder = OneHotEncoder(sparse_output=False)
encoder.fit(categorical_features)
encoded_categorical_features = encoder.transform(categorical_features)

In [None]:
# (rows, one-hot columns) of the encoded array.
np.shape(encoded_categorical_features)

In [None]:
# Column labels generated by the encoder for each (input column, category) pair.
feature_names = encoder.get_feature_names_out(input_features=categorical_features.columns)

# Number of one-hot columns produced.
len(feature_names)

In [None]:
# Wrap the encoded array in a labelled frame (default RangeIndex).
encoded_categorical_df = pd.DataFrame(
    data=encoded_categorical_features,
    columns=feature_names,
)
encoded_categorical_df.sample(10)

In [None]:
# Append the one-hot columns to the model inputs, side by side.
# NOTE: this rebinds numeric_features from itself, so re-running the cell appends
# the encoded columns again — run it exactly once per kernel session.
numeric_features = pd.concat(
    objs=[numeric_features, encoded_categorical_df],
    axis="columns",
)
numeric_features.sample(10)



In [84]:
# Replace missing values with each column's median. The imputer is fitted on the
# whole frame, which at this point also includes the one-hot columns.
imputer = SimpleImputer(strategy='median')
imputed_values = imputer.fit_transform(numeric_features)
numeric_features = pd.DataFrame(imputed_values, columns=numeric_features.columns)

In [85]:
# Sanity check after imputation: list every column that still holds NaNs.
# Prints nothing when imputation covered everything.
columns_with_nans = [
    col for col in numeric_features.columns
    if numeric_features[col].isna().sum() != 0
]
for col in columns_with_nans:
    print(col)

In [86]:
# numeric_features already carries the one-hot columns (they were concatenated in
# before imputation), so appending encoded_categorical_df again would give every
# encoded column twice. Duplicate labels are then passed to the model as repeated
# features and make label-based selection such as final_data[top_features] return
# repeated columns.
final_features = pd.concat([numeric_features, encoded_categorical_df], axis=1)
# Keep only the first occurrence of each label — the copy that went through the imputer.
final_features = final_features.loc[:, ~final_features.columns.duplicated()]
assert final_features.columns.is_unique, "duplicate feature columns remain"

# Attach the target so correlations against it can be computed on a single frame.
final_data = pd.concat([final_features, combined_data['Grand Final Place'].reset_index(drop=True)], axis=1)


In [87]:
# Pairwise Pearson correlation between every feature and the target.
correlation_matrix = final_data.corr(method="pearson")

In [88]:
# Correlation of each column with the target.
target_correlation = correlation_matrix.loc[:, 'Grand Final Place']

In [89]:
# Rank by strength of association regardless of sign, strongest first.
target_correlation_sorted = (
    target_correlation
    .abs()
    .sort_values(ascending=False)
)

In [None]:
# Take the 20 labels ranked directly after the first entry.
# Assumes position 0 is the target's correlation with itself, so it is skipped.
top_features = target_correlation_sorted.iloc[1:21].index
print("Top features correlated with the target variable:\n", target_correlation_sorted.head(21))

In [None]:
# Restrict the data to the selected top-correlated columns.
final_features_top = final_data.loc[:, top_features]

# NOTE(review): this previews final_features (all inputs), not the
# final_features_top frame built just above — confirm which one was meant.
final_features.sample(20)


In [103]:
# Target column taken from the combined feature+target frame.
# NOTE(review): not referenced by the later cells visible here — verify it is needed.
target_top = final_data.loc[:, 'Grand Final Place']

In [104]:
# Hold out 20% of the rows for evaluation; the seed is fixed so the split is
# reproducible. Uses the full feature set (final_features).
split_parts = train_test_split(
    final_features,
    target,
    test_size=0.2,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


In [None]:
# Baseline: random forest with default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=42)
model.fit(X=X_train, y=y_train)

In [None]:
# Score the baseline on the held-out rows; RMSE is in the same units as the target.
y_pred = model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)
print(f'Root Mean Squared Error: {rmse}')

In [None]:
# Hyperparameter grid: 3 * 4 * 3 * 3 = 108 combinations, each evaluated with
# 3-fold CV (324 fits) on the training split only.
param_grid = dict(
    n_estimators=[100, 200, 300],
    max_depth=[None, 10, 20, 30],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

rf = RandomForestRegressor(random_state=42)
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    scoring='neg_mean_squared_error',  # higher is better, hence the negated MSE
    cv=3,
    n_jobs=-1,                         # use all available cores
    verbose=2,
)
grid_search.fit(X_train, y_train)

In [108]:
# Winning hyperparameter combination from the search.
best_params = grid_search.best_params_

# best_score_ is a negated MSE (scoring='neg_mean_squared_error'); flip the sign
# before taking the root to report it as an RMSE.
best_score = np.sqrt(-1 * grid_search.best_score_)

In [109]:
# Evaluate the estimator selected by the grid search on the held-out rows.
# This overwrites y_pred / mse / rmse from the baseline cell.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mse)


In [110]:
# 5-fold cross-validation over the full dataset (this includes the rows held out
# as the test split above).
# The original call referenced `X` and `y`, which are never defined in this
# notebook and only worked through leftover kernel state; the full feature matrix
# and target are passed explicitly instead.
# NOTE(review): `rf` is the untuned estimator handed to GridSearchCV; pass
# `best_model` instead if the tuned configuration should be cross-validated.
cv_scores = cross_val_score(rf, final_features, target, cv=5, scoring='neg_mean_squared_error')

# Scores are negated MSE values; convert each fold's score to an RMSE.
cv_rmse_scores = np.sqrt(-cv_scores)


In [None]:
# Human-readable summary of the tuning and evaluation results, one metric per line.
summary_lines = [
    f'Best Parameters: {best_params}',
    f'Best RMSE via GridSearchCV: {best_score}',
    f'Root Mean Squared Error with Best Model: {rmse}',
    f'Cross-Validation RMSE scores: {cv_rmse_scores}',
    f'Average Cross-Validation RMSE: {cv_rmse_scores.mean()}',
]
print("\n".join(summary_lines))

In [112]:

# Append this run's metrics to a plain-text log.
log_path = Path("logs/model_outputs.txt")
# Create the log directory on first use so a fresh checkout does not fail here.
log_path.parent.mkdir(parents=True, exist_ok=True)
with log_path.open("a", encoding="UTF-8") as f:
    # The record ends with a blank line; without a trailing newline each appended
    # run would start on the same line as the previous run's last metric.
    f.write(f"""Run at {datetime.now()}:
Best Parameters: {best_params}
Best RMSE via GridSearchCV: {best_score}
Root Mean Squared Error with Best Model: {rmse}
Cross-Validation RMSE scores: {cv_rmse_scores}
Average Cross-Validation RMSE: {cv_rmse_scores.mean()}

""")
