### Linear Regression on Finals


#### Install Dependencies

In [None]:
# Optional dependency install, kept commented out so a normal run does not touch the environment.
# NOTE(review): no cell in this notebook imports this package — presumably it is needed by a
# companion Streamlit app; confirm before removing.
#!pip install streamlit-option-menu

In [None]:
# Optional dependency install, kept commented out so a normal run does not touch the environment.
# NOTE(review): no cell in this notebook imports this package — presumably it is needed by a
# companion Streamlit app; confirm before removing.
#!pip install streamlit_folium

#### Import Required Libraries

In [None]:
# import
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import sklearn.metrics as sm
from sklearn.metrics import r2_score

#### Load Cleaned Dataset

In [None]:
# Location of the cleaned finalists table, relative to the notebook's working directory.
DATA_PATH = 'Data/finalists_cleaned.csv'

# Load the cleaned dataset into a DataFrame used by every later cell.
finalists_clean_df = pd.read_csv(DATA_PATH)

#### Quick Look at the Data
Preview the first few rows to verify that the dataset has been loaded correctly

In [None]:
# Show the first five rows as the cell's output value. Leaving the expression bare
# (instead of wrapping it in print) lets the notebook render the DataFrame as a
# formatted table rather than as wrapped plain text.
finalists_clean_df.head()

#### Dataset Dimensions
Check the shape of the dataset to understand how many rows and columns it contains.

In [None]:
# (number of rows, number of columns) of the loaded frame
finalists_clean_df.shape

#### Data Types 
Get a concise summary of the dataset

In [None]:
# Prints each column's name, non-null count and dtype, plus memory usage
finalists_clean_df.info()

#### Define Features and Target Variable
Set the prediction target (final_place) and drop non-numeric and non-informative columns from the features.

In [None]:
# Name of the column the model is trained to predict.
TARGET_COLUMN = 'final_place'

# Columns excluded from the feature matrix: the target itself, plus
# 'country', 'style', 'year' and 'final_total_points'.
EXCLUDED_COLUMNS = [TARGET_COLUMN, 'country', 'style', 'year', 'final_total_points']

# Target vector and feature matrix.
y = finalists_clean_df[TARGET_COLUMN]
X = finalists_clean_df.drop(columns=EXCLUDED_COLUMNS)

# Every column that survives the drop is treated as a numeric feature.
numeric_features = X.columns.to_list()


In [None]:
# Peek at the first target values to confirm the intended column was selected
y.head()

#### Split the Dataset
Split the dataset into training and test sets using a 74%/26% ratio to train and evaluate the model.

In [None]:
# Hold out 26% of the rows for testing (74% train); the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
split_parts = train_test_split(
    X,
    y,
    test_size=0.26,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts


Print the shape of each split to ensure the dataset was divided correctly.

In [None]:
# Print the shapes in the order: train features, train target, test features, test target.
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

#### Build Modeling Pipeline
Create a Scikit-learn Pipeline that includes mean imputation for missing values, feature standardization, and a linear regression model.



In [None]:
# Steps run in this order on fit/predict: fill missing values with the column
# mean, standardize each feature, then fit the linear regression.
pipeline_steps = [
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression()),
]

model = Pipeline(steps=pipeline_steps)

#### Train the Model
Fit the linear regression pipeline on the training data.

In [None]:
# Fit the whole pipeline (imputer, scaler, regressor) on the training split only.
# No prediction happens here; this cell just trains the model.
model.fit(X_train, y_train)


In [None]:
# Pull the fitted LinearRegression step out of the pipeline by its step name.
# It was trained on the output of the imputer and scaler steps, so its
# intercept and coefficients refer to the standardized features.
reg = model.named_steps['regressor']


#### Inspect Model Parameters
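Extract and print the model’s intercept and coefficients to understand how each feature contributes to the prediction. Because the pipeline standardizes the features before the regression, each coefficient is the change in predicted final place per one standard deviation of that feature.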

In [None]:
# Raw parameter values of the fitted regressor.
print("Intercept:", reg.intercept_)
print("Coefficients:", reg.coef_)

# Pair each training column with its coefficient and print one aligned row per
# feature: name padded to 30 characters, coefficient right-aligned with 4 decimals.
feature_names = X_train.columns
coef_table = list(zip(feature_names, reg.coef_))
for row in coef_table:
    name, coef = row
    print(f"{name:30} {coef:>10.4f}")


#### Make Predictions
Use the trained model to predict final placements on the test set.

In [None]:
# Predicted final places for the held-out test rows; the pipeline applies its
# imputer and scaler steps to X_test before the regressor predicts.
y_pred = model.predict(X_test)

#### Evaluate Model Performance
Import a custom evaluation function and print metrics like R², MAE, and RMSE to assess how well the model performs.

In [None]:
import os
import sys

# A notebook has no __file__, so the project root is taken to be the current
# working directory.
current_directory = os.getcwd()

# Make the project's 'Modules' folder importable. The membership check keeps
# sys.path from accumulating a duplicate entry every time this cell is re-run.
modules_directory = os.path.join(current_directory, 'Modules')
if modules_directory not in sys.path:
    sys.path.append(modules_directory)

# Project-local evaluation helper (resolved through the path added above).
from machine_learning.evaluate import evaluate_model

# evaluate_model is called with the fitted pipeline and the held-out split. Its
# result is consumed below as a mapping of metric name -> value, where the
# 'predictions' entry is a sliceable sequence of numbers.
evaluation_results = evaluate_model(model, X_test, y_test)

# Print every scalar metric with four decimals; for the predictions entry show
# only the first 10 values so the output stays short.
print("Model Evaluation Results:")
print("----------------------------")
for metric, value in evaluation_results.items():
    if metric != 'predictions':
        print(f"{metric}: {value:.4f}")
    else:
        print("\nPredictions (showing first 10 for brevity):")
        print(", ".join([f"{v:.2f}" for v in value[:10]]))

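The model explains about 74.5% of the variance in final placement (R² ≈ 0.745). After removing some columns and tweaking the test size, this was the highest R² we found; the final split is 74% training / 26% test. Note that choosing the split ratio based on the test-set R² can make this score somewhat optimistic.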

In [None]:
# Display the predicted values for the test set as the cell's output
y_pred

#### Visualize Predictions
Plot a scatter plot comparing predicted and actual final placements to evaluate model accuracy visually.

In [None]:
# Predicted vs. actual final place for the test rows.
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', alpha=0.6)

# Dashed red diagonal (y = x) spanning the observed range of actual places;
# a point on this line is a prediction that matches the actual value exactly.
place_range = [y_test.min(), y_test.max()]
ax.plot(place_range, place_range, 'r--')

ax.set_xlabel('Actual Final Place')
ax.set_ylabel('Predicted Final Place')
ax.set_title('Predicted vs Actual Final Place')
ax.grid(True)
plt.show()

#### Compare Specific Prediction
Display a single value from y_test and its corresponding prediction to see the model’s precision on a specific sample.

In [None]:
# Actual final place of the first test row (selected by position, not by index label)
y_test.iloc[0]

In [None]:
# Model's prediction at position 0, i.e. for the first row of X_test
y_pred[0]

## Store model

In [None]:
import joblib

In [None]:
# Destination file for the serialized pipeline, relative to the working directory
model_file = 'Models/finalistfit.pkl'

In [None]:
import os

# Create the destination folder if it is missing: joblib.dump opens the target
# file for writing and fails with FileNotFoundError when the parent directory
# does not exist (e.g. on a fresh checkout without a 'Models' folder).
model_directory = os.path.dirname(model_file)
if model_directory:
    os.makedirs(model_directory, exist_ok=True)

# Persist the whole fitted pipeline (imputer, scaler and regressor) so that it
# can be reloaded later with joblib.load and used directly on raw feature columns.
joblib.dump(model, model_file)