<div style="display: flex; align-items: center; justify-content: center; flex-wrap: wrap;">
    <div style="flex: 1; max-width: 400px; display: flex; justify-content: center;">
        <img src="https://i.ibb.co/JBPWVYR/Logo-Nova-IMS-Black.png" style="max-width: 50%; height: auto; margin-top: 50px; margin-bottom: 50px;margin-left: 3rem;">
    </div>
    <div style="flex: 2; text-align: center; margin-top: 20px;">
        <div style="font-size: 28px; font-weight: bold; line-height: 1.2;">
            <span style="color: #08306B;">ML Project |</span> <span style="color: #08529C;">To Grant or Not to Grant</span>
        </div>
        <div style="font-size: 17px; font-weight: bold; margin-top: 10px;">
            Fall Semester | 2024 - 2025
        </div>
        <div style="font-size: 17px; font-weight: bold;">
            Master in Data Science and Advanced Analytics
        </div>
        <div style="margin-top: 20px;">
            <div>André Silvestre, 20240502</div>
            <div>João Henriques, 20240499</div>
            <div>Simone Genovese, 20241459</div>
            <div>Steven Carlson, 20240554</div>
            <div>Vinícius Pinto, 20211682</div>
            <div>Zofia Wojcik, 20240654</div>
        </div>
        <div style="margin-top: 20px; font-weight: bold;">
            TBL Group 33
        </div>
    </div>
</div>

## 📚 Libraries Import


In [1]:
# For data
import pandas as pd
import numpy as np
import os

# For plotting
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.ticker as mtick
import matplotlib.lines as mlines
import missingno as msno

# For data preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

# For modeling
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold, learning_curve


# For evaluation
from collections import Counter
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_auc_score, roc_curve, auc
from sklearn.model_selection import cross_val_score


# Pandas display options
pd.set_option('display.max_columns', None) # display all columns

# for better resolution plots
%config InlineBackend.figure_format = 'retina' # optionally, you can change 'retina' to 'svg'

# Setting seaborn style
sns.set_theme(style="white")

## 🧮 Import Databases

In [2]:
# Importing the dataset
# train_data = pd.read_parquet('data/')
# test_data = pd.read_csv('data/test_data_1.csv')

In [3]:
# Display the first 2 rows of each dataset
# train_data.head(2) 

In [4]:
# test_data.head(2)

# <a class='anchor' id='3'></a>
<br>
<style>
@import url('https://fonts.cdnfonts.com/css/avenir-next-lt-pro?styles=29974');
</style>

<div style="background: linear-gradient(to right,#08529C, #08306B); 
            padding: 10px; color: white; border-radius: 300px; text-align: center;">
    <center><h1 style="margin-left: 140px;margin-top: 10px; margin-bottom: 4px; color: white;
                       font-size: 32px; font-family: 'Avenir Next LT Pro', sans-serif;">
        <b>3 | Modeling & Evaluation</b></h1></center>
</div>

<br><br>

#### **📈📉 Data Normalization**

In [None]:
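# # For the 'KNN' and 'Logistic Regression' models, let's normalize the variables.
# # The scaler is fitted on the training set only and then reused for validation and test,
# # so all splits share the same scale and no validation/test information leaks into training.
# from sklearn.preprocessing import MinMaxScaler
# 
# scaler = MinMaxScaler().fit(X_train)
# X_train_norm = scaler.transform(X_train)
# X_validation_norm = scaler.transform(X_validation)
# X_test_norm = scaler.transform(X_test)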

# **💡 Modeling**

In [4]:
# Function that will print the results of the classification report and the confusion matrix for both datasets (train and validation)
def classification_metrics(y_train, pred_train , y_val, pred_val):
    """Print classification reports and plot confusion matrices for train and validation.

    A text classification report is printed for each split, then the two
    confusion matrices are drawn side by side as annotated heatmaps.

    Parameters
    ----------
    y_train : array-like
        True labels of the training split.
    pred_train : array-like
        Predicted labels for the training split.
    y_val : array-like
        True labels of the validation split.
    pred_val : array-like
        Predicted labels for the validation split.

    Returns
    -------
    None
        Output is printed and shown with ``plt.show()``.
    """
    def _labels_of(*arrays):
        # Sorted unique labels found in any of the given arrays.
        return np.unique(np.concatenate([np.asarray(a).ravel() for a in arrays]))

    # Each report lists every class seen in that split's true OR predicted labels,
    # so `target_names` always has the same length as the labels being reported.
    train_labels = _labels_of(y_train, pred_train)
    val_labels = _labels_of(y_val, pred_val)

    # Both heatmaps share one label set so their axes are comparable and no class
    # that appears in only one split is dropped from the matrix.
    all_labels = _labels_of(train_labels, val_labels)

    print('___________________________________________________________________________________________________________')
    print('                                                     TRAIN                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_train, pred_train, labels=train_labels,
                                target_names=[str(i) for i in train_labels]))

    print('___________________________________________________________________________________________________________')
    print('                                                VALIDATION                                                 ')
    print('-----------------------------------------------------------------------------------------------------------')
    print(classification_report(y_val, pred_val, labels=val_labels,
                                target_names=[str(i) for i in val_labels]))

    # Confusion Matrix for Train and Validation side by side
    fig, ax = plt.subplots(1, 2, figsize=(12, 4))
    panels = (('Train', y_train, pred_train), ('Validation', y_val, pred_val))
    for axis, (split_name, y_true, y_pred) in zip(ax, panels):
        sns.heatmap(confusion_matrix(y_true, y_pred, labels=all_labels),
                    annot=True, fmt='d', cmap='Blues', cbar=False,
                    xticklabels=all_labels, yticklabels=all_labels,
                    ax=axis)
        axis.set_title(f'Confusion matrix [{split_name}]', fontsize=14, fontweight='bold')
        axis.set_xlabel('\nPredicted', fontsize=8, fontweight='bold')
        # Keep the class labels horizontal. `set_yticklabels(rotation=0)` cannot be
        # used here because it requires the labels as its first argument.
        axis.tick_params(axis='y', labelrotation=0)
        axis.set_ylabel('True\n', fontsize=8, fontweight='bold')
    plt.show()

### **Logistic Regression[<sup>[1]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.linear_model.LogisticRegression.html)**

### **Naive Bayes[<sup>[2]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.naive_bayes.GaussianNB.html)**

In [None]:
# # Naive Bayes
# nb = GaussianNB()
# cv = cross_val_score(nb,X_train_norm,y_train,cv=5)

# # Train the model
# nb.fit(X_train_norm, y_train)
# 
# # Test the model
# predicted_prob = nb.predict_proba(X_test_norm)[:,1]
# predicted = nb.predict(X_test_norm)
# y_proba = nb.predict_proba(X_test_norm)
# 
# # Accuracy
# accuracy = metrics.accuracy_score(y_test, predicted)
# print("Accuracy (overall correct predictions):",  round(accuracy,2))
# 
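# # AUC (stored as `auc_score` so it does not shadow the `auc` function imported from sklearn.metrics)
# auc_score = metrics.roc_auc_score(y_test, predicted_prob)
# print("AUC:", round(auc_score,2))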

### **K-Nearest Neighbors (KNN)[<sup>[3]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)**

In [None]:
# # KNN | K-Nearest Neighbors
# knn = KNeighborsClassifier()
# cv = cross_val_score(knn,X_train_norm,y_train,cv=5)


### **Decision Tree[<sup>[4]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.tree.DecisionTreeClassifier.html)**

### **Random Forest[<sup>[5]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.RandomForestClassifier.html)**

### **Support Vector Machine (SVM)[<sup>[6]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.svm.SVC.html)**

### **Gradient Boosting[<sup>[7]</sup>](https://scikit-learn.org/1.5/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)**

### **XGBoost[<sup>[8]</sup>](https://xgboost.readthedocs.io/en/latest/python/python_api.html)**

---

### <a class='anchor' id='3_1'></a> **🧪 Model Selection**

---

### <a class='anchor' id='3_2'></a> **📏 Model Evaluation**

---

### <a class='anchor' id='3_3'></a> **📈 Model Optimization**

---

### 📊 Feature Importance

---

## 🔮 Test Data Prediction

---

## 📋 CSV Export - Submission

In [20]:
# Each Kaggle submission must have a unique file name that includes the version, in the following format
# – GroupXX_VersionXX.csv (for example, if Group 1 submits its 8th version, the file should be named Group01_Version08.csv)


# submission_data.to_csv('submissions/Group33_Version01.csv', index=False, header=True, sep=',')