## Model Training

#### 1.1 Import Data and Required Packages

##### Importing Pandas, NumPy, Matplotlib, Seaborn, and Scikit-learn

In [91]:
# Basic Import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Preprocessing and splitting
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
# Modelling
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB


##### Import the CSV Data as a Pandas DataFrame

In [92]:
# Read the penguin measurements and keep only rows without any missing value
df = pd.read_csv('data/penguins.csv').dropna(axis=0, how='any')

##### Show Top 5 Records

In [93]:
# Preview the first five cleaned records
df.head(n=5)

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male


##### Preparing X and Y

In [94]:
# Feature frame: every column except the 'species' target
X = df.drop(columns='species')

In [95]:
# Preview the first five feature rows
X.head(n=5)

Unnamed: 0,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Torgersen,39.3,20.6,190.0,3650.0,Male


In [96]:
# List the distinct values of each categorical feature
for column_name in ('island', 'sex'):
    print(f"Categories in '{column_name}' variable:     ", end=" ")
    print(df[column_name].unique())


Categories in 'island' variable:      ['Torgersen' 'Biscoe' 'Dream']
Categories in 'sex' variable:      ['Male' 'Female']


In [97]:
# Target vector: the species label of every row
y = df.loc[:, 'species']

In [98]:
# Display the cleaned frame (the rendered output is truncated to the first and last rows)
df

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female
5,Adelie,Torgersen,39.3,20.6,190.0,3650.0,Male
...,...,...,...,...,...,...,...
338,Gentoo,Biscoe,47.2,13.7,214.0,4925.0,Female
340,Gentoo,Biscoe,46.8,14.3,215.0,4850.0,Female
341,Gentoo,Biscoe,50.4,15.7,222.0,5750.0,Male
342,Gentoo,Biscoe,45.2,14.8,212.0,5200.0,Female


In [99]:
# Build a ColumnTransformer with two transformers: one-hot encoding for the
# object-dtype (categorical) columns and standard scaling for all other columns.
num_features = X.select_dtypes(exclude="object").columns
cat_features = X.select_dtypes(include="object").columns

numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

# The transformed matrix follows the order of this list: one-hot columns
# first, then the scaled numeric columns.
preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, cat_features),
        ("StandardScaler", numeric_transformer, num_features),
    ]
)

In [100]:
# Fit the preprocessor on the feature frame and replace X with the encoded/scaled matrix.
# NOTE(review): the fit uses every row and happens before the train/test split
# further down, so the scaler statistics also include rows that later land in the
# test set — consider fitting on the training rows only.
# NOTE(review): X is overwritten here, so re-running this cell on its own is
# presumably not possible without re-creating X first — TODO confirm.
X = preprocessor.fit_transform(X)

In [101]:
# Map the species names in y to integer class ids. The whole target vector is
# encoded here (not just a training subset); `label_encoder` is kept so that
# predictions can be mapped back with `label_encoder.inverse_transform`.
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [102]:
# Separate the dataset into train (75%) and test (25%) partitions; the fixed
# random_state keeps the split identical across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
X_train.shape, X_test.shape

((249, 9), (84, 9))

##### Create an Evaluate Function to Return All Metrics After Model Training

In [103]:
def evaluate_model(true, predicted):
    """Compute error metrics between ground-truth and predicted values.

    Args:
        true: array-like of ground-truth target values.
        predicted: array-like of predicted values, aligned with ``true``.

    Returns:
        tuple: ``(mae, rmse, r2_square)`` - mean absolute error, root mean
        squared error and R2 score, in that order.

    Note:
        These are regression metrics. When the inputs are label-encoded class
        ids (as in this notebook), the numeric distance between two ids carries
        no meaning, so treat the values as a rough agreement signal and prefer
        accuracy/F1 when comparing classifiers.
    """
    mae = mean_absolute_error(true, predicted)
    # Compute the MSE once and derive the RMSE from it instead of recomputing.
    mse = mean_squared_error(true, predicted)
    rmse = np.sqrt(mse)
    r2_square = r2_score(true, predicted)
    return mae, rmse, r2_square

In [105]:
# Candidate classifiers. The tree-based estimators are randomised, so they are
# seeded to keep the comparison reproducible across kernel restarts.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest Classifier": RandomForestClassifier(random_state=42),
    "K-Neighbors Classifier": KNeighborsClassifier(),
    "Extra Tree Classifier": ExtraTreeClassifier(random_state=42),
    "Gradient Boosting Classifier": GradientBoostingClassifier(random_state=42),
    "SVC": SVC(),
    "GaussianNB": GaussianNB(),
}

# Parallel lists consumed by the leaderboard cell: model name and test-set R2.
model_list = []
r2_list = []
for model_name, model in models.items():
    model.fit(X_train, y_train)  # Train model

    # Make predictions
    y_train_pred = model.predict(X_train)
    y_test_pred = model.predict(X_test)

    # Evaluate Train and Test dataset
    model_train_mae, model_train_rmse, model_train_r2 = evaluate_model(y_train, y_train_pred)
    model_test_mae, model_test_rmse, model_test_r2 = evaluate_model(y_test, y_test_pred)

    print(model_name)
    model_list.append(model_name)

    print('Model performance for Training set')
    print("- Root Mean Squared Error: {:.4f}".format(model_train_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_train_mae))
    print("- R2 Score: {:.4f}".format(model_train_r2))

    print('----------------------------------')

    print('Model performance for Test set')
    print("- Root Mean Squared Error: {:.4f}".format(model_test_rmse))
    print("- Mean Absolute Error: {:.4f}".format(model_test_mae))
    print("- R2 Score: {:.4f}".format(model_test_r2))
    r2_list.append(model_test_r2)

    print('='*35)
    print('\n')


Logistic Regression
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1091
- Mean Absolute Error: 0.0119
- R2 Score: 0.9852


Random Forest Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2 Score: 1.0000
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1091
- Mean Absolute Error: 0.0119
- R2 Score: 0.9852


K-Neighbors Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0634
- Mean Absolute Error: 0.0040
- R2 Score: 0.9949
----------------------------------
Model performance for Test set
- Root Mean Squared Error: 0.1543
- Mean Absolute Error: 0.0238
- R2 Score: 0.9704


Extra Tree Classifier
Model performance for Training set
- Root Mean Squared Error: 0.0000
- Mean Absolute Error: 0.0000
- R2

In [106]:
# Rank the models by their test-set R2 score, best first
pd.DataFrame(
    {'Model Name': model_list, 'R2_Score': r2_list}
).sort_values(by='R2_Score', ascending=False)

Unnamed: 0,Model Name,R2_Score
0,Logistic Regression,0.985198
1,Random Forest Classifier,0.985198
4,Gradient Boosting Classifier,0.985198
5,SVC,0.985198
2,K-Neighbors Classifier,0.970396
3,Extra Tree Classifier,0.955595
6,GaussianNB,0.289515
