In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Modeling & preprocessing import
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder,StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer,make_column_transformer,make_column_selector

# Model Tuning
from sklearn.model_selection import GridSearchCV

# For model evaluation (using Regression metrics)
from sklearn.metrics import mean_absolute_error,mean_squared_error, r2_score

path = '/content/IRIS.csv'
df = pd.read_csv(path)


In [4]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [5]:
df = df.dropna(subset = ['species'], how = 'all')

In [6]:
## Define X and y
target = 'species'

X = df.drop(columns=target).copy()
y = df[target].copy()
X.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [8]:
# Perfoming a train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)

# Defining a list of features for both the categorical and the numerical columns

cat_feature = make_column_selector(dtype_include='object')   # Creating a categorical data selector
num_feature = make_column_selector(dtype_include='number')   # Creating a numeric data selector

# Instantiating the Transformers

impute_cat = SimpleImputer(strategy='most_frequent') #better to keep the ffil but for simplicity reasons I will keep this one
impute_num = SimpleImputer(strategy='median') # we can be more sophisticated and make a mean imputation for widht like we did earlier, but I'll keep it simple for now
ohe_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')   # It is a numinal column with no order to it => So I used the One-hot encoding
scaler = StandardScaler()

# Making pipelines for each category

pip_cat = make_pipeline(impute_cat, ohe_encoder)
pip_num = make_pipeline(impute_num, scaler)

# Defining a tuple for each pathway

categorical_tup = ('Categorical', pip_cat, cat_feature)
numerical_tup = ('Numerical', pip_num, num_feature)

# Instantiating the ColumnTransformer

col_transformer = ColumnTransformer([numerical_tup, categorical_tup], verbose_feature_names_out=False)
col_transformer

In [9]:
# Classification model
from sklearn.linear_model import LogisticRegression

# Instantiate a linear regression model
logreg = LogisticRegression()
# Combine the preprocessing ColumnTransformer and the linear regression model in a Pipeline
model_pipe_LR = make_pipeline(col_transformer, logreg)

# Fit the modeling pipeline on the training data
model_pipe_LR.fit(X_train, y_train)

In [10]:
# Use the model to make predictions for training and testing data to evaluate later on

y_pred_train = model_pipe_LR.predict(X_train)

y_pred_test = model_pipe_LR.predict(X_test)

In [17]:
#evaluation functions

def eval_model(true, pred):
  MAE = mean_absolute_error(true, pred)
  MSE = mean_squared_error(true, pred)
  RMSE = np.sqrt(MSE)
  r2 = r2_score(true, pred)

  print(f'MAE: {MAE:,.2f} \n MSE: {MSE:,.2f} \n RMSE: {RMSE:,.2f} \n R2: {r2:,.2f} ')

In [18]:
print('Train Evaluation''\n')
eval_model(y_train, y_pred_train)
print('\n')

print('Test Evaluation''\n')
eval_model(y_test, y_pred_test)

Train Evaluation



ValueError: could not convert string to float: 'Iris-setosa'

In [None]:
# Random forest Classifier
from sklearn.linear_model import RandomForestClassifier

R_F_default = RandomForestClassifier(n_estimators = 100, max_depth = None)

# Combine the preprocessing ColumnTransformer and the Random Forst model in a Pipeline
model_pipe_RF = make_pipeline(col_transformer, R_F_default)

# Fit the modeling pipeline on the training data
model_pipe_RF.fit(X_train, y_train)

train_preds = model_pipe_RF.predict(X_train)
test_preds = model_pipe_RF.predict(X_test)

train_rf_score = r2_score(y_train, train_preds)
test_rf_score = r2_score(y_test, test_preds)

print(f'- maximum depth of the random forest tree: {max([estimator.get_depth() for estimator in R_F_default.estimators_])}')
print(f'- the number of trees in the foreste? {R_F_default.n_estimators}' '\n')

In [None]:
print('Train Evaluation''\n')
eval_model(y_train, train_preds)
print('\n')
print('Test Evaluation''\n')
eval_model(y_test, test_preds)