# Data Preprocessing

## Import Libraries

In [2]:
# Define Numeriacl function
import numpy as np

# Define Dataframe Functions
import pandas as pd

# Define Data Vizulisation
import matplotlib.pyplot as plt
import seaborn as sns

# Define Model Building
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

## Importing DATA

In [4]:
df = pd.read_csv('./data/iris.csv')

## EDA

In [6]:
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [11]:
df.shape

(150, 6)

In [14]:
df.loc[50]

Id                            51
SepalLengthCm                7.0
SepalWidthCm                 3.2
PetalLengthCm                4.7
PetalWidthCm                 1.4
Species          Iris-versicolor
Name: 50, dtype: object

## Handling Missing Values

In [17]:
df.isna().sum()

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

## Feature Engineering

In [20]:
label_encoder = LabelEncoder()
df['Species'] = label_encoder.fit_transform(df['Species'])

# Feature scaling
scaler = StandardScaler()
scaled_features = scaler.fit_transform(df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']])
# df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm','Species']] = scaled_features

df


Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...,...
145,146,6.7,3.0,5.2,2.3,2
146,147,6.3,2.5,5.0,1.9,2
147,148,6.5,3.0,5.2,2.0,2
148,149,6.2,3.4,5.4,2.3,2


## Handling Outliers

In [23]:
# # Box Plot for each feature
# plt.figure(figsize=(10, 8))
# sns.boxplot(data=df.drop(columns=['Id']))
# plt.title('Box Plot of Features', fontsize=16)
# plt.show()

In [25]:
# # Handling Outliers

# # Calculate Q1 (25th percentile) and Q3 (75th percentile)
# Q1 = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].quantile(0.25)
# Q3 = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']].quantile(0.75)
# IQR = Q3 - Q1

# # Determine outlier bounds
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Identify outliers
# outliers = ((df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']] < lower_bound) |
#             (df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']] > upper_bound)).any(axis=1)

# # Remove outliers
# df_cleaned = df[~outliers]

# # Output the cleaned DataFrame
# print("Original DataFrame shape:", df.shape)
# print("Cleaned DataFrame shape:", df_cleaned.shape)
# print(df_cleaned)

## Data Vizulisation

In [28]:
# # Box Plot for each feature
# plt.figure(figsize=(10, 8))
# sns.boxplot(data=df_cleaned.drop(columns=['Id']))
# plt.title('Box Plot of Features', fontsize=16)
# plt.show()

## Train Test Split

In [31]:
# Train-Test Split
X = df.drop(columns=['Id', 'Species'])
y = df['Species']
X_train,X_test,y_train,y_test = train_test_split(X, y, test_size=0.2,random_state=42)

In [33]:
y_test.shape

(30,)

## Model Initialization

In [36]:
models = [DecisionTreeClassifier(),KNeighborsClassifier(),LogisticRegression()]

In [38]:
def model_validation():
    for model in models:
        cv_score = cross_val_score(model,X,y,cv=6)
        mean_accuracy = sum(cv_score)/len(cv_score)
        mean_accuracy = mean_accuracy
        mean_accuracy = round(mean_accuracy, 2)

        print(f'cv_score value:\n{cv_score}')
        print(f'mean_accuracy:\n{model} = {mean_accuracy}')
        print("____________________________________________________________________")

In [40]:
print(model_validation())

cv_score value:
[0.96 1.   0.92 0.92 0.96 1.  ]
mean_accuracy:
DecisionTreeClassifier() = 0.96
____________________________________________________________________
cv_score value:
[0.96 1.   0.92 0.92 1.   1.  ]
mean_accuracy:
KNeighborsClassifier() = 0.97
____________________________________________________________________
cv_score value:
[0.96 1.   0.96 0.92 0.96 1.  ]
mean_accuracy:
LogisticRegression() = 0.97
____________________________________________________________________
None


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Inference: For the Iris data set, **LogisticRegression** has the heighest accuracy value in default hyperparameter values.

## Model Training

## Hyper Parameter tuning

In [45]:
tuned_models = [DecisionTreeClassifier(),KNeighborsClassifier(),LogisticRegression()]

In [47]:
params_grid = {

    'Decisiontree_HPT':{ 'criterion' : ["gini", "entropy", "log_loss"],
                           'splitter' : ["best", "random"]},
    
    'KNeighborsClassifier_HPT':{'n_neighbors' : [i for i in np.arange(0,3,1)],
                                'algorithm' : ['ball_tree', 'kd_tree', 'brute'],
                                'weights' : ['uniform', 'distance']},
    
    'LogisticRegression_HPT' : {'penalty' : ['l1', 'l2', 'elasticnet'],
                                'solver' : ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'],
                                'multi_class' : ['ovr', 'multinomial']
         },
}

In [49]:
params_grid.keys()

dict_keys(['Decisiontree_HPT', 'KNeighborsClassifier_HPT', 'LogisticRegression_HPT'])

In [51]:
params_grid.values()

dict_values([{'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random']}, {'n_neighbors': [0, 1, 2], 'algorithm': ['ball_tree', 'kd_tree', 'brute'], 'weights': ['uniform', 'distance']}, {'penalty': ['l1', 'l2', 'elasticnet'], 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], 'multi_class': ['ovr', 'multinomial']}])

In [53]:
model_keys = list(params_grid.keys())
model_keys[0]

'Decisiontree_HPT'

In [55]:
def model_selection(tuned_models, params_grid):
    result=[]
    i=0
    for model in tuned_models:
        key = model_keys[i]
        params=params_grid[key]
        i += 1
        print(model)
        print(params)
        params_grid_cv = GridSearchCV(model,params,cv=6)
        params_grid_cv.fit(X_train,y_train)
        result.append(
            {'model_used': model,
             'high_score': params_grid_cv.best_score_,
             'best_params': params_grid_cv.best_params_,
        })
        
    return result

In [57]:
model_selection(tuned_models, params_grid)

DecisionTreeClassifier()
{'criterion': ['gini', 'entropy', 'log_loss'], 'splitter': ['best', 'random']}
KNeighborsClassifier()
{'n_neighbors': [0, 1, 2], 'algorithm': ['ball_tree', 'kd_tree', 'brute'], 'weights': ['uniform', 'distance']}


36 fits failed out of a total of 108.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
36 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\neighbors\_classification.py", line 213, in fit
    self._validate_params()
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameter

LogisticRegression()
{'penalty': ['l1', 'l2', 'elasticnet'], 'solver': ['lbfgs', 'liblinear', 'newton-cg', 'newton-cholesky', 'sag', 'saga'], 'multi_class': ['ovr', 'multinomial']}


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
138 fits failed out of a total of 216.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
12 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3\Lib\site-pac

[{'model_used': DecisionTreeClassifier(),
  'high_score': 0.9500000000000001,
  'best_params': {'criterion': 'gini', 'splitter': 'best'}},
 {'model_used': KNeighborsClassifier(),
  'high_score': 0.9500000000000001,
  'best_params': {'algorithm': 'ball_tree',
   'n_neighbors': 1,
   'weights': 'uniform'}},
 {'model_used': LogisticRegression(),
  'high_score': 0.975,
  'best_params': {'multi_class': 'multinomial',
   'penalty': 'l1',
   'solver': 'saga'}}]

In [58]:
tuned_hyper_model = LogisticRegression(multi_class='multinomial', penalty= 'l1', solver='saga')

In [59]:
tuned_hyper_model.fit(X_train,y_train)



## evaluation & Result:

In [61]:
# Evaluation
y_pred = tuned_hyper_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"Confussion Matrix: \n{confusion_matrix(y_pred, y_test)}")

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confussion Matrix: 
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [62]:
y_predc=tuned_hyper_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:\n", classification_report(y_test, y_pred))
print(f"Confussion Matrix: \n{confusion_matrix(y_pred, y_test)}")

Accuracy: 1.0
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00         9
           2       1.00      1.00      1.00        11

    accuracy                           1.00        30
   macro avg       1.00      1.00      1.00        30
weighted avg       1.00      1.00      1.00        30

Confussion Matrix: 
[[10  0  0]
 [ 0  9  0]
 [ 0  0 11]]


In [63]:
# New data
new_data = {
    'Id': [16, 17],
    'SepalLengthCm': [6.1, 0.7],
    'SepalWidthCm': [3.5, 0.4],
    'PetalLengthCm': [5.4, 0.5],
    'PetalWidthCm': [3.2, 0.4]
}
new_df = pd.DataFrame(new_data)

# Drop the 'Id' column
new_X = new_df.drop(columns=['Id'])

# Apply the same scaler used for the training data
new_X_scaled = scaler.transform(new_X)

# Convert the scaled data back to a DataFrame
new_df_scaled = pd.DataFrame(new_X_scaled, columns=new_X.columns)

# Make predictions
new_predictions = tuned_hyper_model.predict(new_df_scaled)

# Convert numerical predictions back to original class labels
new_predictions_labels = label_encoder.inverse_transform(new_predictions)

# Add predictions to the new data
new_df['Species_Predicted'] = new_predictions_labels

print(new_df)

   Id  SepalLengthCm  SepalWidthCm  PetalLengthCm  PetalWidthCm  \
0  16            6.1           3.5            5.4           3.2   
1  17            0.7           0.4            0.5           0.4   

  Species_Predicted  
0    Iris-virginica  
1    Iris-virginica  
