In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline

In [2]:
data = pd.read_csv("../input/iris-flower-dataset/IRIS.csv")

In [3]:
data.shape

(150, 5)

In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal_length  150 non-null    float64
 1   sepal_width   150 non-null    float64
 2   petal_length  150 non-null    float64
 3   petal_width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [5]:
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [6]:
data.isnull().sum()

sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64

In [7]:
data.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width',
       'species'],
      dtype='object')

In [8]:
mean = np.mean(data['sepal_width'])
std = np.std(data['sepal_width'])

data['width-Z-Score'] = (data['sepal_width'] - mean) / std
data.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,width-Z-Score
0,5.1,3.5,1.4,0.2,Iris-setosa,1.032057
1,4.9,3.0,1.4,0.2,Iris-setosa,-0.124958
2,4.7,3.2,1.3,0.2,Iris-setosa,0.337848
3,4.6,3.1,1.5,0.2,Iris-setosa,0.106445
4,5.0,3.6,1.4,0.2,Iris-setosa,1.26346


In [9]:
print(f"Here are the outliers based on the z-score threshold, 3:\n {data[data['width-Z-Score'] > 3]}")

Here are the outliers based on the z-score threshold, 3:
     sepal_length  sepal_width  petal_length  petal_width      species  \
15           5.7          4.4           1.5          0.4  Iris-setosa   

    width-Z-Score  
15       3.114684  


In [10]:
data_1 = data[['sepal_length','sepal_width','petal_length','petal_width']]
data_1 = pd.DataFrame(data_1)
means = data_1.mean()
stds = data_1.std()
z_scores = (data - means) / stds
threshold = 3  
outliers = (z_scores.abs() > threshold).any(axis=1)
print("Rows with outliers:")
print(data_1[outliers])

Rows with outliers:
    sepal_length  sepal_width  petal_length  petal_width
15           5.7          4.4           1.5          0.4


In [11]:
df_no_outliers = data_1[~outliers]
print("DataFrame without outliers:")
print(df_no_outliers.head(5))

DataFrame without outliers:
   sepal_length  sepal_width  petal_length  petal_width
0           5.1          3.5           1.4          0.2
1           4.9          3.0           1.4          0.2
2           4.7          3.2           1.3          0.2
3           4.6          3.1           1.5          0.2
4           5.0          3.6           1.4          0.2


In [12]:
merged_df = pd.merge(df_no_outliers, data, left_index=True, right_index=True)
merged_df.head(5)

Unnamed: 0,sepal_length_x,sepal_width_x,petal_length_x,petal_width_x,sepal_length_y,sepal_width_y,petal_length_y,petal_width_y,species,width-Z-Score
0,5.1,3.5,1.4,0.2,5.1,3.5,1.4,0.2,Iris-setosa,1.032057
1,4.9,3.0,1.4,0.2,4.9,3.0,1.4,0.2,Iris-setosa,-0.124958
2,4.7,3.2,1.3,0.2,4.7,3.2,1.3,0.2,Iris-setosa,0.337848
3,4.6,3.1,1.5,0.2,4.6,3.1,1.5,0.2,Iris-setosa,0.106445
4,5.0,3.6,1.4,0.2,5.0,3.6,1.4,0.2,Iris-setosa,1.26346


In [13]:
merged_df = merged_df[['sepal_length_x','sepal_width_x','petal_length_x','petal_width_x','species']]

In [14]:
merged_df.columns = merged_df.columns.str.replace('_x', '')

In [15]:
df = merged_df.copy()

In [16]:
df.head(5)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [17]:
unique_summary = df.apply(lambda col: col.unique())
print(unique_summary)

sepal_length    [5.1, 4.9, 4.7, 4.6, 5.0, 5.4, 4.4, 4.8, 4.3, ...
sepal_width     [3.5, 3.0, 3.2, 3.1, 3.6, 3.9, 3.4, 2.9, 3.7, ...
petal_length    [1.4, 1.3, 1.5, 1.7, 1.6, 1.1, 1.2, 1.0, 1.9, ...
petal_width     [0.2, 0.4, 0.3, 0.1, 0.5, 0.6, 1.4, 1.5, 1.3, ...
species            [Iris-setosa, Iris-versicolor, Iris-virginica]
dtype: object


In [18]:
df['species'].dtype

dtype('O')

In [19]:
df['species'] = df['species'].astype('category')
df['species'].dtype


CategoricalDtype(categories=['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], ordered=False)

In [20]:
X = data[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']]
y = data['species']
 
models = {
    'LogisticRegression': (LogisticRegression(), {'C': [0.1, 1, 10]}),
    'KNeighborsClassifier': (KNeighborsClassifier(), {'n_neighbors': range(1, 21)}),
    'DecisionTreeClassifier': (DecisionTreeClassifier(random_state=42), {'max_depth': [None, 5, 10]}),
    'RandomForestClassifier': (RandomForestClassifier(random_state=42), {'n_estimators': [10, 50, 100], 'max_depth': [None, 5, 10, 15]}),
}

model_results = {}

for name, (model, params) in models.items():
    pipeline = Pipeline([
        ('scaler', StandardScaler()),
        ('model', GridSearchCV(model, params, cv=5, scoring='accuracy', n_jobs=-1))
    ])
    
    cv_scores = cross_val_score(pipeline, X, y, cv=5, scoring='accuracy')
    mean_cv_accuracy = cv_scores.mean()
    
    pipeline.fit(X, y)
    best_model = pipeline.named_steps['model'].best_estimator_
    
    model_results[name] = {'Cross-Validation Accuracy': mean_cv_accuracy}
    
    print(f"\nModel: {name}")
    print(f"Cross-Validation Mean Accuracy: {mean_cv_accuracy:.2f}")
    print(f"Best Parameters: {best_model.get_params()}")

best_model_name = max(model_results, key=lambda model: model_results[model]['Cross-Validation Accuracy'])

print(f"\nBest Model: {best_model_name} with Metrics: {model_results[best_model_name]}")



Model: LogisticRegression
Cross-Validation Mean Accuracy: 0.97
Best Parameters: {'C': 10, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}

Model: KNeighborsClassifier
Cross-Validation Mean Accuracy: 0.95
Best Parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 6, 'p': 2, 'weights': 'uniform'}

Model: DecisionTreeClassifier
Cross-Validation Mean Accuracy: 0.95
Best Parameters: {'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'random_state': 42, 'splitter': 'best'}

Model: RandomForestClassifier
Cross-Validation M

In [21]:
import pickle
save_path = r'C:\Users\Mr.J10\Desktop\Full_Model\Iris_model_02.pkl'
with open(save_path, 'wb') as file:
    pickle.dump(best_model, file)

In [22]:
import pickle
save_path = r'C:\Users\Mr.J10\Desktop\Full_Model\Iris_model_02.pkl'
model_load = pickle.load(open(save_path, 'rb'))

In [23]:
model_load.predict([[4.6,3.1,1.5,0.2]])

array(['Iris-virginica'], dtype=object)