In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file in it.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

> # **Iris Flower Classification**

# **Import Libraries**

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# **Import Dataset**

In [None]:
# Read the Iris CSV from the Kaggle input directory and display the frame.
iris_csv_path = '/kaggle/input/iris-flower-dataset/IRIS.csv'
df = pd.read_csv(iris_csv_path)
df

# **Data preprocessing**

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Preview the first five rows.
df.head(n=5)

In [None]:
# Preview the last five rows (five is the pandas default, spelled out here).
df.tail(n=5)

In [None]:
# Print the frame summary: column dtypes, non-null counts and memory usage.
df.info()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# (rows, columns) of the frame as loaded, before duplicates are dropped.
df.shape

In [None]:
# Boolean mask: True for every row that repeats an earlier row exactly.
df.duplicated()

In [None]:
# Number of rows flagged as duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Count the missing entries in each column and display the totals.
null_values = df.isnull().sum()
null_values

In [None]:
# Drop rows that exactly repeat an earlier row. Rebinding `df` to the returned
# frame (rather than using inplace=True) keeps the step explicit and chainable;
# the original row labels of the surviving rows are preserved.
df = df.drop_duplicates()

In [None]:
# Count the non-null entries of every column within each species group.
species_counts = df.groupby(['species']).count()
print(species_counts)

In [None]:
# (rows, columns) of the frame after the duplicate rows were dropped.
df.shape

In [None]:
# Mode of the species column.
# Series.mode() returns a Series (several values can tie for most frequent),
# so join its values for printing instead of embedding the Series repr
# (index and dtype lines) in the message.
species_mode = df['species'].mode()
mode_labels = ", ".join(map(str, species_mode))
print(f'Mode of Species :{mode_labels}')


# **EDA (Exploratory Data Analysis)**

In [None]:
# Draw pandas' default histogram grid for df, then render the figure.
df.hist()
plt.show()

In [None]:
import warnings

# Silence FutureWarning messages so they do not clutter the plot output.
warnings.filterwarnings("ignore", category=FutureWarning)

features = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']

# One stacked histogram (with KDE overlay) per feature on a 2x2 grid,
# coloured by species.
plt.figure(figsize=(15, 10))
for position, column in enumerate(features, start=1):
    plt.subplot(2, 2, position)
    readable_name = column.replace("_", " ").title()
    sns.histplot(data=df, x=column, hue='species', kde=True, palette='bright', multiple='stack')
    plt.title(f'Distribution of {readable_name} by Species')

plt.show()

In [None]:
# Bar chart of how many rows belong to each species.
species = df['species'].value_counts()

plt.figure(figsize=(5, 4))
plt.bar(x=species.index, height=species.values, color='pink')
plt.title('distribution of species')
plt.show()

In [None]:
# Pairwise feature plots coloured by species, one marker shape per species.
species_markers = ["o", "s", "D"]
sns.pairplot(data=df, hue='species', markers=species_markers)
plt.show()

**Diagonal Plot: Shows the distribution of individual features (histogram/KDE).**

**Off-Diagonal Scatter Plot: Shows pairwise relationships between different features using scatter plots.**

In [None]:
# One box plot per measurement, grouped by species. The four plots were
# previously four copy-pasted cells that differed only in the column and the
# title, so they are driven from a single (column, title) table instead.
# Each plot still gets its own 4x3 figure and is rendered separately.
boxplot_specs = [
    ('sepal_length', 'Sepal Length Distribution by Species'),
    ('sepal_width', 'Sepal width Distribution by Species'),
    ('petal_length', 'petal Length Distribution by Species'),
    ('petal_width', 'petal width Distribution by Species'),
]

for column, title in boxplot_specs:
    plt.figure(figsize=(4, 3))
    sns.boxplot(x='species', y=column, data=df)
    plt.title(title)
    plt.show()

In [None]:
# One violin plot per measurement, grouped by species, laid out on a 2x2 grid
# in the same order as before (petal length, petal width, sepal length,
# sepal width).
violin_columns = ['petal_length', 'petal_width', 'sepal_length', 'sepal_width']

plt.figure(figsize=(14, 10))
for position, column in enumerate(violin_columns, start=1):
    plt.subplot(2, 2, position)
    sns.violinplot(x='species', y=column, data=df)

# Render explicitly, as the other plotting cells do, so the cell does not end
# on a bare Axes object whose repr would be echoed as output.
plt.show()

**Violin Plot:** A combination of a box plot and a kernel density plot. It shows both the distribution and the probability density of the data for each species:

* The wider sections represent where more data points are concentrated.
* The thinner sections represent where fewer data points are found.

In [None]:
# Pairwise correlation between every column except the species label.
feature_columns = df.drop('species', axis=1)
corr_matrix = feature_columns.corr()
corr_matrix

In [None]:
# Annotated heatmap of the feature correlation matrix.
plt.figure(figsize=(6, 6))
sns.heatmap(
    data=corr_matrix,
    annot=True,
    cmap='pink',
    square=True,
    linewidths=1,
)
plt.title('Feature Correlation Heatmap')
plt.show()

# **Model Selection**

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace the species labels with integer codes.
# NOTE(review): this overwrites df['species'] in place, so any earlier cell
# re-run after this point sees the integer codes instead of the original labels.
le = LabelEncoder()
le.fit(df['species'])
df['species'] = le.transform(df['species'])

In [None]:
# Feature matrix: every column except the target. Target vector: the species column.
X = df.drop(columns=['species'])
y = df['species']

In [None]:
# Preview the first rows of the feature matrix (target column removed).
X.head()

In [None]:
# Check class balance: share of rows per class, expressed as a percentage.
class_share = y.value_counts(normalize=True)
class_share * 100


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for testing; random_state fixes the shuffle so the
# split is reproducible. stratify=y keeps the class proportions the same in
# the train and test parts, so no class is under-represented in the small
# test set purely by chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

In [None]:
# Import different Models

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

## **LogisticRegression**

In [None]:
# Fit logistic regression (C=100, up to 1000 iterations) and report test accuracy in percent.
lr = LogisticRegression(C=100, max_iter=1000).fit(X_train, y_train)
lr_score = lr.score(X_test, y_test)
print(lr_score * 100)

## **DecisionTreeClassifier**

In [None]:
# Fit a decision tree and report test accuracy in percent.
# random_state is fixed because the tree's split selection is randomised;
# without it the reported score can change from one run to the next.
dtc = DecisionTreeClassifier(random_state=42)
dtc.fit(X_train, y_train)
dtc_score = dtc.score(X_test, y_test)
print(dtc_score * 100)

## **RandomForestClassifier**

In [None]:
# Fit a random forest and report test accuracy in percent.
# random_state is fixed because the forest's bootstrap sampling and feature
# selection are random; without it the reported score can change between runs.
rfc = RandomForestClassifier(random_state=42)
rfc.fit(X_train, y_train)
rfc_score = rfc.score(X_test, y_test)
print(rfc_score * 100)

## **GaussianNB**

In [None]:
# Fit Gaussian naive Bayes and report test accuracy in percent.
gnb = GaussianNB().fit(X_train, y_train)
gnb_score = gnb.score(X_test, y_test)
print(gnb_score * 100)

## **KNeighborsClassifier**

In [None]:
# Fit a 3-nearest-neighbours classifier and report test accuracy in percent.
knn = KNeighborsClassifier(n_neighbors=3).fit(X_train, y_train)
knn_score = knn.score(X_test, y_test)
print(knn_score * 100)

## **SVC**

In [None]:
# Fit a support vector classifier with default settings and report test accuracy in percent.
svc = SVC().fit(X_train, y_train)
svc_score = svc.score(X_test, y_test)
print(svc_score * 100)

In [None]:
# Collect every fitted model's predictions on the test set.

model_evaluation = {}  # filled in by the evaluation loop further down
models = [lr, dtc, rfc, gnb, knn, svc]
model_results = {}

for model in models:
    # Key each entry by the estimator's class name. Reading it from the type
    # is robust; slicing the repr at the first "(" only works while the repr
    # happens to start with the class name.
    model_name = type(model).__name__
    # The prediction array stays wrapped in a one-element list, which is the
    # shape the downstream evaluation loop expects.
    model_results[model_name] = [model.predict(X_test)]

print(model_results)

In [None]:
# Import metrics for evaluation

from sklearn.metrics import confusion_matrix, accuracy_score, f1_score, precision_score, recall_score

In [None]:
# Score every model's predictions: accuracy plus macro-averaged F1, precision
# and recall, each as a percentage rounded to two decimals.
for model, preds in model_results.items():
    # Each entry is a one-element list wrapping the prediction array. Unwrap it
    # once, rather than rebuilding and transposing a DataFrame for every metric.
    y_pred = preds[0]
    model_evaluation[model] = [
        round(accuracy_score(y_test, y_pred) * 100, 2),
        round(f1_score(y_test, y_pred, average='macro') * 100, 2),
        round(precision_score(y_test, y_pred, average='macro') * 100, 2),
        round(recall_score(y_test, y_pred, average='macro') * 100, 2),
    ]

In [None]:
# Print the raw mapping of model name -> [accuracy, F1, precision, recall] (percentages).
print(model_evaluation)

In [None]:
# Tabulate the scores: one column per model, one row per metric, shaded with
# a single gradient computed over the whole table (axis=None).
metric_names = ["Accuracy", "F-1 Score", "Precision Score", "Recall Score"]
results_df = pd.DataFrame(model_evaluation, index=metric_names)
results_df.style.background_gradient(axis=None, cmap='pink')

In [None]:
# Transpose so that each model becomes a row (one group of bars per model on
# the x-axis) and each metric becomes a column (one bar per metric in a group).
results_df_transposed = results_df.transpose()

results_df_transposed.plot.bar(figsize=(7, 6), cmap='pink', edgecolor='black')

# Title and axis labels
plt.title('Model Performance Comparison', fontsize=16)
plt.xlabel('Models', fontsize=14)
plt.ylabel('Scores (%)', fontsize=14)

# Tilt the model names so they stay readable
plt.xticks(rotation=45)

# Legend, layout and render
plt.legend(loc='lower right')
plt.tight_layout()
plt.show()