In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for folder, _subdirs, names in os.walk('/kaggle/input'):
    full_paths = (os.path.join(folder, name) for name in names)
    for full_path in full_paths:
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

# STEP 0: Importing Libraries

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import statistics as stat

In [None]:
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' would also hide any deprecation, convergence
# or copy-related warnings raised by later cells.
warnings.filterwarnings('ignore')

In [None]:
from IPython.display import Image

# Remote PNG (iris-machinelearning.png); the Image object is the cell's output
url = "https://s3.amazonaws.com/assets.datacamp.com/blog_assets/Machine+Learning+R/iris-machinelearning.png"
Image(url=url)

# STEP 1: Reading the data Iris.csv

In [None]:
# Read the raw CSV, relabel each row with its Id value, then discard the
# now-redundant Id column
iris_raw = pd.read_csv("/kaggle/input/iris/Iris.csv")
df = (
    iris_raw
    .rename(index=iris_raw["Id"])
    .drop(columns="Id")
)

# STEP 2: Data Understanding

In [None]:
# Display the loaded frame as the cell output
df

In [None]:
# (number of rows, number of columns)
df.shape

There are 150 rows and 5 columns.

In [None]:
# First five rows
df.head()

In [None]:
# Column dtypes, non-null counts and memory usage
df.info()

In [None]:
# Column labels of the frame
df.columns

In [None]:
# Number of distinct values in each column
df.nunique()

# STEP 3: Data Preparation

Need to perform the following steps to clean this data.

1. Checking for duplicates and removing them
2. Checking for null values and removing them
3. Dropping irrelevant rows and columns

Checking for duplicates

In [None]:
# Number of rows that repeat an earlier row in every column
df.duplicated().sum()

Found 3 duplicates

In [None]:
# Keep the first occurrence of each repeated row and rebind df to the result
df = df.drop_duplicates()

Removed the duplicates

Checking for null values

In [None]:
# Missing-value count per column
df.isna().sum()

No null values so nothing to remove 

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
df.describe()

# STEP 4: Data Modelling

1. Renaming columns
2. Feature Creation
3. Finding the outlier

Renaming the columns

In [None]:
# Switch the CamelCase CSV headers to snake_case names (units dropped from the name)
snake_case_names = {
    "SepalLengthCm": "sepal_length",
    "SepalWidthCm": "sepal_width",
    "PetalLengthCm": "petal_length",
    "PetalWidthCm": "petal_width",
    "Species": "species",
}
df = df.rename(columns=snake_case_names)

In [None]:
# Number of non-null sepal_length entries
df['sepal_length'].count()

In [None]:
# Distinct species labels, in order of first appearance
df['species'].unique()

In [None]:
# Column labels after the rename
df.columns

In [None]:
# Number of rows per species label
df['species'].value_counts()

Finding the Outlier

* **Boxplot**: a boxplot, also known as a box-and-whisker plot, provides a visual summary of the distribution of a dataset. It displays several descriptive statistics such as the median, quartiles, and potential outliers.


Box plot showing outliers

In [None]:
# Three-colour palette for the boxes: two purples and an orange
custom_palette = ['#9370DB', '#FFA500', '#8A2BE2']

# Wide-form box plot of df, drawn on an explicitly created axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, palette=custom_palette, ax=ax)
ax.set_title('Boxplot of Data', fontsize=14)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Values', fontsize=12)
plt.show()

The boxplot shows some outliers in sepal_width, so we remove them using the IQR rule.

In [None]:
# Remove sepal_width outliers: rows lying more than 1.5 * IQR outside the
# quartiles are reported and dropped.
# NOTE: this cell rebinds `df`, so running it a second time recomputes the
# fences on the already-filtered data.

# Step 1: Calculating the Quartiles and IQR
Q1 = df['sepal_width'].quantile(0.25)
Q3 = df['sepal_width'].quantile(0.75)
IQR = Q3 - Q1

# Step 2: Defining the Outlier Fences
upper_fence = Q3 + 1.5 * IQR
lower_fence = Q1 - 1.5 * IQR

print("Upper Fence:", upper_fence)
print("Lower Fence:", lower_fence)

# Step 3: Identifying and Removing Outliers
outliers = df[(df['sepal_width'] < lower_fence) | (df['sepal_width'] > upper_fence)]
# .copy() makes the kept rows an independent frame, so assigning to one of its
# columns later does not raise SettingWithCopyWarning.
df = df[(df['sepal_width'] >= lower_fence) & (df['sepal_width'] <= upper_fence)].copy()

print("\nIdentified outliers:")
print(outliers)



In [None]:
# Same three colours as the earlier box plot: two purples and an orange
custom_palette = ['#9370DB', '#FFA500', '#8A2BE2']

# Redraw the wide-form box plot from the current df
fig, ax = plt.subplots(figsize=(8, 6))
sns.boxplot(data=df, palette=custom_palette, ax=ax)
ax.set_title('Boxplot of Data', fontsize=14)
ax.set_xlabel('Features', fontsize=12)
ax.set_ylabel('Values', fontsize=12)
plt.show()

In [None]:
from sklearn.preprocessing import LabelEncoder
# Replace each species label with its integer code; the fitted encoder is kept
# so the codes can be mapped back to names
encoder = LabelEncoder()
species_codes = encoder.fit_transform(df['species'])
df = df.assign(species=species_codes)

In [None]:
# Number of rows per encoded species value
df['species'].value_counts()

# STEP 5: Exploratory Data Analysis

UNIVARIATE ANALYSIS

1. Categorical variables can be visualized using a Count plot, Bar Chart, Pie Plot, etc.

2. Numerical Variables can be visualized using Histogram, Box Plot, Density Plot, etc.

Observations

1. Most sepal lengths fall in the range 5.2-6.3 cm
2. Most sepal widths fall in the range 2.9-3.1 cm
3. Most petal lengths fall in the range 1-1.5 cm or around 5 cm
4. Most petal widths fall in the ranges 0-0.25 cm and 1.2-1.5 cm

In [None]:
# Purple shades; only the first entry is used for this single histogram
custom_palette = ['#9370DB', '#8A2BE2', '#7B68EE', '#6A5ACD', '#483D8B']  # Purple shades

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(df['sepal_length'], bins=20, color=custom_palette[0], edgecolor='black')
ax.set_title('Histogram of Sepal Length', fontsize=14)
ax.set_xlabel('Sepal Length', fontsize=12)
ax.set_ylabel('Frequency', fontsize=12)
plt.show()

In [None]:
# Purple shades, cycled over the panels
custom_palette = ['#9370DB', '#8A2BE2', '#7B68EE', '#6A5ACD', '#483D8B']  # Shades of purple

# Every column except the last one gets a panel; panels are laid out two per row
feature_names = df.columns[:-1]
n_features = len(feature_names)
n_rows = (n_features + 1) // 2  # round up so an odd count still gets a row

fig = plt.figure(figsize=(12, 8))
for i, feature in enumerate(feature_names):
    ax = fig.add_subplot(n_rows, 2, i + 1)
    panel_colour = custom_palette[i % len(custom_palette)]
    sns.histplot(df[feature], kde=True, color=panel_colour, ax=ax)
    ax.set_title(f'Distribution of {feature}', fontsize=12)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)

plt.tight_layout()
plt.show()


Observations

1. Most sepal lengths fall in the range 5.2-6.3 cm
2. Most sepal widths fall in the range 2.9-3.1 cm
3. Most petal lengths fall in the range 1-1.5 cm or around 5 cm
4. Most petal widths fall in the ranges 0-0.25 cm and 1.2-1.5 cm

In [None]:
custom_palette = ['#9370DB', '#8A2BE2', '#7B68EE', '#6A5ACD']  # Shades of purple

# (column, display name) for each panel, in drawing order on the 2 x 2 grid
violin_panels = [
    ('petal_length', 'Petal Length'),
    ('petal_width', 'Petal Width'),
    ('sepal_length', 'Sepal Length'),
    ('sepal_width', 'Sepal Width'),
]

fig = plt.figure(figsize=(15, 10))

# One violin plot per measurement, split by species
for position, (column, label) in enumerate(violin_panels, start=1):
    ax = fig.add_subplot(2, 2, position)
    sns.violinplot(x='species', y=column, data=df, palette=custom_palette, ax=ax)
    ax.set_title(f'Violin Plot of {label}', fontsize=12)
    ax.set_xlabel('Species', fontsize=10)
    ax.set_ylabel(f'{label} (cm)', fontsize=10)

plt.tight_layout()
plt.show()

In [None]:

# Restrict the frame to its float64 / int64 columns before correlating
numeric_columns = df.select_dtypes(include=['float64', 'int64'])

# Light-to-dark purple colours, passed to the heatmap as its colour map
custom_palette = ['#E0E0FF', '#B0B0E6', '#9370DB', '#7B68EE', '#6A5ACD']  # Shades of purple

# Annotated heatmap of the pairwise correlations
correlations = numeric_columns.corr()
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(correlations, annot=True, cmap=custom_palette, linewidths=0.5, ax=ax)
ax.set_title('Correlation Heatmap of Iris Dataset', fontsize=14)
plt.show()



BIVARIATE ANALYSIS

Plotting the pair plot 

In [None]:
# Two purples plus a contrasting orange, one colour per species value
custom_palette = ['#9370DB', '#8A2BE2','#FFA500']  # Purple shades + Orange as a contrast

# pairplot creates its own figure and returns the PairGrid that owns it
pair_grid = sns.pairplot(df, hue='species', palette=custom_palette)

# plt.title() would only label the last-drawn subplot of the grid, so the
# title is set on the figure instead; y > 1 lifts it clear of the top row.
pair_grid.figure.suptitle('Pair Plot with Custom Purple Palette', fontsize=14, y=1.02)
plt.show()


In [None]:
# One purple shade per species bar
custom_palette = ['#9370DB', '#8A2BE2', '#7B68EE']  # Purple shades

# Bar height is seaborn's default aggregate of sepal_length per species;
# ci=None switches the error bars off
fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(data=df, x='species', y='sepal_length', ci=None, palette=custom_palette, ax=ax)
ax.set_title('Bar Plot of Sepal Length by Species', fontsize=14)
ax.set_xlabel('Species', fontsize=12)
ax.set_ylabel('Sepal Length', fontsize=12)
plt.show()

In [None]:
# One purple shade per species bar
custom_palette = ['#9370DB', '#8A2BE2', '#7B68EE']  # Purple shades

# Bar height is seaborn's default aggregate of petal_length per species;
# ci=None switches the error bars off
plt.figure(figsize=(8, 6))
sns.barplot(x='species', y='petal_length', data=df, ci=None, palette=custom_palette)
plt.title('Bar Plot of Petal Length by Species', fontsize=14)
plt.xlabel('Species', fontsize=12)
# Axis label spelled with a space, matching the title and the sepal-length plot
plt.ylabel('Petal Length', fontsize=12)
plt.show()

# STEP 6: TRAIN-TEST SPLIT

In [None]:
# Column dtypes and non-null counts of the frame that goes into the split
df.info()

In [None]:
# First five rows of the frame that goes into the split
df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Target is the species column; the features are every other column
y = df['species']
X = df.drop(columns='species')

# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [None]:
# (rows, columns) of the full frame, for comparison with the split sizes
df.shape

In [None]:
# Print the shape of each split, in the order: train X, train y, test X, test y
for split_part in (X_train, y_train, X_test, y_test):
    print(split_part.shape)

In [None]:
# Column dtypes and non-null counts of the training features
X_train.info()

In [None]:
# Summary statistics of the training features
X_train.describe()

In [None]:
# One histogram per training feature on a fixed 2 x 2 grid.
# custom_palette is not assigned in this cell; it is the list left behind by
# an earlier cell, and its colours are cycled over the panels.
fig = plt.figure(figsize=(15, 10))
palette_size = len(custom_palette)
for i, feature in enumerate(X_train.columns):
    ax = fig.add_subplot(2, 2, i + 1)
    ax.hist(
        X_train[feature],
        bins=18,
        color=custom_palette[i % palette_size],
        edgecolor='black',
        linewidth=1.5,
    )
    ax.set_title(f'Histogram of {feature}', fontsize=12)
    ax.set_xlabel(feature, fontsize=10)
    ax.set_ylabel('Frequency', fontsize=10)

# Stop neighbouring panels from overlapping, then render
plt.tight_layout()
plt.show()

### Analysis of Boxplot Charts

The upper row of the boxplot chart below suggests that the distributions of Sepal Length and Sepal Width among different flowers are quite similar to each other. However, Iris-setosa stands out with the most extreme values for both features.

The bottom row reinforces our observation from the histograms above. The Iris-setosa flower exhibits distinct characteristics in Petal Length and Width compared to the other two flowers. Additionally, it is evident that Iris-setosa has a much narrower distribution (represented by shorter boxplots) in these features compared to Iris-versicolor and Iris-virginica, which have longer boxplots.

These findings imply that the Iris-setosa flower may be the easiest to classify due to its distinguishable features from the other two flowers. However, further analysis is needed to confirm this assumption.




# STEP 7: Model Creation

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn import metrics 

# 7.1  Logistic Regression 

In [None]:
from sklearn.linear_model import LogisticRegression
# Fit a default-configured logistic regression on the training split
# (fit returns the estimator itself) and predict the test split
LR = LogisticRegression().fit(X_train, y_train)
y_pred_lr = LR.predict(X_test)

# 7.2 Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Decision tree fitted on the training split, predictions for the test split.
# Tree construction is randomized, so the seed is pinned (same value as the
# train/test split) to make the reported scores reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
y_pred_dt = dt.predict(X_test)

# 7.3 K- Nearest Neighbors (KNN)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 5-nearest-neighbours classifier: fit on the training split
# (fit returns the estimator itself), then predict the test split
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train, y_train)
y_pred_knn = knn.predict(X_test)


# 7.4 Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest fitted on the training split, predictions for the test split.
# Bootstrapping and feature sampling are random, so the seed is pinned (same
# value as the train/test split) to make the reported scores reproducible.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# 7.5 Naive Bayes

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes: fit on the training split
# (fit returns the estimator itself), then predict the test split
nb = GaussianNB().fit(X_train, y_train)
y_pred_nb = nb.predict(X_test)

# STEP 8 : Model Selection

In [None]:
import pandas as pd
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score
import matplotlib.pyplot as plt

# Purple shades for the accuracy bars
custom_palette = ['#9370DB', '#8A2BE2', '#7B68EE', '#6A5ACD', '#483D8B']

# Test-set predictions of every model fitted in STEP 7, keyed by display name.
# No SVM is trained in this notebook, so no 'svm' row is reported: scoring
# y_pred_dt under that label would only duplicate the Decision Trees row.
predictions_by_model = {
    'Logistic Regression': y_pred_lr,
    'Decision Trees': y_pred_dt,
    'Random Forest': y_pred_rf,
    'Naive Bayes': y_pred_nb,
    'K-Nearest Neighbors (KNN)': y_pred_knn,
}


def _score_model(name, y_true, y_pred):
    """Return one comparison-table row for a model.

    Accuracy plus support-weighted precision, recall and F1 of `y_pred`
    against `y_true`, as a dict keyed by the table's column names.
    """
    return {
        'Model': name,
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred, average='weighted'),
        'Recall': recall_score(y_true, y_pred, average='weighted'),
        'F1 Score': f1_score(y_true, y_pred, average='weighted'),
    }


# Build the table in one go from a list of row dicts rather than growing an
# empty DataFrame with repeated pd.concat calls
model_comparison = pd.DataFrame(
    [_score_model(name, y_test, y_pred) for name, y_pred in predictions_by_model.items()],
    columns=['Model', 'Accuracy', 'Precision', 'Recall', 'F1 Score'],
)

# Sort the DataFrame by Accuracy, best model first
model_comparison = model_comparison.sort_values(by='Accuracy', ascending=False)

# Print the model comparison table
print(model_comparison)

# Bar plot of Accuracy scores with shades of purple
plt.figure(figsize=(8, 6))
plt.bar(model_comparison['Model'], model_comparison['Accuracy'], color=custom_palette)
plt.xlabel('Model')
plt.ylabel('Accuracy')
plt.title('Model Comparison: Accuracy')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# STEP 9 : Conclusion 

## The KNN model has an accuracy of 95.35%