In [None]:
# 1. Importing Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import classification_report, accuracy_score
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

import os


In [None]:
# Load the iris CSV and report its basic dimensions.
# The path is assembled from components so the separator matches the host OS;
# a literal backslash path only resolves on Windows.
dataSetPath = os.path.join("..", "..", "1_DataSets", "iris.csv")

dataset = pd.read_csv(dataSetPath)


print(f"Dataset Columns: {dataset.columns.to_list()}")
print("****************************")
print(f"Dataset Shape: {dataset.shape}")
print("****************************")

# Remember the raw row count so later cells can report how many rows were removed.
totalNumberOfRowInActualDataset = len(dataset)
print(f"Total Rows in dataset File: {totalNumberOfRowInActualDataset}")

In [None]:
# Remove repeated rows from `dataset` (in place) and summarise what was dropped.
dataset.drop_duplicates(inplace=True)

numberOfRowAfterRemovingDuplicates = len(dataset)
duplicateRowCount = totalNumberOfRowInActualDataset - numberOfRowAfterRemovingDuplicates

# Emit the report one line at a time so the output matches a sequence of prints.
reportSeparator = "****************************"
for reportLine in (
    f"Dataset Shape after removing duplcates: {dataset.shape}",
    reportSeparator,
    f"Remaining Rows in Dataset: {numberOfRowAfterRemovingDuplicates}",
    reportSeparator,
    f"Total Duplicates: {duplicateRowCount}",
):
    print(reportLine)

In [None]:
# Step 2: Handle missing values by discarding every row that contains a null
cleanedDataset = dataset.dropna()

# Row counts before/after tell us how many rows the null filter removed.
numberOfRowAfterRemovingNullValues = cleanedDataset.shape[0]
nullRowCount = numberOfRowAfterRemovingDuplicates - numberOfRowAfterRemovingNullValues

print(f"Remaining Rows in Dataset: {numberOfRowAfterRemovingNullValues}")
print("****************************")
print(f"Removed Number of rows containing Null Values: {nullRowCount}")

In [None]:
# Preview the first rows of the de-duplicated, null-free frame.
cleanedDataset.head()

In [None]:
# Show column dtypes and non-null counts for the cleaned frame.
cleanedDataset.info()

In [None]:
# Collect the distinct values of the target, which this notebook takes to be the last column.
targetColumnLabel = cleanedDataset.columns[-1]
unique_classes = cleanedDataset[targetColumnLabel].unique()
print("Unique Target Classes:", unique_classes)

In [None]:
# Encode the target (last) column as integer class codes when it is not already numeric.
# Testing for "not numeric" rather than `dtype == 'object'` also covers pandas
# string and categorical dtypes, which would otherwise skip encoding and reach
# the regression model as raw labels.
encodedTargetColumn = cleanedDataset.columns[-1]
if not pd.api.types.is_numeric_dtype(cleanedDataset[encodedTargetColumn]):
    # Work on an explicit copy: the frame comes from a filtering step, and
    # assigning a column onto such a result can trigger SettingWithCopyWarning.
    cleanedDataset = cleanedDataset.copy()
    # Category codes follow the sorted order of the distinct labels.
    cleanedDataset[encodedTargetColumn] = (
        cleanedDataset[encodedTargetColumn].astype('category').cat.codes
    )


In [None]:
# Re-check column dtypes now that the target-encoding cell has run.
cleanedDataset.info()

In [None]:
# Preview the first rows again now that the target-encoding cell has run.
cleanedDataset.head()

In [None]:
# 5. Exploratory Data Analysis (EDA) and Plotting
# Pairplot for feature visualization.
# Colour by the last column, the same target convention every other cell uses,
# instead of a hard-coded column name that breaks if the CSV header differs.
pairplotHueColumn = cleanedDataset.columns[-1]
sns.pairplot(cleanedDataset, hue=pairplotHueColumn, diag_kind='kde')
plt.suptitle("Feature Relationships", y=1.02)
plt.show()




In [None]:
# Scatterplot of feature columns 0 and 1, coloured by the target (last) column
firstFeatureIndex = 0
secondFeatureIndex = 1

# Resolve the column labels once instead of re-deriving them in every argument.
columnNames = cleanedDataset.columns.to_list()
xColumn = columnNames[firstFeatureIndex]
yColumn = columnNames[secondFeatureIndex]
hueColumn = columnNames[-1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=cleanedDataset[xColumn],
    y=cleanedDataset[yColumn],
    hue=cleanedDataset[hueColumn],
    palette='deep',
    s=70,
)
plt.title(f"{xColumn} vs {yColumn}")
plt.show()

In [None]:
# Scatterplot of feature columns 0 and 2, coloured by the target (last) column
firstFeatureIndex = 0
secondFeatureIndex = 2

# Resolve the column labels once instead of re-deriving them in every argument.
columnNames = cleanedDataset.columns.to_list()
xColumn = columnNames[firstFeatureIndex]
yColumn = columnNames[secondFeatureIndex]
hueColumn = columnNames[-1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=cleanedDataset[xColumn],
    y=cleanedDataset[yColumn],
    hue=cleanedDataset[hueColumn],
    palette='deep',
    s=70,
)
plt.title(f"{xColumn} vs {yColumn}")
plt.show()

In [None]:
# Scatterplot of feature columns 0 and 3, coloured by the target (last) column
firstFeatureIndex = 0
secondFeatureIndex = 3

# Resolve the column labels once instead of re-deriving them in every argument.
columnNames = cleanedDataset.columns.to_list()
xColumn = columnNames[firstFeatureIndex]
yColumn = columnNames[secondFeatureIndex]
hueColumn = columnNames[-1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=cleanedDataset[xColumn],
    y=cleanedDataset[yColumn],
    hue=cleanedDataset[hueColumn],
    palette='deep',
    s=70,
)
plt.title(f"{xColumn} vs {yColumn}")
plt.show()

In [None]:
# Scatterplot of feature columns 1 and 2, coloured by the target (last) column
firstFeatureIndex = 1
secondFeatureIndex = 2

# Resolve the column labels once instead of re-deriving them in every argument.
columnNames = cleanedDataset.columns.to_list()
xColumn = columnNames[firstFeatureIndex]
yColumn = columnNames[secondFeatureIndex]
hueColumn = columnNames[-1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=cleanedDataset[xColumn],
    y=cleanedDataset[yColumn],
    hue=cleanedDataset[hueColumn],
    palette='deep',
    s=70,
)
plt.title(f"{xColumn} vs {yColumn}")
plt.show()

In [None]:
# Scatterplot of feature columns 1 and 3, coloured by the target (last) column
firstFeatureIndex = 1
secondFeatureIndex = 3

# Resolve the column labels once instead of re-deriving them in every argument.
columnNames = cleanedDataset.columns.to_list()
xColumn = columnNames[firstFeatureIndex]
yColumn = columnNames[secondFeatureIndex]
hueColumn = columnNames[-1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=cleanedDataset[xColumn],
    y=cleanedDataset[yColumn],
    hue=cleanedDataset[hueColumn],
    palette='deep',
    s=70,
)
plt.title(f"{xColumn} vs {yColumn}")
plt.show()

In [None]:
# Scatterplot of feature columns 2 and 3, coloured by the target (last) column
firstFeatureIndex = 2
secondFeatureIndex = 3

# Resolve the column labels once instead of re-deriving them in every argument.
columnNames = cleanedDataset.columns.to_list()
xColumn = columnNames[firstFeatureIndex]
yColumn = columnNames[secondFeatureIndex]
hueColumn = columnNames[-1]

plt.figure(figsize=(10, 8))
sns.scatterplot(
    x=cleanedDataset[xColumn],
    y=cleanedDataset[yColumn],
    hue=cleanedDataset[hueColumn],
    palette='deep',
    s=70,
)
plt.title(f"{xColumn} vs {yColumn}")
plt.show()

In [None]:
# 6. Splitting Data (40% for Testing, 60% for Training)
# Every column except the last is a feature; the last column is the target.
X = cleanedDataset.iloc[:, :-1]
Y = cleanedDataset.iloc[:, -1]

# A fixed random_state keeps the partition identical from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.4,
    random_state=42,
)

print("Training Data Shape:", X_train.shape)
print("Testing Data Shape:", X_test.shape)


In [None]:
# 7. Instantiating LR Model
model = LinearRegression()
# Fit on the training split only. Fitting on the full X/Y would let the model
# see the held-out rows it is evaluated on, inflating the reported results.
model.fit(X_train, y_train)

# Generate continuous predictions for the held-out test rows
Y_pred = model.predict(X_test)


In [None]:

# Convert regression predictions to nearest integer class labels
y_pred_classes = np.rint(Y_pred).astype(int)  # Round to nearest integer
y_pred_classes = np.clip(y_pred_classes, 0, (len(unique_classes)-1))  # Ensure valid class indices

print(f"length of Y_pred: {len(Y_pred)}")
print(f"length of y_pred_classes: {len(y_pred_classes)}")
print(f"length of y_test: {len(y_test)}")

# Compute confusion matrix.
# Passing the full set of class indices pins the matrix to one row/column per
# class even when a class is missing from both y_test and the predictions;
# otherwise the matrix shrinks and no longer lines up with the display labels.
class_indices = np.arange(len(unique_classes))
cm = confusion_matrix(y_test, y_pred_classes, labels=class_indices)

# Plot the confusion matrix.
# Integer category codes are assigned in sorted label order, so the display
# labels are sorted to line up with indices 0..n-1 (appearance order only
# matches by coincidence).
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=sorted(unique_classes))
disp.plot(cmap='Blues', values_format='d')

# Add title and labels
plt.title("Confusion Matrix for Linear Regression on Iris Dataset", fontsize=14, fontweight="bold")
plt.xlabel("Predicted Labels")
plt.ylabel("True Labels")
plt.show()