In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Load the dataset
df = pd.read_csv('Iris.csv')

# Display first few rows
df.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


In [2]:
# Step 1: Data Cleaning
# Check missing values
df.columns = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species']
print("Missing Values:\n", df.isnull().sum())

# Check "?" entries
print("Question Mark Entries:\n", (df == '?').sum())

# Check negative values
numeric_cols = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width']
print("Negative Values:\n", (df[numeric_cols] < 0).sum())

Missing Values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64
Question Mark Entries:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
species         0
dtype: int64
Negative Values:
 sepal_length    0
sepal_width     0
petal_length    0
petal_width     0
dtype: int64


In [3]:
# Step 2: Error Correction (Remove outliers using IQR)
def remove_outliers(data, column):
    Q1 = data[column].quantile(0.25)
    Q3 = data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return data[(data[column] >= lower_bound) & (data[column] <= upper_bound)]

for col in numeric_cols:
    df = remove_outliers(df, col)

print("\nShape after outlier removal:", df.shape)


Shape after outlier removal: (145, 5)


In [5]:
# Step 3: Data Transformation (Min-Max Scaling)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

scaler = MinMaxScaler()
X = df[numeric_cols]
X_scaled = scaler.fit_transform(X)

# Encode target variable
le = LabelEncoder()
y = le.fit_transform(df['species'])

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

# Step 5: Model Building
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

log_model = LogisticRegression()
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

log_acc = accuracy_score(y_test, log_pred)
print("Logistic Regression Accuracy:", log_acc)


from sklearn.naive_bayes import GaussianNB  # Use GaussianNB for continuous features

nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)

nb_acc = accuracy_score(y_test, nb_pred)
print("Naïve Bayes Accuracy:", nb_acc)

print("Logistic Regression Accuracy:", log_acc)
print("Naïve Bayes Accuracy:", nb_acc)


Logistic Regression Accuracy: 0.9310344827586207
Naïve Bayes Accuracy: 0.9310344827586207
Logistic Regression Accuracy: 0.9310344827586207
Naïve Bayes Accuracy: 0.9310344827586207
