# In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# ----------------------------------------
# Step 1: Load Dataset
# ----------------------------------------
df = pd.read_csv("../files/iris.csv")  # Relative path: resolved against the current working directory

# ----------------------------------------
# (e) Data Cleaning
# ----------------------------------------

# The four measurement columns that get range-checked further down
numeric_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# '?' is used as a missing-value placeholder; turn it into a real NaN
df.replace('?', np.nan, inplace=True)

# Coerce every column except the label to a numeric dtype.
# errors='coerce' turns anything unparseable into NaN instead of raising.
for col in df.columns:
    if col == 'variety':
        continue
    df[col] = pd.to_numeric(df[col], errors='coerce')

# Discard every row that still has a missing value in any column
df = df.dropna()

# Keep only rows in which none of the measurements is negative
# (no NaN remains at this point, so "not any < 0" equals "all >= 0")
df = df[~(df[numeric_cols] < 0).any(axis=1)]

# ----------------------------------------
# (f) Outlier Detection and Removal (IQR)
# ----------------------------------------

def remove_outliers_iqr(data, column, factor=1.5):
    """Return the rows of *data* whose *column* value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1 and Q3 are the 25th and 75th percentiles of the
    column and ``IQR = Q3 - Q1``.  Rows whose value is NaN are dropped,
    because NaN never compares as being inside the fences.

    Args:
        data: DataFrame to filter. It is not modified.
        column: Label of the numeric column to test.
        factor: Multiple of the IQR used to place the fences. Defaults to
            1.5, the value that was previously hard-coded.

    Returns:
        A new DataFrame holding only the rows that were kept.
    """
    values = data[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1

    # Series.between is inclusive on both ends by default, matching >= / <=.
    within_fences = values.between(q1 - factor * iqr, q3 + factor * iqr)

    # Return an explicit copy so callers can assign columns to the result
    # without writing into a slice of the input frame.
    return data[within_fences].copy()

# Apply the IQR filter to each measurement column in turn; every pass works on
# the rows that survived the previous one.
for col in numeric_cols:
    df = remove_outliers_iqr(df, col)

# ----------------------------------------
# (g) Data Transformation
# ----------------------------------------

# Work on an independent copy: the filtering steps above produce slices of
# earlier frames, and assigning columns to a slice can trigger pandas'
# SettingWithCopyWarning.
df = df.copy()

# Encode target column as integer class labels
le = LabelEncoder()
df['variety'] = le.fit_transform(df['variety'])

# Train-test split on the *unscaled* features, so that the scaler below can be
# fitted without ever seeing the held-out rows (avoids data leakage).
X_train, X_test, y_train, y_test = train_test_split(
    df[numeric_cols], df['variety'], test_size=0.2, random_state=42
)

# Normalize feature columns: min/max are learned from the training rows only
# and then applied to both partitions. Test values may therefore fall slightly
# outside [0, 1].
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(
    scaler.transform(X_train), columns=numeric_cols, index=X_train.index
)
X_test = pd.DataFrame(
    scaler.transform(X_test), columns=numeric_cols, index=X_test.index
)

# Keep df itself normalized (with the same training-derived scaling) so that
# any later code reading df still sees scaled features.
df[numeric_cols] = scaler.transform(df[numeric_cols])

# ----------------------------------------
# (h) Model Building & Accuracy Comparison
# ----------------------------------------

# Features and labels
X = df[numeric_cols]
y = df['variety']

# Logistic Regression
log_model = LogisticRegression(max_iter=200)
log_model.fit(X_train, y_train)
log_preds = log_model.predict(X_test)
log_accuracy = accuracy_score(y_test, log_preds)

# Naive Bayes
nb_model = GaussianNB()
nb_model.fit(X_train, y_train)
nb_preds = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_preds)

# Print results
print("\nModel Accuracy Comparison:")
print(f"Logistic Regression Accuracy: {log_accuracy * 100:.2f}%")
print(f"Naive Bayes Accuracy: {nb_accuracy * 100:.2f}%")



# Output of the recorded run:
# Model Accuracy Comparison:
# Logistic Regression Accuracy: 93.33%
# Naive Bayes Accuracy: 93.33%
