In [140]:
import numpy as np
import pandas as pd

In [141]:
df = pd.read_csv('iris_dataset.csv')
df

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [142]:
# df = pd.read_csv('Iris.csv',header=None)
# df.head()
df.columns = ['SepalLengthCm','SepalWidthCm','PetalLengthCm','PetalWidthCm','Species']
df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [143]:
df['Species'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

# Data cleaning(Remove NA, ?, Negative values etc.)

In [144]:
df.shape

(150, 5)

In [145]:
df[df== '?'].sum()

SepalLengthCm    0.0
SepalWidthCm     0.0
PetalLengthCm    0.0
PetalWidthCm     0.0
Species            0
dtype: object

In [146]:
df.isnull().sum()

SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [147]:
df = df.replace('?', np.nan)
df = df.dropna()

In [148]:
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


# Data Transformation

In [149]:
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [150]:
le = LabelEncoder()
le.fit(df['Species'])
le.classes_

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [151]:
df['Species'] = le.transform(df['Species'])

In [152]:
df

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


# f. Error correcting(Outlier detection and removal)

In [153]:
def remove_outlier(data):
    z_scores = np.abs((data - data.mean()) / data.std())
    out = z_scores <= 3  # Use <= instead of >
    data = data[out.all(axis=1)]  # Use out.all(axis=1) to remove rows where all elements are True
    return data

filtered_data = remove_outlier(df)

In [154]:
filtered_data.shape

(149, 5)

# h. Build Data model using regression and Naïve Bayes methods and compare accuracy of Iris Species Prediction.

In [155]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [156]:

# Split the data into training and testing sets
X = filtered_data.drop('Species', axis=1)
y = filtered_data['Species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=3, random_state=42)

In [157]:

# Train the logistic regression model
logistic_regression_model = LogisticRegression()
logistic_regression_model.fit(X_train, y_train)


In [158]:
# Train the Naive Bayes model
naive_bayes_model = GaussianNB()
naive_bayes_model.fit(X_train, y_train)

In [159]:

# Predict the species using both models
y_pred_logistic_regression = logistic_regression_model.predict(X_test)
y_pred_naive_bayes = naive_bayes_model.predict(X_test)

In [160]:

# Calculate the accuracy of both models
accuracy_logistic_regression = accuracy_score(y_test, y_pred_logistic_regression)
accuracy_naive_bayes = accuracy_score(y_test, y_pred_naive_bayes)


In [161]:
# Compare the accuracies of the two models
print("Accuracy - Logistic Regression : ", accuracy_logistic_regression)
print("Accuracy - Naïve Bayes : ", accuracy_naive_bayes)

Accuracy - Logistic Regression :  1.0
Accuracy - Naïve Bayes :  1.0
