In [10]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

In [11]:
# Read the Iris measurements from the local CSV file into a DataFrame,
# then show the first five rows as a quick sanity check of the parse.
iris = pd.read_csv(filepath_or_buffer="iris.data")
iris.head(n=5)

Unnamed: 0,sepal length,sepal width,petal length,petal width,species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [12]:
# Show the dtype pandas inferred for each column of the loaded frame.
iris.dtypes

sepal length    float64
sepal width     float64
petal length    float64
petal width     float64
species          object
dtype: object

In [13]:
# Summarise the frame: index range, per-column non-null counts, dtypes and memory usage.
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  150 non-null    float64
 1   sepal width   150 non-null    float64
 2   petal length  150 non-null    float64
 3   petal width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [14]:
# Discard every row that has a missing value in at least one column.
iris = iris.dropna(axis=0, how="any")

In [15]:
# Print the frame summary again to compare the non-null counts after dropna().
iris.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   sepal length  150 non-null    float64
 1   sepal width   150 non-null    float64
 2   petal length  150 non-null    float64
 3   petal width   150 non-null    float64
 4   species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [18]:
# The positivity check only makes sense for numeric measurements, so the
# non-numeric columns (e.g. the species label) are left out of the mask.
iris_numeric = iris.select_dtypes(include=[np.number])
# A row survives only when every one of its numeric values is strictly positive.
positive_rows = iris_numeric.gt(0).all(axis=1)
iris = iris.loc[positive_rows]

In [2]:
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import MinMaxScaler

# End-to-end Iris pipeline: load -> clean -> remove outliers -> split ->
# scale -> fit two classifiers -> report test accuracy.

# Load Iris data
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
df['species'] = iris.target

# Every cleaning rule below applies to the measurements only, never to the label.
feature_cols = list(iris.feature_names)

# Simulate dirty data for demo (optional - remove in real use)
# df.loc[0, 'sepal length (cm)'] = -99  # negative value
# df.loc[1, 'sepal width (cm)'] = np.nan  # NA
# df.loc[2, 'petal length (cm)'] = '?'  # Invalid entry

# Clean data
# Coerce features to numeric first: a non-numeric entry such as '?' becomes NaN
# and is dropped with the other missing values, instead of raising a TypeError
# in the `>= 0` comparison below.
df[feature_cols] = df[feature_cols].apply(pd.to_numeric, errors="coerce")
df = df.dropna()
df = df[(df[feature_cols] >= 0).all(axis=1)]  # Remove negative values

# Outlier removal using Z-score
df_z = df[feature_cols].apply(zscore)
df = df[(np.abs(df_z) < 3).all(axis=1)]  # keep data within 3 std dev

# Separate features and target
X = df[feature_cols]
y = df['species']

# Split data BEFORE scaling so the held-out rows cannot influence the scaler.
# The partition depends only on the row count and random_state, so it is the
# same partition as splitting the already-scaled array.
x_train_raw, x_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform features (normalize): learn min/max from the training rows only,
# then apply that same mapping to the test rows.
scaler = MinMaxScaler()
x_train = scaler.fit_transform(x_train_raw)
x_test = scaler.transform(x_test_raw)

# Full feature matrix under the train-fitted scaler, kept for inspection in
# later cells. Values may fall slightly outside [0, 1] for rows whose
# measurements exceed the training range.
X_scaled = scaler.transform(X)

# Logistic Regression
lr = LogisticRegression()
lr.fit(x_train, y_train)
lr_pred = lr.predict(x_test)

# Naive Bayes
nb = GaussianNB()
nb.fit(x_train, y_train)
nb_pred = nb.predict(x_test)

# Accuracy
lr_acc = accuracy_score(y_test, lr_pred)
nb_acc = accuracy_score(y_test, nb_pred)

print("Logistic Regression Accuracy:", lr_acc)
print("Naive Bayes Accuracy:", nb_acc)
# The zscore function calculates the Z-score of each value in a dataset. The Z-score measures how many standard deviations a data point lies from the mean of its feature.
# The formula for the Z-score is:
#
#     z = (x - μ) / σ
#
# where:
# x is the data point,
# μ is the mean of the feature,
# σ is the standard deviation of the feature.
# A high absolute Z-score (e.g., > 3) indicates that the data point is far from the mean and may be an outlier.


Logistic Regression Accuracy: 0.9
Naive Bayes Accuracy: 0.8666666666666667


In [None]:
# Preview only the first rows of the scaled feature matrix rather than dumping the whole array.
# NOTE(review): the output saved under this cell looks like predicted labels followed by a
# `species` Series, not X_scaled - re-run the cell to refresh it.
print(X_scaled[:5])

[1 0 2 1 2 0 1 2 1 1 2 0 0 0 0 1 2 2 2 2 0 2 0 2 2 2 2 2 0 0] 74     1
19     0
118    2
79     1
77     1
32     0
65     1
141    2
69     1
83     1
110    2
12     0
37     0
9      0
20     0
57     1
137    2
70     1
56     1
132    2
30     0
124    2
27     0
128    2
129    2
145    2
111    2
102    2
46     0
31     0
Name: species, dtype: int64
