In [2]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Data Load

In [3]:
# Location of the raw CSV inside the Kaggle input directory.
file_path = '/kaggle/input/manipulated-iris-dataset-for-beginners/iris_dirty.csv'

# header=None: treat the first line of the file as data, not as column names;
# the names are assigned in a separate step.
df = pd.read_csv(filepath_or_buffer=file_path, header=None)

# Add Headers

In [4]:
# Short labels for the five columns, in file order
# (presumably sepal length/width, petal length/width, and the species name).
header = ['SpLn', 'SpWd', 'PtLn', 'PtWd', 'Species']

# The frame was read with header=None, so replace its default labels.
# The assignment raises if the number of labels does not match the column count.
df.columns = header
# Preview the first five rows to confirm the labels line up with the data.
df.head()

Unnamed: 0,SpLn,SpWd,PtLn,PtWd,Species
0,5.1,3.5,1.4,2 mm,Iris-setosa
1,4.9,3.0,1.4,2 mm,Iris-setosa
2,4.7,3.2,1.3,2 mm,Iris-setosa
3,4.6,3.1,1.5,2 mm,Iris-setosa
4,5.0,3.6,1.4,2 mm,Iris-setosa


# Add Case ID

In [5]:
# Add a 1-based "Case ID" as the first column so every row carries an
# identifier that is independent of the DataFrame index.
# The membership check keeps the cell re-runnable: DataFrame.insert raises
# ValueError when the column already exists.
if "Case ID" not in df.columns:
    df.insert(0, "Case ID", range(1, len(df) + 1))

# Fixing the PtWd Column (strip the "mm" unit and convert to cm)

In [6]:
# The raw PtWd values are strings carrying a unit suffix (e.g. "2 mm").
# Strip the unit, parse the number, and divide by 10 (mm -> cm conversion).
#
# Done with vectorized string operations instead of a Python loop, and parsed
# as float so a decimal width such as "2.5 mm" does not raise.
# The dtype guard makes the cell safe to re-run: once the column is numeric,
# the conversion is skipped rather than failing or dividing by 10 a second time.
if not pd.api.types.is_numeric_dtype(df['PtWd']):
    df['PtWd'] = (
        df['PtWd']
        .str.replace(r'\s*mm\s*$', '', regex=True)  # drop the trailing unit
        .astype(float)
        / 10
    )

# The original cell exposed the converted values under this name; keep it
# available in case a later cell reads it.
WdList = df['PtWd'].tolist()

# Save Prepared DataFrame

In [7]:
# Persist the frame after the header, Case ID and PtWd fixes.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='iris_clean.csv', index=False)

# Dataset Exploration

In [8]:
# Check the dtype of every column: the measurement columns should be numeric,
# leaving Species as the only object (string) column.

df.dtypes

Case ID      int64
SpLn       float64
SpWd       float64
PtLn       float64
PtWd       float64
Species     object
dtype: object

In [9]:
# Summary statistics for every column. include='all' adds the non-numeric
# Species column, for which count/unique/top/freq are reported instead of
# mean/std/quantiles.

df.describe(include='all')

  has_large_values = (abs_vals > 1e6).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()
  has_small_values = ((abs_vals < 10 ** (-self.digits)) & (abs_vals > 0)).any()


Unnamed: 0,Case ID,SpLn,SpWd,PtLn,PtWd,Species
count,151.0,151.0,150.0,151.0,151.0,151
unique,,,,,,4
top,,,,,,Iris-versicolor
freq,,,,,,51
mean,76.0,6.196689,3.057333,3.764901,1.2,
std,43.734045,4.324325,0.432776,1.760197,0.760789,
min,1.0,4.3,2.0,1.0,0.1,
25%,38.5,5.1,2.8,1.6,0.3,
50%,76.0,5.8,3.0,4.4,1.3,
75%,113.5,6.4,3.3,5.1,1.8,


In [10]:
# Per-column dtype and non-null count; a count below the number of entries
# points to missing values in that column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Case ID  151 non-null    int64  
 1   SpLn     151 non-null    float64
 2   SpWd     150 non-null    float64
 3   PtLn     151 non-null    float64
 4   PtWd     151 non-null    float64
 5   Species  151 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


# index=True

![image.png](attachment:85a99f71-f2c7-4d12-b3cc-23c5d59cea79.png)

![image.png](attachment:230d109f-452f-4ec1-acb1-876ae51c8114.png)

# Detecting Missing Entries

In [11]:
# True if at least one cell anywhere in the frame is missing.

df.isnull().values.any()

True

In [12]:
# Boolean mask with the same shape as df: True marks a missing cell.

missing = df.isnull()

# Display the mask (pandas truncates the middle rows of long frames).
missing

Unnamed: 0,Case ID,SpLn,SpWd,PtLn,PtWd,Species
0,False,False,False,False,False,False
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
...,...,...,...,...,...,...
146,False,False,False,False,False,False
147,False,False,False,False,False,False
148,False,False,False,False,False,False
149,False,False,False,False,False,False


In [13]:
# Per-column tally of missing (True) versus present (False) entries.

for column in missing.columns:
    tally = missing[column].value_counts()
    print(f'{tally} \n')

Case ID
False    151
Name: count, dtype: int64 

SpLn
False    151
Name: count, dtype: int64 

SpWd
False    150
True       1
Name: count, dtype: int64 

PtLn
False    151
Name: count, dtype: int64 

PtWd
False    151
Name: count, dtype: int64 

Species
False    151
Name: count, dtype: int64 



# Dropping Faulty Data

In [14]:
# Discard the rows that have no SpWd value, then renumber the index from 0
# so it is contiguous again. "Case ID" is not renumbered.
df = df.dropna(subset=['SpWd']).reset_index(drop=True)

# Confirm that every column now has the same non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   Case ID  150 non-null    int64  
 1   SpLn     150 non-null    float64
 2   SpWd     150 non-null    float64
 3   PtLn     150 non-null    float64
 4   PtWd     150 non-null    float64
 5   Species  150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [15]:
# Persist the frame after the rows with missing SpWd have been removed.
# index=False leaves the row index out of the file.
df.to_csv(path_or_buf='iris_prepared.csv', index=False)

![image.png](attachment:5b400130-3a02-417c-8d93-301d93064747.png)

# Standardization and Normalization Lecture Note

https://drive.google.com/file/d/1kTcH3Zqyd7K-e-1yuVs-UIPry8wsTSMl/view?usp=drive_link

![ML Class 02 (Mar-11-2025)-2.jpg](attachment:6985703b-6c64-401d-b7d7-cfaef8c39b4b.jpg)

# Normalization

In [16]:
# Column-wise max scaling: divide each measurement column by its own maximum,
# so the largest value in every column becomes 1.
# NOTE(review): in the preview, SpLn 5.1 is scaled to about 0.088, which implies
# a column maximum near 58 - this looks like an outlier compressing the whole
# column; confirm and handle it before relying on the scaled SpLn values.

for feature in ('SpLn', 'SpWd', 'PtLn', 'PtWd'):
    df[feature] = df[feature] / df[feature].max()

df.head(15)

Unnamed: 0,Case ID,SpLn,SpWd,PtLn,PtWd,Species
0,1,0.087931,0.795455,0.202899,0.08,Iris-setosa
1,2,0.084483,0.681818,0.202899,0.08,Iris-setosa
2,3,0.081034,0.727273,0.188406,0.08,Iris-setosa
3,4,0.07931,0.704545,0.217391,0.08,Iris-setosa
4,5,0.086207,0.818182,0.202899,0.08,Iris-setosa
5,6,0.093103,0.886364,0.246377,0.16,Iris-setosa
6,7,0.07931,0.772727,0.202899,0.12,Iris-setosa
7,8,0.086207,0.772727,0.217391,0.08,Iris-setosa
8,9,0.075862,0.659091,0.202899,0.08,Iris-setosa
9,10,0.084483,0.704545,0.217391,0.04,Iris-setosa
