In [43]:
# Cell 1: Importing necessary libraries
import pandas as pd  # Used for data manipulation and analysis
import numpy as np  # Used for numerical operations
import matplotlib.pyplot as plt  # Used for data visualization
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import MinMaxScaler

In [2]:
df=pd.read_csv("Iris.csv")  # Load Iris.csv (path relative to the working directory) into a pandas DataFrame

In [3]:
df.head(n=5)  # Preview the first five rows (n=5 is the pandas default)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
df.head(n=2)  # Preview only the first two rows

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa


In [5]:
df.tail(n=5)  # Preview the last five rows (n=5 is the pandas default)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
145,146,6.7,3.0,5.2,2.3,Iris-virginica
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [6]:
df.tail(n=4)  # Preview only the last four rows

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
146,147,6.3,2.5,5.0,1.9,Iris-virginica
147,148,6.5,3.0,5.2,2.0,Iris-virginica
148,149,6.2,3.4,5.4,2.3,Iris-virginica
149,150,5.9,3.0,5.1,1.8,Iris-virginica


In [7]:
df.columns  # Lists the column labels of the DataFrame

Index(['Id', 'SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm',
       'Species'],
      dtype='object')

In [8]:
df.dtypes  # Data type of every column, returned as a Series indexed by column name

Id                 int64
SepalLengthCm    float64
SepalWidthCm     float64
PetalLengthCm    float64
PetalWidthCm     float64
Species           object
dtype: object

In [9]:
df["Species"].dtype  # Data type of the single 'Species' column

dtype('O')

In [10]:
df['Species'].unique()  # Distinct values of the 'Species' column, in order of first appearance

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [11]:
df['Species'].value_counts()  # Number of rows for each distinct species label


Species
Iris-setosa        50
Iris-versicolor    50
Iris-virginica     50
Name: count, dtype: int64

In [None]:
# Data Preprocessing

In [12]:
df.describe()  # Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm
count,150.0,150.0,150.0,150.0,150.0
mean,75.5,5.843333,3.054,3.758667,1.198667
std,43.445368,0.828066,0.433594,1.76442,0.763161
min,1.0,4.3,2.0,1.0,0.1
25%,38.25,5.1,2.8,1.6,0.3
50%,75.5,5.8,3.0,4.35,1.3
75%,112.75,6.4,3.3,5.1,1.8
max,150.0,7.9,4.4,6.9,2.5


In [13]:
df.iloc[50:125]  # Rows at positions 50 through 124 (the end of the slice is exclusive)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
50,51,7.0,3.2,4.7,1.4,Iris-versicolor
51,52,6.4,3.2,4.5,1.5,Iris-versicolor
52,53,6.9,3.1,4.9,1.5,Iris-versicolor
53,54,5.5,2.3,4.0,1.3,Iris-versicolor
54,55,6.5,2.8,4.6,1.5,Iris-versicolor
...,...,...,...,...,...,...
120,121,6.9,3.2,5.7,2.3,Iris-virginica
121,122,5.6,2.8,4.9,2.0,Iris-virginica
122,123,7.7,2.8,6.7,2.0,Iris-virginica
123,124,6.3,2.7,4.9,1.8,Iris-virginica


In [14]:
df.describe(include='all')  # Summary statistics for every column, numeric and non-numeric alike

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
count,150.0,150.0,150.0,150.0,150.0,150
unique,,,,,,3
top,,,,,,Iris-setosa
freq,,,,,,50
mean,75.5,5.843333,3.054,3.758667,1.198667,
std,43.445368,0.828066,0.433594,1.76442,0.763161,
min,1.0,4.3,2.0,1.0,0.1,
25%,38.25,5.1,2.8,1.6,0.3,
50%,75.5,5.8,3.0,4.35,1.3,
75%,112.75,6.4,3.3,5.1,1.8,


In [15]:
df['SepalLengthCm'].median()  # Median of the 'SepalLengthCm' column

5.8

In [16]:
df['SepalLengthCm'].mean()  # Arithmetic mean of the 'SepalLengthCm' column

5.843333333333334

In [17]:
df['Species'].isna().sum()  # Number of missing values in the 'Species' column

0

In [18]:
df['SepalWidthCm'].notna().sum()  # Number of non-missing values in the 'SepalWidthCm' column

150

In [19]:
df.isna().sum()  # Number of missing values in each column

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [20]:
df.describe(include='object')  # Summary (count, unique, top, freq) restricted to object-dtype columns

Unnamed: 0,Species
count,150
unique,3
top,Iris-setosa
freq,50


In [22]:
df.shape  # (number of rows, number of columns)

(150, 6)

In [23]:
df.iloc[125:139,1:3]  # Rows at positions 125-138, columns at positions 1-2 (slice ends are exclusive)

Unnamed: 0,SepalLengthCm,SepalWidthCm
125,7.2,3.2
126,6.2,2.8
127,6.1,3.0
128,6.4,2.8
129,7.2,3.0
130,7.4,2.8
131,7.9,3.8
132,6.4,2.8
133,6.3,2.8
134,6.1,2.6


In [24]:
df.isna().sum()  # Re-check the per-column count of missing values before imputing

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [25]:
# Replace missing petal lengths with the mean of the values that are present.
petal_length_mean = df['PetalLengthCm'].mean()
df['PetalLengthCm'] = df['PetalLengthCm'].fillna(petal_length_mean)

In [26]:
# Replace missing sepal lengths with the mean of the values that are present.
sepal_length_mean = df['SepalLengthCm'].mean()
df['SepalLengthCm'] = df['SepalLengthCm'].fillna(sepal_length_mean)

In [27]:
# Replace missing sepal widths with the mean of the values that are present.
sepal_width_mean = df['SepalWidthCm'].mean()
df['SepalWidthCm'] = df['SepalWidthCm'].fillna(sepal_width_mean)

In [28]:
df.iloc[101:115,1:4]  # Rows at positions 101-114, columns at positions 1-3 (slice ends are exclusive)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm
101,5.8,2.7,5.1
102,7.1,3.0,5.9
103,6.3,2.9,5.6
104,6.5,3.0,5.8
105,7.6,3.0,6.6
106,4.9,2.5,4.5
107,7.3,2.9,6.3
108,6.7,2.5,5.8
109,7.2,3.6,6.1
110,6.5,3.2,5.1


In [29]:
# Discard every row whose 'PetalWidthCm' value is missing; df itself is modified.
df.dropna(
    subset=['PetalWidthCm'],
    how='any',
    inplace=True,
)

In [30]:
df.iloc[95:110]  # Rows at positions 95 through 109 (the end of the slice is exclusive)

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
95,96,5.7,3.0,4.2,1.2,Iris-versicolor
96,97,5.7,2.9,4.2,1.3,Iris-versicolor
97,98,6.2,2.9,4.3,1.3,Iris-versicolor
98,99,5.1,2.5,3.0,1.1,Iris-versicolor
99,100,5.7,2.8,4.1,1.3,Iris-versicolor
100,101,6.3,3.3,6.0,2.5,Iris-virginica
101,102,5.8,2.7,5.1,1.9,Iris-virginica
102,103,7.1,3.0,5.9,2.1,Iris-virginica
103,104,6.3,2.9,5.6,1.8,Iris-virginica
104,105,6.5,3.0,5.8,2.2,Iris-virginica


In [31]:
df.isna().sum()  # Confirm no missing values remain after the dropna step

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          0
dtype: int64

In [32]:
# Encode the species labels as integers: setosa=0, versicolor=1, virginica=2.
# Series.map turns every value that is absent from the mapping into NaN, so
# running this cell a second time on the already-encoded column would replace
# all labels with NaN. The dtype guard makes the cell safe to re-execute.
species_codes = {'Iris-setosa': 0, 'Iris-versicolor': 1, 'Iris-virginica': 2}
if not pd.api.types.is_numeric_dtype(df['Species']):
    df['Species'] = df['Species'].map(species_codes)

In [33]:
df['Species'].unique()  # Distinct values of 'Species' after the integer mapping

array([0, 1, 2], dtype=int64)

In [34]:
from sklearn.preprocessing import LabelEncoder # Importing LabelEncoder from scikit-learn

In [35]:
Label_Encode=LabelEncoder()  # Encoder instance used below to convert the species labels of df1 to integers

In [36]:
df1=pd.read_csv("Iris.csv")  # Reload the raw file: df's 'Species' was already mapped to integers, df1 keeps the original string labels

In [37]:
# Fit the encoder on the species labels of df1 and overwrite the column with the resulting integer codes.
species_labels = df1['Species']
df1['Species'] = Label_Encode.fit_transform(species_labels)

In [38]:
df1['Species'].unique()  # Distinct values of 'Species' in df1 after label encoding

array([0, 1, 2])

In [None]:
# Data Normalization

In [39]:
from sklearn import preprocessing # Importing preprocessing module from scikit-learn

In [40]:
# Min-max scaler that rescales a feature to the [0, 1] range (the library default, spelled out here).
preprocessor = preprocessing.MinMaxScaler(feature_range=(0, 1))

In [41]:
# Fit the scaler on the sepal-length column and overwrite the column with the scaled values.
# The double brackets select a one-column DataFrame, giving the scaler 2-D input.
sepal_length_frame = df[['SepalLengthCm']]
df['SepalLengthCm'] = preprocessor.fit_transform(sepal_length_frame)

In [42]:
df['SepalLengthCm'].unique()  # Distinct values of 'SepalLengthCm' after min-max scaling

array([0.22222222, 0.16666667, 0.11111111, 0.08333333, 0.19444444,
       0.30555556, 0.02777778, 0.13888889, 0.        , 0.41666667,
       0.38888889, 0.25      , 0.33333333, 0.05555556, 0.27777778,
       0.75      , 0.58333333, 0.72222222, 0.61111111, 0.55555556,
       0.63888889, 0.44444444, 0.47222222, 0.5       , 0.36111111,
       0.66666667, 0.52777778, 0.69444444, 0.77777778, 0.91666667,
       0.83333333, 0.80555556, 0.94444444, 0.86111111, 1.        ])