##Importing Libraries & Loading Data


In [4]:
import pandas as pd
#Load the Iris dataset
df=pd.read_csv("Iris.csv")
#display the first 5 rows
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1.0,5.1,3.5,1.4,0.2,Iris-setosa
1,0.9,3.0,1.4,0.2,Iris-setosa,
2,3.0,4.7,3.2,1.3,0.2,Iris-setosa
3,4.0,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,5.0,3.6,,0.2,Iris-setosa


##Basic Data Inspection

In [5]:
#Get a quick summary of the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    float64
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  148 non-null    float64
 4   PetalWidthCm   150 non-null    object 
 5   Species        149 non-null    object 
dtypes: float64(4), object(2)
memory usage: 7.2+ KB


In [6]:
# Get statistical summary for numerical columns
df.describe()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm
count,150.0,150.0,150.0,148.0
mean,75.492667,5.830667,3.043333,3.781757
std,43.457949,0.856632,0.454126,1.769393
min,0.9,3.0,1.4,0.2
25%,38.25,5.1,2.8,1.6
50%,75.5,5.8,3.0,4.4
75%,112.75,6.4,3.3,5.1
max,150.0,7.9,4.4,6.9


In [7]:
#Check for missing values
df.isnull().sum()

Unnamed: 0,0
Id,0
SepalLengthCm,0
SepalWidthCm,0
PetalLengthCm,2
PetalWidthCm,0
Species,1


In [8]:
# List all unique species
df['Species'].unique()

array(['Iris-setosa', nan, 'Iris-versicolor', 'Iris-virginica'],
      dtype=object)

In [9]:
# Count how many samples per species
df['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
Iris-versicolor,50
Iris-virginica,50
Iris-setosa,49


##Data Cleaning

In [11]:
# Option 1: Fill missing values with the mean of the column
df['PetalLengthCm'].fillna(df['PetalLengthCm'].mean(), inplace=True)
#Verify no more missing values
print(df.isnull().sum())

Id               0
SepalLengthCm    0
SepalWidthCm     0
PetalLengthCm    0
PetalWidthCm     0
Species          1
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PetalLengthCm'].fillna(df['PetalLengthCm'].mean(), inplace=True)


##Selecting & Filtering Data

In [15]:
#Select a single column (returns a Series)
sepal_lengths = df['SepalLengthCm']
print(sepal_lengths)


0      5.1
1      3.0
2      4.7
3      4.6
4      5.0
      ... 
145    6.7
146    6.3
147    6.5
148    6.2
149    5.9
Name: SepalLengthCm, Length: 150, dtype: float64


In [16]:
# Select multiple columns (returns a DataFrame)
measurements = df[['SepalLengthCm', 'SepalWidthCm', 'PetalLengthCm', 'PetalWidthCm']]
print(measurements)


     SepalLengthCm  SepalWidthCm  PetalLengthCm PetalWidthCm
0              5.1           3.5       1.400000          0.2
1              3.0           1.4       0.200000  Iris-setosa
2              4.7           3.2       1.300000          0.2
3              4.6           3.1       1.500000          0.2
4              5.0           3.6       3.781757          0.2
..             ...           ...            ...          ...
145            6.7           3.0       5.200000          2.3
146            6.3           2.5       5.000000          1.9
147            6.5           3.0       5.200000          2.0
148            6.2           3.4       5.400000          2.3
149            5.9           3.0       5.100000          1.8

[150 rows x 4 columns]


In [17]:
# Filter rows: Get all Iris-setosa samples
setosa = df[df['Species'] == 'Iris-setosa']
print(setosa)


      Id  SepalLengthCm  SepalWidthCm  PetalLengthCm PetalWidthCm      Species
0    1.0            5.1           3.5       1.400000          0.2  Iris-setosa
2    3.0            4.7           3.2       1.300000          0.2  Iris-setosa
3    4.0            4.6           3.1       1.500000          0.2  Iris-setosa
4    5.0            5.0           3.6       3.781757          0.2  Iris-setosa
5    6.0            5.4           3.9       1.700000          0.4  Iris-setosa
6    7.0            4.6           3.4       1.400000          0.3  Iris-setosa
7    8.0            5.0           3.4       1.500000          0.2  Iris-setosa
8    9.0            4.4           2.9       1.400000          0.2  Iris-setosa
9   10.0            4.9           3.1       1.500000          0.1  Iris-setosa
10  11.0            5.4           3.7       3.781757          0.2  Iris-setosa
11  12.0            4.8           3.4       1.600000          0.2  Iris-setosa
12  13.0            4.8           3.0       1.400000

In [18]:
# Filter rows with multiple conditions: Long and wide sepals
big_flowers = df[(df['SepalLengthCm'] > 6.5) & (df['SepalWidthCm'] > 3.2)]
print(big_flowers)


        Id  SepalLengthCm  SepalWidthCm  PetalLengthCm PetalWidthCm  \
109  110.0            7.2           3.6            6.1          2.5   
117  118.0            7.7           3.8            6.7          2.2   
124  125.0            6.7           3.3            5.7          2.1   
131  132.0            7.9           3.8            6.4          2.0   
144  145.0            6.7           3.3            5.7          2.5   

            Species  
109  Iris-virginica  
117  Iris-virginica  
124  Iris-virginica  
131  Iris-virginica  
144  Iris-virginica  


In [19]:
# Use query() for cleaner syntax
versicolor = df.query('Species == "Iris-versicolor"')
print(versicolor)

       Id  SepalLengthCm  SepalWidthCm  PetalLengthCm PetalWidthCm  \
50   51.0            7.0           3.2            4.7          1.4   
51   52.0            6.4           3.2            4.5          1.5   
52   53.0            6.9           3.1            4.9          1.5   
53   54.0            5.5           2.3            4.0          1.3   
54   55.0            6.5           2.8            4.6          1.5   
55   56.0            5.7           2.8            4.5          1.3   
56   57.0            6.3           3.3            4.7          1.6   
57   58.0            4.9           2.4            3.3          1.0   
58   59.0            6.6           2.9            4.6          1.3   
59   60.0            5.2           2.7            3.9          1.4   
60   61.0            5.0           2.0            3.5          1.0   
61   62.0            5.9           3.0            4.2          1.5   
62   63.0            6.0           2.2            4.0          1.0   
63   64.0           

##Grouping & Aggregation

In [20]:
# Group by species and calculate mean for all numerical columns
species_summary = df.groupby('Species').mean(numeric_only=True)
print(species_summary)

                         Id  SepalLengthCm  SepalWidthCm  PetalLengthCm
Species                                                                
Iris-setosa       25.979592       5.008163      3.426531        1.56048
Iris-versicolor   75.500000       5.936000      2.770000        4.26000
Iris-virginica   125.500000       6.588000      2.974000        5.55200


In [21]:
# Get more detailed stats: min, max, mean for sepal length by species
detailed_stats = df.groupby('Species')['SepalLengthCm'].agg(['min', 'max', 'mean'])
print(detailed_stats)

                 min  max      mean
Species                            
Iris-setosa      4.3  5.8  5.008163
Iris-versicolor  4.9  7.0  5.936000
Iris-virginica   4.9  7.9  6.588000


In [22]:
# Count the number of samples per species (again, for clarity)
df.groupby('Species').size()

Unnamed: 0_level_0,0
Species,Unnamed: 1_level_1
Iris-setosa,49
Iris-versicolor,50
Iris-virginica,50


##Sorting Data

In [24]:
# Sort by Sepal Length (ascending)
df_sorted = df.sort_values('SepalLengthCm')
print(df_sorted)


        Id  SepalLengthCm  SepalWidthCm  PetalLengthCm PetalWidthCm  \
1      0.9            3.0           1.4            0.2  Iris-setosa   
13    14.0            4.3           3.0            1.1          0.1   
8      9.0            4.4           2.9            1.4          0.2   
38    39.0            4.4           3.0            1.3          0.2   
42    43.0            4.4           3.2            1.3          0.2   
..     ...            ...           ...            ...          ...   
122  123.0            7.7           2.8            6.7          2.0   
117  118.0            7.7           3.8            6.7          2.2   
118  119.0            7.7           2.6            6.9          2.3   
135  136.0            7.7           3.0            6.1          2.3   
131  132.0            7.9           3.8            6.4          2.0   

            Species  
1               NaN  
13      Iris-setosa  
8       Iris-setosa  
38      Iris-setosa  
42      Iris-setosa  
..             

In [25]:
# Sort by Species, then by Petal Length (descending)
df_sorted = df.sort_values(['Species', 'PetalLengthCm'], ascending=[True, False])
print(df_sorted)

        Id  SepalLengthCm  SepalWidthCm  PetalLengthCm PetalWidthCm  \
4      5.0            5.0           3.6       3.781757          0.2   
10    11.0            5.4           3.7       3.781757          0.2   
24    25.0            4.8           3.4       1.900000          0.2   
44    45.0            5.1           3.8       1.900000          0.4   
5      6.0            5.4           3.9       1.700000          0.4   
..     ...            ...           ...            ...          ...   
127  128.0            6.1           3.0       4.900000          1.8   
126  127.0            6.2           2.8       4.800000          1.8   
138  139.0            6.0           3.0       4.800000          1.8   
106  107.0            4.9           2.5       4.500000          1.7   
1      0.9            3.0           1.4       0.200000  Iris-setosa   

            Species  
4       Iris-setosa  
10      Iris-setosa  
24      Iris-setosa  
44      Iris-setosa  
5       Iris-setosa  
..             

##Advanced Analysis & Visualization Prep

In [31]:
# Make sure missing values are handled (fill NaN in PetalLengthCm)
df['PetalLengthCm'].fillna(df['PetalLengthCm'].mean(), inplace=True)

# Create PetalSize column based on PetalLengthCm
df['PetalSize'] = df['PetalLengthCm'].apply(
    lambda x: 'Small' if x < 3.0 else ('Medium' if x < 5.0 else 'Large')
)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['PetalLengthCm'].fillna(df['PetalLengthCm'].mean(), inplace=True)


##Exporting your Cleaned?Analyzed Data

In [30]:
# Save the cleaned DataFrame to a new CSV file
df.to_csv('Iris_Cleaned.csv', index=False)
