# Assignment 01: 
1. Load the Iris dataset into a pandas DataFrame
2. Find the mean and median of the 'sepal_length' column.
3. Calculate the 75th percentile of the 'petal_width' column for each species in the Iris dataset.
4. Create a new column in the Iris DataFrame called 'sepal_area', which is the product of 'sepal_length' and 'sepal_width'.
5. Remove all rows in the Iris DataFrame where 'petal_length' is greater than twice the standard deviation of 'petal_length' for that species.
6. Normalize all numerical columns in the Iris DataFrame (except the 'species' column) using Min-Max scaling.
7. Find the three most common combinations of 'sepal_length', 'sepal_width', and 'petal_length' in the Iris dataset.
8. Group the Iris DataFrame by 'species' and find the row with the highest 'sepal_width' for each group.
9. Replace all negative values in the 'petal_width' column of the Iris DataFrame with the mean of the non-negative values in that column.
10. Calculate the correlation matrix for the 'sepal_length', 'sepal_width', 'petal_length', and 'petal_width' columns in the Iris dataset and find the feature with the highest absolute correlation with 'petal_width'.


### Load the Iris dataset into a pandas DataFrame

In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv("iris.csv")

In [3]:
df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


### Find the mean and median of the 'sepal_length' column.

In [5]:
mean_sepal_length = df['sepal_length'].mean()

In [6]:
mean_sepal_length

5.843333333333334

### Calculate the 75th percentile of the 'petal_width' column for each species in the Iris dataset.

In [20]:
petal_width_per= df.groupby('species')['petal_width'].quantile(0.75)

In [21]:
petal_width_per

species
setosa        0.3
versicolor    1.5
virginica     2.3
Name: petal_width, dtype: float64

### Create a new column in the Iris DataFrame called 'sepal_area', which is the product of 'sepal_length' and 'sepal_width'.

In [23]:
df['sepal_area'] = df['sepal_length']*df['sepal_width']

In [24]:
df['sepal_area']

0      17.85
1      14.70
2      15.04
3      14.26
4      18.00
       ...  
145    20.10
146    15.75
147    19.50
148    21.08
149    17.70
Name: sepal_area, Length: 150, dtype: float64

In [25]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
0,5.1,3.5,1.4,0.2,setosa,17.85
1,4.9,3.0,1.4,0.2,setosa,14.7
2,4.7,3.2,1.3,0.2,setosa,15.04
3,4.6,3.1,1.5,0.2,setosa,14.26
4,5.0,3.6,1.4,0.2,setosa,18.0


### Remove all rows in the Iris DataFrame where 'petal_length' is greater than twice the standard deviation of 'petal_length' for that species.

In [28]:
std_dev = df.groupby('species')['petal_length'].transform('std')

In [29]:
std_dev

0      0.173511
1      0.173511
2      0.173511
3      0.173511
4      0.173511
         ...   
145    0.551895
146    0.551895
147    0.551895
148    0.551895
149    0.551895
Name: petal_length, Length: 150, dtype: float64

In [36]:
filter_df = df[df['petal_length'] <= 2* std_dev]

In [37]:
filter_df.reset_index(drop=True)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area


### Normalize all numerical columns in the Iris DataFrame (except the 'species' column) using Min-Max scaling.

In [39]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
0,5.1,3.5,1.4,0.2,setosa,17.85
1,4.9,3.0,1.4,0.2,setosa,14.7
2,4.7,3.2,1.3,0.2,setosa,15.04
3,4.6,3.1,1.5,0.2,setosa,14.26
4,5.0,3.6,1.4,0.2,setosa,18.0


In [43]:
numerical_columns = df.select_dtypes(include = ['int64' , 'float64']).columns.difference(['species'])

In [44]:
numerical_columns

Index(['petal_length', 'petal_width', 'sepal_area', 'sepal_length',
       'sepal_width'],
      dtype='object')

In [45]:
from sklearn.preprocessing import MinMaxScaler

In [46]:
scalar = MinMaxScaler()

In [50]:
df[numerical_columns] = scalar.fit_transform(df[numerical_columns])

In [53]:
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
0,0.222222,0.625,0.067797,0.041667,setosa,0.392108
1,0.166667,0.416667,0.067797,0.041667,setosa,0.234765
2,0.111111,0.5,0.050847,0.041667,setosa,0.251748
3,0.083333,0.458333,0.084746,0.041667,setosa,0.212787
4,0.194444,0.666667,0.067797,0.041667,setosa,0.3996


### Find the three most common combinations of 'sepal_length', 'sepal_width', and 'petal_length' in the Iris dataset.

In [55]:
most_common_combinaitons = df.groupby(['sepal_length','sepal_width','petal_length']).size().nlargest(3)

In [56]:
most_common_combinaitons

sepal_length  sepal_width  petal_length
0.166667      0.458333     0.084746        3
0.138889      0.416667     0.067797        2
0.222222      0.625000     0.067797        2
dtype: int64

### Group the Iris DataFrame by 'species' and find the row with the highest 'sepal_width' for each group.

In [59]:
highest_sepal_width_rows = df.loc[df.groupby('species')['sepal_width'].idxmax()]

In [60]:
highest_sepal_width_rows

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species,sepal_area
15,0.388889,1.0,0.084746,0.125,setosa,0.753247
85,0.472222,0.583333,0.59322,0.625,versicolor,0.519481
117,0.944444,0.75,0.966102,0.875,virginica,0.962038


### Replace all negative values in the 'petal_width' column of the Iris DataFrame with the mean of the non-negative values in that column.

In [62]:
non_negative_mean = df.loc[df['petal_width'] >0 , 'petal_width'].mean()

In [63]:
non_negative_mean 

0.47685185185185186

In [65]:
df['petal_width'] = df['petal_width'].apply(lambda x : non_negative_mean if x < 0 else x)

### Calculate the correlation matrix for the 'sepal_length', 'sepal_width', 'petal_length', and 'petal_width' columns in the Iris dataset and find the feature with the highest absolute correlation with 'petal_width'.

In [68]:
correlation_matrix = df[['sepal_length', 'sepal_width', 'petal_length','petal_width' ]].corr()

In [69]:
correlation_matrix 

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width
sepal_length,1.0,-0.109369,0.871754,0.817954
sepal_width,-0.109369,1.0,-0.420516,-0.356544
petal_length,0.871754,-0.420516,1.0,0.962757
petal_width,0.817954,-0.356544,0.962757,1.0


In [70]:
highest_abs_correlation = correlation_matrix.abs().idxmax()

In [71]:
highest_abs_correlation 

sepal_length    sepal_length
sepal_width      sepal_width
petal_length    petal_length
petal_width      petal_width
dtype: object