In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the two raw datasets from CSV files located next to the notebook.
iris_path, amazon_path = 'iris.csv', 'amazon.csv'
df_iris = pd.read_csv(iris_path)
df_amazon = pd.read_csv(amazon_path)

In [None]:
# First look at the freshly loaded iris frame (first five rows).
df_iris.head(n=5)


In [None]:
# Peek at five random rows. A fixed random_state makes the displayed rows
# identical on every Restart & Run All instead of changing per execution.
df_iris.sample(5, random_state=42)


In [None]:
# Number of rows per class label.
variety_column = df_iris['variety']
variety_column.value_counts()

In [None]:
# Missing values per column, before imputation.
df_iris.isna().sum()

In [None]:
# Impute missing values: column mean for the numeric measurements, most
# frequent value (mode) for the class label.
#
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on the df_iris[col] selection. The in-place call on a
# selection is chained assignment: pandas 2.x emits a FutureWarning for it,
# and with Copy-on-Write enabled it only updates a temporary object, leaving
# df_iris unchanged.
numeric_features = ['sepal.width', 'sepal.length', 'petal.length', 'petal.width']
for feature in numeric_features:
    df_iris[feature] = df_iris[feature].fillna(df_iris[feature].mean())

# mode() returns a Series of the most frequent value(s); take the first one.
df_iris['variety'] = df_iris['variety'].fillna(df_iris['variety'].mode()[0])

In [None]:
# Missing values per column, after imputation (sanity check).
df_iris.isna().sum()

In [None]:
# Show the dtype of every column in df_iris.
df_iris.dtypes

In [None]:
# Turn the class label into a pandas categorical and keep its integer codes
# in a separate column.
variety_categorical = df_iris['variety'].astype('category')
df_iris['variety'] = variety_categorical
df_iris['variety_encoded'] = variety_categorical.cat.codes

# Show each distinct (label, code) pair once to verify the mapping.
label_code_pairs = df_iris[['variety', 'variety_encoded']].drop_duplicates()
print(label_code_pairs)


In [None]:
from sklearn.preprocessing import OrdinalEncoder

# Same encoding done with scikit-learn's OrdinalEncoder; this overwrites any
# existing 'variety_encoded' column.
enc = OrdinalEncoder()

# The encoder expects 2-D input, hence the single-column DataFrame.
variety_frame = df_iris[['variety']]
df_iris[['variety_encoded']] = enc.fit_transform(variety_frame)

# Store the encoded values as a categorical column.
df_iris['variety_encoded'] = df_iris['variety_encoded'].astype('category')

# Show each distinct (label, code) pair once to verify the mapping.
label_code_pairs = df_iris[['variety', 'variety_encoded']].drop_duplicates()
print(label_code_pairs)


In [None]:
# Inspect ten random rows after encoding. A fixed random_state makes the
# displayed rows identical on every Restart & Run All.
df_iris.sample(10, random_state=42)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# Manual min-max scaling of 'sepal.length': (x - min) / (max - min).
sepal_length = df_iris['sepal.length']
sepal_length_min = sepal_length.min()
sepal_length_max = sepal_length.max()
df_iris['sepal.length_minmax'] = (
    (sepal_length - sepal_length_min) / (sepal_length_max - sepal_length_min)
)

# MaxAbs scaling of the same column via scikit-learn; the scaler takes 2-D
# input, hence the single-column DataFrame.
sepal_length_frame = df_iris[['sepal.length']]
df_iris['sepal.length_maxabs'] = MaxAbsScaler().fit_transform(sepal_length_frame)


In [None]:
# Check the two new scaled 'sepal.length' columns (first five rows).
df_iris.head(n=5)

In [None]:
from sklearn.preprocessing import MaxAbsScaler

# -------------------------------
# Min-Max Normalization (Manual)
# -------------------------------
# Numeric feature columns that each receive a scaled copy.
numerical_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

# (x - min) / (max - min), stored as '<column>_minmax'.
for feature in numerical_cols:
    values = df_iris[feature]
    low, high = values.min(), values.max()
    df_iris[f'{feature}_minmax'] = (values - low) / (high - low)

# -------------------------------
# MaxAbsScaler (Sklearn)
# -------------------------------
# Fit once on all numeric columns; the result is a 2-D array with one column
# per entry of numerical_cols, in the same order.
scaler = MaxAbsScaler()
df_iris_maxabs = scaler.fit_transform(df_iris[numerical_cols])

# Transposing yields one row per feature, so each can be paired with its name
# and stored as '<column>_maxabs'.
for feature, scaled_values in zip(numerical_cols, df_iris_maxabs.T):
    df_iris[f'{feature}_maxabs'] = scaled_values

# -------------------------------
# Display the results
# -------------------------------
minmax_names = [f'{feature}_minmax' for feature in numerical_cols]
maxabs_names = [f'{feature}_maxabs' for feature in numerical_cols]
print(df_iris[minmax_names].head())
print(df_iris[maxabs_names].head())


In [None]:
# Check the frame with all '_minmax' / '_maxabs' columns (first five rows).
df_iris.head(n=5)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Normalise column labels in two steps:
#   1. drop a literal ' (cm)' unit suffix,
#   2. turn space-separated measurement names into snake_case.
# NOTE(review): the other cells access these columns as 'sepal.length',
# 'sepal.width', ... -- if those are the real labels, neither step matches
# anything and this cell leaves df_iris unchanged. Confirm the intended names.
snake_case_names = {
    'sepal length': 'sepal_length',
    'sepal width': 'sepal_width',
    'petal length': 'petal_length',
    'petal width': 'petal_width'
}
labels_without_unit = df_iris.columns.str.replace(' (cm)', '', regex=False)
df_iris.columns = labels_without_unit
df_iris.rename(columns=snake_case_names, inplace=True)

In [None]:
# --------------------------
# Step 2: Plot Before Removing Outliers
# --------------------------
# Use the column labels that actually exist in df_iris. The cells before and
# after this one read the measurements as 'sepal.length', 'sepal.width', ...;
# the underscore spellings ('sepal_length', ...) are not present in the frame,
# so plotting or filtering on them fails with a missing-column error.
outlier_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

fig, axes = plt.subplots(2, 2, figsize=(12, 8))
fig.suptitle('Before removing Outliers', fontsize=16)

# axes.flat walks the 2x2 grid row by row: one box plot per column.
for ax, col in zip(axes.flat, outlier_cols):
    sns.boxplot(data=df_iris, x=col, ax=ax)

plt.tight_layout()
plt.show()

# --------------------------
# Step 3: Outlier Detection and Removal Function
# --------------------------
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies inside the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (inclusive),
    where Q1/Q3 are the 25th/75th percentiles of ``df[column]``. The bounds and
    the number of values outside them are printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        The rows with ``lower <= df[column] <= upper``. Rows where the value is
        missing satisfy neither comparison and are therefore not returned.
    """
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower = Q1 - factor * IQR
    upper = Q3 + factor * IQR

    values = df[column]
    is_outlier = (values < lower) | (values > upper)
    print(f"\n{column.upper()} - Lower bound: {lower}, Upper bound: {upper}")
    print(f"Outliers found in {column}: {int(is_outlier.sum())}")

    return df[(values >= lower) & (values <= upper)]

# --------------------------
# Step 4: Remove Outliers From All Numeric Columns
# --------------------------
# Filter sequentially on a copy so df_iris itself keeps every row; each pass
# computes its fences on the rows that survived the previous pass.
df_cleaned = df_iris.copy()
for col in outlier_cols:
    df_cleaned = remove_outliers(df_cleaned, col)

# --------------------------
# Step 5: Plot After Removing Outliers
# --------------------------
fig, axes = plt.subplots(2, 2)
fig.suptitle('After removing Outliers')

for ax, col in zip(axes.flat, outlier_cols):
    sns.boxplot(data=df_cleaned, x=col, ax=ax)

plt.tight_layout()
plt.show()


In [None]:
# Peek at five random rows of df_iris. A fixed random_state makes the
# displayed rows identical on every Restart & Run All.
df_iris.sample(5, random_state=42)

In [None]:
# Print the sample skewness of each measurement as "<label> <value>".
skew_report = [
    ('sepal_length:', 'sepal.length'),
    ('sepal_width:', 'sepal.width'),
    ('petal_length:', 'petal.length'),
    ('petal_width:', 'petal.width'),
]
for label, source_col in skew_report:
    print(label, df_iris[source_col].skew())

In [None]:
# Histogram + KDE of each measurement, one panel per column, before the
# power transform is applied.
hist_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

fig, axes = plt.subplots(2, 2)
fig.suptitle('Handling Data Skewness')

# axes.flat walks the 2x2 grid row by row, matching the order of hist_cols.
for ax, col in zip(axes.flat, hist_cols):
    sns.histplot(data=df_iris[col], kde=True, ax=ax)

# As in the box-plot cell: keep panel labels from overlapping and end the cell
# on show() so no stray Axes repr is emitted as cell output.
plt.tight_layout()
plt.show()

In [None]:
# Values before the power transform, for comparison (first five rows).
df_iris.head(n=5)

In [None]:
from sklearn.preprocessing import PowerTransformer
# Apply a power transform (with standardisation) to the two sepal columns,
# overwriting the original values in df_iris. The same transformer object is
# re-fitted per column, so afterwards `yeo` holds the fit for 'sepal.width'.
yeo = PowerTransformer(standardize=True)
for skewed_col in ('sepal.length', 'sepal.width'):
    # The transformer takes 2-D input: reshape the column to (n_rows, 1).
    column_2d = df_iris[skewed_col].to_numpy().reshape(-1, 1)
    df_iris[skewed_col] = yeo.fit_transform(column_2d)

In [None]:
# Values after the power transform of the sepal columns (first five rows).
df_iris.head(n=5)

In [None]:
# Histogram + KDE of each measurement, one panel per column, after the sepal
# columns have been power-transformed.
hist_cols = ['sepal.length', 'sepal.width', 'petal.length', 'petal.width']

fig, axes = plt.subplots(2, 2)
fig.suptitle('Handling Data Skewness')

# axes.flat walks the 2x2 grid row by row, matching the order of hist_cols.
for ax, col in zip(axes.flat, hist_cols):
    sns.histplot(data=df_iris[col], kde=True, ax=ax)

# As in the box-plot cell: keep panel labels from overlapping and end the cell
# on show() so no stray Axes repr is emitted as cell output.
plt.tight_layout()
plt.show()