### Task 1: Handling Missing Values - Simple Imputation
**Description**: Given a dataset with missing values, impute the missing values using the mean for numerical features and the mode for categorical features.

In [1]:
# write your code from here
import pandas as pd
from sklearn.impute import SimpleImputer

def impute_missing_values(df):
    # Separate numerical and categorical columns
    num_cols = df.select_dtypes(include=['number']).columns
    cat_cols = df.select_dtypes(include=['object', 'category']).columns

    # Imputer for numerical columns (mean)
    num_imputer = SimpleImputer(strategy='mean')
    df[num_cols] = num_imputer.fit_transform(df[num_cols])

    # Imputer for categorical columns (mode)
    cat_imputer = SimpleImputer(strategy='most_frequent')
    df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])

    return df

# Example usage
if __name__ == "__main__":
    # Sample dataset with missing values
    data = {
        'Age': [25, 30, None, 22, 28],
        'Income': [50000, None, 60000, 55000, None],
        'Gender': ['Male', None, 'Female', 'Female', 'Male'],
        'City': ['New York', 'Chicago', None, 'Chicago', 'New York']
    }
    df = pd.DataFrame(data)

    print("Before imputation:")
    print(df)

    df_imputed = impute_missing_values(df)

    print("\nAfter imputation:")
    print(df_imputed)


Before imputation:
    Age   Income  Gender      City
0  25.0  50000.0    Male  New York
1  30.0      NaN    None   Chicago
2   NaN  60000.0  Female      None
3  22.0  55000.0  Female   Chicago
4  28.0      NaN    Male  New York

After imputation:
     Age   Income  Gender      City
0  25.00  50000.0    Male  New York
1  30.00  55000.0    None   Chicago
2  26.25  60000.0  Female      None
3  22.00  55000.0  Female   Chicago
4  28.00  55000.0    Male  New York


### Task 2: Feature Scaling - Min-Max Normalization
**Description**: Normalize a numerical feature using Min-Max scaling to a range [0, 1].

In [2]:
# write your code from here
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

def min_max_normalize(df, columns):
    scaler = MinMaxScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Example usage
if __name__ == "__main__":
    # Sample dataset
    data = {
        'Age': [25, 30, 22, 45, 35],
        'Income': [50000, 60000, 55000, 80000, 75000]
    }
    df = pd.DataFrame(data)

    print("Before Min-Max normalization:")
    print(df)

    # Normalize both 'Age' and 'Income' columns
    df_scaled = min_max_normalize(df, ['Age', 'Income'])

    print("\nAfter Min-Max normalization:")
    print(df_scaled)


Before Min-Max normalization:
   Age  Income
0   25   50000
1   30   60000
2   22   55000
3   45   80000
4   35   75000

After Min-Max normalization:
        Age    Income
0  0.130435  0.000000
1  0.347826  0.333333
2  0.000000  0.166667
3  1.000000  1.000000
4  0.565217  0.833333


### Task 3: Handling Missing Values - Drop Missing Values
**Description**: Remove rows with missing values from a dataset.

In [1]:
# write your code from here
import pandas as pd

def drop_missing_rows(df):
    """
    Removes rows with any missing values from the DataFrame.
    """
    df_cleaned = df.dropna()
    return df_cleaned

# Example usage
if __name__ == "__main__":
    # Sample dataset with missing values
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', None],
        'Age': [25, None, 30, 22],
        'City': ['New York', 'Los Angeles', None, 'Chicago']
    }

    df = pd.DataFrame(data)

    print("Before dropping missing values:")
    print(df)

    # Drop rows with any NaN values
    df_clean = drop_missing_rows(df)

    print("\nAfter dropping missing values:")
    print(df_clean)


Before dropping missing values:
      Name   Age         City
0    Alice  25.0     New York
1      Bob   NaN  Los Angeles
2  Charlie  30.0         None
3     None  22.0      Chicago

After dropping missing values:
    Name   Age      City
0  Alice  25.0  New York


### Task 4: Feature Scaling - Standardization
**Description**: Standardize a numerical feature to have zero mean and unit variance.

In [2]:
# write your code from here
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_numerical_features(df, columns):
    """
    Standardizes the specified numerical columns to have zero mean and unit variance.
    """
    scaler = StandardScaler()
    df[columns] = scaler.fit_transform(df[columns])
    return df

# Example usage
if __name__ == "__main__":
    # Sample dataset
    data = {
        'Age': [25, 30, 22, 45, 35],
        'Income': [50000, 60000, 55000, 80000, 75000]
    }
    df = pd.DataFrame(data)

    print("Before standardization:")
    print(df)

    # Apply standardization
    df_standardized = standardize_numerical_features(df, ['Age', 'Income'])

    print("\nAfter standardization:")
    print(df_standardized)


Before standardization:
   Age  Income
0   25   50000
1   30   60000
2   22   55000
3   45   80000
4   35   75000

After standardization:
        Age    Income
0 -0.788742 -1.209416
1 -0.172537 -0.345547
2 -1.158465 -0.777482
3  1.676077  1.382189
4  0.443667  0.950255
