In [1]:
import pandas as pd
import numpy as np

def advanced_data_cleaning(df):
    """
    Handles multiple data quality issues in a pandas DataFrame, including:
    - Filling missing values (mean for numeric, mode for categorical).
    - Removing duplicate rows.
    - Handling outliers in numeric columns (IQR method, replace with median).

    Args:
        df: A pandas DataFrame with potential data quality issues.

    Returns:
        A cleaned pandas DataFrame.
    """
    df_cleaned = df.copy()

    # 1. Handle Missing Values
    for col in df_cleaned.columns:
        if df_cleaned[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df_cleaned[col]):
                mean_val = df_cleaned[col].mean()
                df_cleaned[col].fillna(mean_val, inplace=True)
            else:
                mode_val = df_cleaned[col].mode()[0]
                df_cleaned[col].fillna(mode_val, inplace=True)

    # 2. Remove Duplicate Rows
    df_cleaned.drop_duplicates(inplace=True)
    df_cleaned.reset_index(drop=True, inplace=True) # Reset index after removing duplicates

    # 3. Handle Outliers (IQR Method)
    for col in df_cleaned.select_dtypes(include=np.number).columns:
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        median_value = df_cleaned[col].median()

        outliers = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
        df_cleaned.loc[outliers, col] = median_value

    return df_cleaned

# Example Usage:
data = {'numerical_col': [10, 12, 15, np.nan, 18, 20, 12, 100, 15],
        'categorical_col': ['A', 'B', 'A', 'C', 'B', 'A', 'B', 'A', np.nan],
        'id': [1, 2, 3, 4, 5, 6, 2, 7, 8]}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

df_cleaned = advanced_data_cleaning(df)
print("\nCleaned DataFrame:")
print(df_cleaned)

Original DataFrame:
   numerical_col categorical_col  id
0           10.0               A   1
1           12.0               B   2
2           15.0               A   3
3            NaN               C   4
4           18.0               B   5
5           20.0               A   6
6           12.0               B   2
7          100.0               A   7
8           15.0             NaN   8

Cleaned DataFrame:
   numerical_col categorical_col   id
0          10.00               A  1.0
1          12.00               B  2.0
2          15.00               A  3.0
3          25.25               C  4.0
4          18.00               B  5.0
5          20.00               A  6.0
6          16.50               A  7.0
7          15.00               A  8.0


  df_cleaned.loc[outliers, col] = median_value


In [None]:
# Question: Data Transformation Techniques
# Objective: Transform skewed data using log transformation.
# Description: Perform a log transformation to handle skewness in a dataset, which is particularly useful for
# certain machine learning models.



In [2]:
import pandas as pd
import numpy as np

def advanced_data_cleaning(df):
    """
    Handles multiple data quality issues in a pandas DataFrame, including:
    - Filling missing values (mean for numeric, mode for categorical).
    - Removing duplicate rows.
    - Handling outliers in numeric columns (IQR method, replace with median).

    Args:
        df: A pandas DataFrame with potential data quality issues.

    Returns:
        A cleaned pandas DataFrame.
    """
    df_cleaned = df.copy()

    # 1. Handle Missing Values
    for col in df_cleaned.columns:
        if df_cleaned[col].isnull().any():
            if pd.api.types.is_numeric_dtype(df_cleaned[col]):
                mean_val = df_cleaned[col].mean()
                df_cleaned[col].fillna(mean_val, inplace=True)
            else:
                mode_val = df_cleaned[col].mode()[0]
                df_cleaned[col].fillna(mode_val, inplace=True)

    # 2. Remove Duplicate Rows
    df_cleaned.drop_duplicates(inplace=True)
    df_cleaned.reset_index(drop=True, inplace=True) # Reset index after removing duplicates

    # 3. Handle Outliers (IQR Method)
    for col in df_cleaned.select_dtypes(include=np.number).columns:
        Q1 = df_cleaned[col].quantile(0.25)
        Q3 = df_cleaned[col].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - 1.5 * IQR
        upper_bound = Q3 + 1.5 * IQR
        median_value = df_cleaned[col].median()

        outliers = (df_cleaned[col] < lower_bound) | (df_cleaned[col] > upper_bound)
        df_cleaned.loc[outliers, col] = median_value

    return df_cleaned

# Example Usage:
data = {'numerical_col': [10, 12, 15, np.nan, 18, 20, 12, 100, 15],
        'categorical_col': ['A', 'B', 'A', 'C', 'B', 'A', 'B', 'A', np.nan],
        'id': [1, 2, 3, 4, 5, 6, 2, 7, 8]}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

df_cleaned = advanced_data_cleaning(df)
print("\nCleaned DataFrame:")
print(df_cleaned)

Original DataFrame:
   numerical_col categorical_col  id
0           10.0               A   1
1           12.0               B   2
2           15.0               A   3
3            NaN               C   4
4           18.0               B   5
5           20.0               A   6
6           12.0               B   2
7          100.0               A   7
8           15.0             NaN   8

Cleaned DataFrame:
   numerical_col categorical_col   id
0          10.00               A  1.0
1          12.00               B  2.0
2          15.00               A  3.0
3          25.25               C  4.0
4          18.00               B  5.0
5          20.00               A  6.0
6          16.50               A  7.0
7          15.00               A  8.0


  df_cleaned.loc[outliers, col] = median_value


In [3]:
import pandas as pd
import numpy as np
from scipy import stats

def handle_outliers_zscore(df, threshold=3):
    """
    Identifies and handles outliers in a pandas DataFrame using the Z-score method.
    Outliers are defined as values whose absolute Z-score exceeds the given threshold.
    Outliers are replaced with the median of the respective column.

    Args:
        df: A pandas DataFrame with numerical columns.
        threshold: The Z-score threshold. Values with an absolute Z-score
                   greater than this threshold are considered outliers.
                   Default is 3.

    Returns:
        A pandas DataFrame with outliers replaced by the median.
    """
    df_processed = df.copy()

    for col in df_processed.select_dtypes(include=np.number).columns:
        mean_val = df_processed[col].mean()
        std_dev = df_processed[col].std()

        if std_dev == 0:
            print(f"Warning: Standard deviation is zero for column '{col}'. Cannot calculate Z-score.")
            continue

        z_scores = np.abs(stats.zscore(df_processed[col]))
        outlier_indices = np.where(z_scores > threshold)[0]
        median_value = df_processed[col].median()

        # Replace outliers with the median
        df_processed.iloc[outlier_indices, df_processed.columns.get_loc(col)] = median_value

    return df_processed

# Example Usage:
data = {'col1': [10, 12, 15, 18, 20, 25, 30, 35, 40, 100],
        'col2': [5, 7, 9, 11, 13, 15, 17, 19, 21, -10]}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

df_handled = handle_outliers_zscore(df)
print("\nDataFrame after handling outliers (Z-score):")
print(df_handled)

# Example with a different threshold:
df_handled_threshold2 = handle_outliers_zscore(df.copy(), threshold=2)
print("\nDataFrame after handling outliers (Z-score, threshold=2):")
print(df_handled_threshold2)

# Example with a column having zero standard deviation:
data_zero_std = {'col1': [1, 1, 1, 1, 1],
                 'col2': [5, 7, 9, 11, 13]}
df_zero_std = pd.DataFrame(data_zero_std)
df_handled_zero_std = handle_outliers_zscore(df_zero_std.copy())
print("\nDataFrame with zero standard deviation column:")
print(df_handled_zero_std)

Original DataFrame:
   col1  col2
0    10     5
1    12     7
2    15     9
3    18    11
4    20    13
5    25    15
6    30    17
7    35    19
8    40    21
9   100   -10

DataFrame after handling outliers (Z-score):
   col1  col2
0    10     5
1    12     7
2    15     9
3    18    11
4    20    13
5    25    15
6    30    17
7    35    19
8    40    21
9   100   -10

DataFrame after handling outliers (Z-score, threshold=2):
   col1  col2
0  10.0     5
1  12.0     7
2  15.0     9
3  18.0    11
4  20.0    13
5  25.0    15
6  30.0    17
7  35.0    19
8  40.0    21
9  22.5    12

DataFrame with zero standard deviation column:
   col1  col2
0     1     5
1     1     7
2     1     9
3     1    11
4     1    13


  df_processed.iloc[outlier_indices, df_processed.columns.get_loc(col)] = median_value


In [4]:
import pandas as pd
import numpy as np
from scipy import stats

def handle_outliers_zscore(df, threshold=3):
    """
    Identifies and handles outliers in a pandas DataFrame using the Z-score method.
    Outliers are defined as values whose absolute Z-score exceeds the given threshold.
    Outliers are replaced with the median of the respective column.

    Args:
        df: A pandas DataFrame with numerical columns.
        threshold: The Z-score threshold. Values with an absolute Z-score
                   greater than this threshold are considered outliers.
                   Default is 3.

    Returns:
        A pandas DataFrame with outliers replaced by the median.
    """
    df_processed = df.copy()

    for col in df_processed.select_dtypes(include=np.number).columns:
        mean_val = df_processed[col].mean()
        std_dev = df_processed[col].std()

        if std_dev == 0:
            print(f"Warning: Standard deviation is zero for column '{col}'. Cannot calculate Z-score.")
            continue

        z_scores = np.abs(stats.zscore(df_processed[col]))
        outlier_indices = np.where(z_scores > threshold)[0]
        median_value = df_processed[col].median()

        # Replace outliers with the median
        df_processed.iloc[outlier_indices, df_processed.columns.get_loc(col)] = median_value

    return df_processed

# Example Usage:
data = {'col1': [10, 12, 15, 18, 20, 25, 30, 35, 40, 100],
        'col2': [5, 7, 9, 11, 13, 15, 17, 19, 21, -10]}
df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

df_handled = handle_outliers_zscore(df)
print("\nDataFrame after handling outliers (Z-score):")
print(df_handled)

# Example with a different threshold:
df_handled_threshold2 = handle_outliers_zscore(df.copy(), threshold=2)
print("\nDataFrame after handling outliers (Z-score, threshold=2):")
print(df_handled_threshold2)

# Example with a column having zero standard deviation:
data_zero_std = {'col1': [1, 1, 1, 1, 1],
                 'col2': [5, 7, 9, 11, 13]}
df_zero_std = pd.DataFrame(data_zero_std)
df_handled_zero_std = handle_outliers_zscore(df_zero_std.copy())
print("\nDataFrame with zero standard deviation column:")
print(df_handled_zero_std)

Original DataFrame:
   col1  col2
0    10     5
1    12     7
2    15     9
3    18    11
4    20    13
5    25    15
6    30    17
7    35    19
8    40    21
9   100   -10

DataFrame after handling outliers (Z-score):
   col1  col2
0    10     5
1    12     7
2    15     9
3    18    11
4    20    13
5    25    15
6    30    17
7    35    19
8    40    21
9   100   -10

DataFrame after handling outliers (Z-score, threshold=2):
   col1  col2
0  10.0     5
1  12.0     7
2  15.0     9
3  18.0    11
4  20.0    13
5  25.0    15
6  30.0    17
7  35.0    19
8  40.0    21
9  22.5    12

DataFrame with zero standard deviation column:
   col1  col2
0     1     5
1     1     7
2     1     9
3     1    11
4     1    13


  df_processed.iloc[outlier_indices, df_processed.columns.get_loc(col)] = median_value
