In [None]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.



In [None]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.



In [None]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.



In [None]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.



In [None]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.



In [1]:
import pandas as pd
import numpy as np
from scipy import stats

# Sample DataFrame: seven records with two missing ages plus one extreme
# Age (300) and one extreme Score (1000) for the cleaning steps to work on.
data = dict(
    ID=list(range(1, 8)),
    Age=[25, np.nan, 35, 45, 300, 22, np.nan],
    Group=list('AABBBAB'),
    Score=[55, 65, 85, 45, 1000, 70, 60],
)
df = pd.DataFrame(data)

# Question 1: Handling Missing Values with Conditional Filling
def fill_missing_by_condition(df, target_col, condition_col, condition_val, fill_value):
    """Fill missing ``target_col`` values on rows where ``condition_col`` matches.

    Only rows that satisfy both conditions are touched: ``target_col`` is
    null and ``condition_col`` equals ``condition_val``.

    Args:
        df: Input DataFrame. It is not modified.
        target_col: Name of the column whose missing values are filled.
        condition_col: Name of the column the condition is evaluated on.
        condition_val: Value ``condition_col`` must equal for a row to be filled.
        fill_value: Replacement written into the selected cells.

    Returns:
        A new DataFrame with the selected cells filled.

    Raises:
        KeyError: If ``target_col`` or ``condition_col`` is not a column of ``df``.
    """
    # Work on a copy: assigning through .loc on the argument itself would
    # silently rewrite the caller's frame as a side effect.
    result = df.copy()
    needs_fill = result[target_col].isna() & (result[condition_col] == condition_val)
    result.loc[needs_fill, target_col] = fill_value
    return result

# Group 'A' rows with no recorded Age receive the placeholder value 30;
# missing ages in other groups are left untouched at this step.
df = fill_missing_by_condition(df, 'Age', 'Group', 'A', 30)
print("After Conditional Filling:\n", df)

# Question 2: Removing Outliers by Rescaling using Z-score
def remove_outliers_zscore(df, column, z_thresh=3):
    """Return a copy of ``df`` without rows whose ``column`` is a z-score outlier.

    Missing values in ``column`` are replaced by the column mean before
    scoring, so those rows score z == 0 and are always kept. A row is dropped
    when the absolute z-score of its value is ``>= z_thresh``.

    Note:
        With the population standard deviation, a sample of n points can never
        produce an absolute z-score above sqrt(n - 1). For very small frames
        (n <= 10 with the default threshold) no row can be flagged, however
        extreme it is; lower ``z_thresh`` in that case.

    Args:
        df: Input DataFrame. It is not modified.
        column: Name of the numeric column to screen.
        z_thresh: Absolute z-score at or above which a row is discarded.

    Returns:
        A new DataFrame holding only the rows that were kept.
    """
    values = df[column]
    filled = values.fillna(values.mean())

    # A zero spread (constant column) or NaN spread (empty / all-missing
    # column) makes every z-score NaN. There is no evidence of outliers in
    # that case, so hand back the frame unchanged instead of dropping rows.
    spread = filled.std(ddof=0)
    if not spread > 0:
        return df.copy()

    z_scores = np.abs(stats.zscore(filled))
    # Written as "not >=" rather than "<" so a NaN score never discards a row.
    keep = ~(z_scores >= z_thresh)
    return df[keep].copy()

# Screen the 'Score' column using the function's default threshold of 3.
df = remove_outliers_zscore(df, column='Score')
print("\nAfter Removing Outliers:\n", df)

# Question 3: Applying Data Type Conversion
# Any age still missing takes the column mean; the integer cast then
# truncates the fractional part of that mean.
age_mean = df['Age'].mean()
df['Age'] = df['Age'].fillna(age_mean).astype(int)
print("\nAfter Age Conversion:\n", df)

# Question 4: Automating Data Cleaning with Functions
def clean_data(df):
    """Apply the standard cleaning steps and return the cleaned frame.

    Steps, in order:
      1. Standardize column names (lowercase, surrounding whitespace removed).
      2. Fill missing ``age`` with 30 on rows whose ``group`` is 'A'
         (example rule).
      3. Remove duplicate rows.

    Column names are standardized first so the fill rule can address
    ``age`` / ``group`` however the input spelled its headers
    ('Age', 'AGE', ' age ', ...).

    Args:
        df: Input DataFrame with string column labels. It is not modified.

    Returns:
        A new, cleaned DataFrame with standardized column names.

    Raises:
        KeyError: If the frame has no ``age`` or ``group`` column after the
            names are standardized.
    """
    # Copy up front so neither the rename nor the fill leaks into the
    # caller's frame.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.lower().str.strip()
    cleaned = fill_missing_by_condition(cleaned, 'age', 'group', 'A', 30)
    return cleaned.drop_duplicates()

# Run the whole cleaning routine in one step.
df = df.pipe(clean_data)
print("\nAfter Automated Cleaning:\n", df)

# Question 5: Complex Data Normalization (Min-Max Scaling)
def min_max_normalize(series, feature_range=(0.0, 1.0)):
    """Rescale ``series`` linearly so its minimum and maximum hit ``feature_range``.

    Args:
        series: Numeric pandas Series. Missing values are ignored when the
            minimum and maximum are computed and stay missing in the result.
        feature_range: ``(low, high)`` bounds of the output. The default
            ``(0.0, 1.0)`` gives classic min-max scaling.

    Returns:
        A float Series on the same index. When every non-missing value is
        identical, there is no spread to scale, so each of them maps to
        ``low``.
    """
    low, high = feature_range
    smallest = series.min()
    span = series.max() - smallest

    if span == 0:
        # Constant input: dividing by the zero span would turn every entry
        # into NaN, so pin the unit-scaled values at 0 instead.
        unit = (series - smallest).astype(float)
    else:
        unit = (series - smallest) / span
    return unit * (high - low) + low

# Store the rescaled scores in their own column beside the raw 'score'.
normalized_scores = min_max_normalize(df['score'])
df['score_normalized'] = normalized_scores
print("\nAfter Min-Max Normalization:\n", df)


After Conditional Filling:
    ID    Age Group  Score
0   1   25.0     A     55
1   2   30.0     A     65
2   3   35.0     B     85
3   4   45.0     B     45
4   5  300.0     B   1000
5   6   22.0     A     70
6   7    NaN     B     60

After Removing Outliers:
    ID    Age Group  Score
0   1   25.0     A     55
1   2   30.0     A     65
2   3   35.0     B     85
3   4   45.0     B     45
4   5  300.0     B   1000
5   6   22.0     A     70
6   7    NaN     B     60

After Age Conversion:
    ID  Age Group  Score
0   1   25     A     55
1   2   30     A     65
2   3   35     B     85
3   4   45     B     45
4   5  300     B   1000
5   6   22     A     70
6   7   76     B     60

After Automated Cleaning:
    id  age group  score
0   1   25     A     55
1   2   30     A     65
2   3   35     B     85
3   4   45     B     45
4   5  300     B   1000
5   6   22     A     70
6   7   76     B     60

After Min-Max Normalization:
   id  age group  score  score_normalized
0   1   25     A     55          0.010471
1   2   30     A     65          0.020942
2   3   35     B     85          0.041885
3   4   45     B     45          0.000000
4   5  300     B   1000          1.000000
5   6   22     A     70          0.026178
6   7   76     B     60          0.015707
