In [None]:
#  Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.
# Fill missing salary values with the mean salary of the same department
import pandas as pd
import numpy as np
# Demo frame: two employees (Bob, Charlie) have no recorded salary.
df = pd.DataFrame({
    "employee": ["Alice", "Bob", "Charlie", "David", "Eve"],
    "department": ["HR", "Engineering", "HR", "Engineering", "HR"],
    "salary": [50000, np.nan, np.nan, 70000, 52000]
})

# Per-row mean salary of that row's department (NaN salaries are skipped
# when the mean is computed), aligned to df's index.
department_mean = df.groupby("department")["salary"].transform("mean")

# Only the missing entries take the department mean; known salaries are kept.
df["salary"] = df["salary"].fillna(department_mean)
print(df)


In [None]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.
import pandas as pd
import numpy as np
from scipy import stats
df = pd.DataFrame({
    "salary": [50000, 52000, 51000, 49000, 51500, 120000]
})

# A classical z-score (mean / population std) cannot exceed sqrt(n - 1) in
# absolute value, which is about 2.24 for these 6 rows, so a cut-off of 3
# would never remove anything here. The outlier itself also inflates the
# mean and std it is measured against. Use the robust "modified z-score"
# instead: distance from the median, scaled by the median absolute
# deviation (MAD).
abs_deviation = (df["salary"] - df["salary"].median()).abs()
mad = abs_deviation.median()

# 0.6745 rescales the MAD so the score is comparable to a standard z-score
# for normally distributed data.
# NOTE: if more than half of the values are identical the MAD is 0 and this
# score is undefined; that case does not occur with the data above.
z_scores = 0.6745 * abs_deviation / mad

# 3.5 is the conventional cut-off for the modified z-score.
threshold = 3.5
df_clean = df[z_scores < threshold]
print("Cleaned Data:\n", df_clean)

In [None]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.
import pandas as pd
import numpy as np
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': [25, np.nan, 30, np.nan]
})

# Assign the filled column back instead of calling fillna(inplace=True) on
# the df['Age'] selection: the in-place form operates on an intermediate
# object and does not update df when pandas Copy-on-Write is active, which
# would leave NaNs behind and make the integer cast fail.
age_filled = df['Age'].fillna(df['Age'].median())

# The median of the known ages (25, 30) is 27.5; astype(int) truncates the
# fractional part, so the filled rows become 27.
df['Age'] = age_filled.astype(int)
print(df)

In [None]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.
import pandas as pd
def clean_data(df, fillna_dict=None, remove_duplicates=True):
    """
    Return a cleaned copy of ``df``; the DataFrame passed in is not modified.

    Steps, in order:
    1. Standardize column names to lowercase with spaces replaced by underscores.
    2. Fill missing values column by column from ``fillna_dict``.
    3. Remove duplicate rows (the first occurrence is kept) if requested.

    Parameters:
    - df: DataFrame to be cleaned. Column labels are processed with the
      ``.str`` accessor, so they are expected to be strings.
    - fillna_dict: Optional dictionary mapping a column name to the value used
      for its missing entries. String keys are standardized the same way as
      the column names, so both 'Age' and 'age' address the 'age' column.
      Keys that match no column are ignored.
    - remove_duplicates: Boolean flag to indicate if duplicates should be
      removed (default=True).

    Returns:
    - A new, cleaned DataFrame. Surviving rows keep their original index labels.
    """
    # Work on a copy so the caller's frame keeps its columns and rows.
    cleaned = df.copy()
    cleaned.columns = cleaned.columns.str.lower().str.replace(' ', '_', regex=False)

    if fillna_dict:
        for column, value in fillna_dict.items():
            # Columns were just renamed, so the caller's key must go through
            # the same normalization or it would never match.
            if isinstance(column, str):
                column = column.lower().replace(' ', '_')
            if column in cleaned.columns:
                # Assign back rather than fillna(inplace=True) on a column
                # selection, which does not update the frame under
                # pandas Copy-on-Write.
                cleaned[column] = cleaned[column].fillna(value)

    if remove_duplicates:
        cleaned = cleaned.drop_duplicates()

    return cleaned
# Sample records: Age and Salary contain missing entries, and the fourth
# row repeats the first one (Alice, 25, 50000).
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'Alice', 'David'],
    'Age': [25, None, 30, 25, None],
    'Salary': [50000, 60000, None, 50000, 70000]
}
df = pd.DataFrame(data)

# Replacement values requested for the missing Age and Salary entries.
fill_values = {'Age': 28, 'Salary': 55000}
cleaned_df = clean_data(df, fillna_dict=fill_values)
print("Cleaned DataFrame:\n", cleaned_df)


In [None]:
#  Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.
import pandas as pd
# Single-column demo frame.
df = pd.DataFrame({'Age': [25, 28, 30, 22, 35]})

# Min-max scaling: shift by the column minimum and divide by the value
# range, so the smallest age maps to 0 and the largest to 1.
age_min = df['Age'].min()
age_span = df['Age'].max() - age_min
df['Age_normalized'] = df['Age'].sub(age_min).div(age_span)
print(df)

