In [1]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.

import pandas as pd
import numpy as np

# Sample data: one animal label per row in 'A'; two of the readings in 'B' are missing
data = dict(
    A=['cat', 'dog', 'cat', 'dog', 'bird'],
    B=[10, np.nan, 15, np.nan, 5],
)
df = pd.DataFrame(data)

# Define a function for conditional filling
def fill_based_on_condition(row, match_value='dog', fill_value=20):
    """
    Return the value that column 'B' should hold for a single row.

    A missing 'B' is replaced by ``fill_value`` only when the row's 'A'
    equals ``match_value``. In every other case the current 'B' is returned
    unchanged, so a missing value in a non-matching row stays missing.

    Parameters:
        row (pd.Series or mapping): One row exposing the keys 'A' and 'B'.
        match_value (optional): Value of 'A' that triggers the fill. Defaults to 'dog'.
        fill_value (optional): Replacement for a missing 'B' in a matching row. Defaults to 20.

    Returns:
        ``fill_value`` for a matching row with a missing 'B'; otherwise the row's own 'B'.
    """
    current = row['B']
    # 'A' is only inspected when 'B' is actually missing (short-circuit `and`).
    if pd.isna(current) and row['A'] == match_value:
        return fill_value
    return current

# Run the filler over every row (axis=1 passes each row to the function)
# and store the values it returns as the new column 'B'.
df = df.assign(B=lambda frame: frame.apply(fill_based_on_condition, axis=1))

print(df)


      A     B
0   cat  10.0
1   dog  20.0
2   cat  15.0
3   dog  20.0
4  bird   5.0


In [None]:
# Question 2: Removing Outliers by Rescaling
# Description: Standardize a numerical column using z-scores, then remove the rows whose z-score marks them as outliers.
# TODO: this cell has no implementation yet.



In [2]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.

import pandas as pd
import numpy as np

# Sample data: two of the four ages are unknown
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[25, np.nan, 30, np.nan],
)
df = pd.DataFrame(data)

# Fill the missing ages with a default (0 here; df['Age'].mean() is an alternative)
# and convert the column to integer type in a single chained expression.
df['Age'] = df['Age'].fillna(0).astype(int)

print(df)


      Name  Age
0    Alice   25
1      Bob    0
2  Charlie   30
3    David    0


In [3]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.

import pandas as pd

def _standardize_column_name(name):
    """Strip, lowercase, and underscore a string label; return any other label unchanged."""
    if not isinstance(name, str):
        return name
    return name.strip().lower().replace(' ', '_')


def clean_data(df, fillna_dict=None, fillna_value=None):
    """
    Automates data cleaning, in this order:
    1. Fill missing values per column or with a single default value.
    2. Remove duplicate rows (the surviving rows keep their original index labels).
    3. Standardize column names (trimmed, lowercase, underscores instead of spaces).

    The input dataframe is not modified; all work happens on a copy.

    Parameters:
        df (pd.DataFrame): Input dataframe to clean.
        fillna_dict (dict, optional): Dictionary of {column: fill_value} to fill missing values.
            Keys refer to the column names as they are BEFORE standardization;
            keys that are not columns of ``df`` are ignored.
        fillna_value (scalar, optional): Default fill value for all missing values.
            Only used when ``fillna_dict`` is None; it is ignored otherwise.

    Returns:
        pd.DataFrame: Cleaned dataframe. Column labels that are not strings
        (e.g. integers) are kept as they are.
    """
    df = df.copy()

    # Fill missing values: a per-column mapping takes precedence over the global default.
    if fillna_dict is not None:
        for col, val in fillna_dict.items():
            if col in df.columns:
                df[col] = df[col].fillna(val)
    elif fillna_value is not None:
        df = df.fillna(fillna_value)

    # Remove duplicates after filling, so rows that only differed by a gap collapse too.
    df = df.drop_duplicates()

    # Standardize column names label by label. The .str accessor on the column
    # index raises when no label is a string and turns non-string labels into
    # NaN when labels are mixed, so only string labels are rewritten here.
    df = df.rename(columns=_standardize_column_name)

    return df



In [4]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.

import pandas as pd

# Sample data
data = dict(Score=[20, 15, 30, 10, 25])
df = pd.DataFrame(data)

# Min-max normalization: shift by the column minimum, then divide by the
# spread (max - min), which maps the smallest score to 0 and the largest to 1.
df['Score_normalized'] = df['Score'].pipe(
    lambda scores: (scores - scores.min()) / (scores.max() - scores.min())
)

print(df)


   Score  Score_normalized
0     20              0.50
1     15              0.25
2     30              1.00
3     10              0.00
4     25              0.75


In [5]:
from sklearn.preprocessing import MinMaxScaler

# Learn the minimum and maximum from the single-column frame first, then rescale
# that same frame; df[['Score']] (double brackets) passes a 2-D, one-column input.
scaler = MinMaxScaler().fit(df[['Score']])
df['Score_scaled_sklearn'] = scaler.transform(df[['Score']])

print(df)


   Score  Score_normalized  Score_scaled_sklearn
0     20              0.50                  0.50
1     15              0.25                  0.25
2     30              1.00                  1.00
3     10              0.00                  0.00
4     25              0.75                  0.75
