In [1]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.
import pandas as pd
import numpy as np

# Sample DataFrame with missing values
data = {
    'Category': ['A', 'A', 'B', 'B', 'C', 'C'],
    'Value': [10, np.nan, 20, np.nan, 30, np.nan]
}
df = pd.DataFrame(data)

print("Before filling missing values:")
print(df)

# Define a mapping from Category to fill values for missing 'Value'
fill_values = {
    'A': 100,
    'B': 200,
    'C': 300
}

# Fill missing 'Value' based on 'Category'
df['Value'] = df.apply(
    lambda row: fill_values[row['Category']] if pd.isna(row['Value']) else row['Value'],
    axis=1
)

print("\nAfter conditional filling of missing values:")
print(df)





Before filling missing values:
  Category  Value
0        A   10.0
1        A    NaN
2        B   20.0
3        B    NaN
4        C   30.0
5        C    NaN

After conditional filling of missing values:
  Category  Value
0        A   10.0
1        A  100.0
2        B   20.0
3        B  200.0
4        C   30.0
5        C  300.0


In [2]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.
import pandas as pd
import numpy as np
from scipy import stats

# Sample DataFrame with outliers
data = {
    'Value': [10, 12, 11, 13, 12, 200, 11, 14, 10, 9]
}
df = pd.DataFrame(data)

print("Original Data:")
print(df)

# Calculate z-scores for the 'Value' column
df['z_score'] = stats.zscore(df['Value'])

# Define a threshold for outliers (e.g., |z| > 3)
threshold = 3

# Filter out outliers
df_clean = df[df['z_score'].abs() <= threshold].copy()

# Drop the z_score column if not needed
df_clean.drop(columns='z_score', inplace=True)

print("\nData after removing outliers:")
print(df_clean)


Original Data:
   Value
0     10
1     12
2     11
3     13
4     12
5    200
6     11
7     14
8     10
9      9

Data after removing outliers:
   Value
0     10
1     12
2     11
3     13
4     12
5    200
6     11
7     14
8     10
9      9


In [3]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.

import pandas as pd
import numpy as np

data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David'],
    'Age': ['25', np.nan, '30', '28.0']
}

df = pd.DataFrame(data)

# Fill missing values in 'Age' with the median (converted to float first)
df['Age'] = pd.to_numeric(df['Age'], errors='coerce')  # Convert to numeric, coercing errors to NaN
median_age = df['Age'].median()
df['Age'].fillna(median_age, inplace=True)

# Convert 'Age' column to integers
df['Age'] = df['Age'].astype(int)

print(df)

      Name  Age
0    Alice   25
1      Bob   28
2  Charlie   30
3    David   28


In [4]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.

import pandas as pd

def clean_data(df):
    # Fill missing values with column median for numeric columns
    for col in df.select_dtypes(include=['number']).columns:
        median_val = df[col].median()
        df[col].fillna(median_val, inplace=True)
    
    # Remove duplicate rows
    df.drop_duplicates(inplace=True)
    
    # Standardize column names: lowercase and replace spaces with underscores
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    
    return df

# Example usage
data = {
    'Employee Name': ['Alice', 'Bob', 'Alice', None],
    'Age': [25, None, 25, 30],
    'Salary': [50000, 60000, 50000, None]
}

df = pd.DataFrame(data)
cleaned_df = clean_data(df)

print(cleaned_df)

  employee_name   age   salary
0         Alice  25.0  50000.0
1           Bob  25.0  60000.0
3          None  30.0  50000.0


In [5]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.

import pandas as pd

def min_max_normalize(df, column):
    min_val = df[column].min()
    max_val = df[column].max()
    df[column + '_normalized'] = (df[column] - min_val) / (max_val - min_val)
    return df

# Example usage
data = {
    'Age': [25, 30, 22, 40, 35]
}

df = pd.DataFrame(data)
df = min_max_normalize(df, 'Age')

print(df)

   Age  Age_normalized
0   25        0.166667
1   30        0.444444
2   22        0.000000
3   40        1.000000
4   35        0.722222
