In [31]:
# Question 1: Handling Missing Values with Conditional Filling
# Description: Fill missing values in a specific column based on a condition from another column.



In [32]:
import pandas as pd
import numpy as np

# Sample dataset
data = {
    'group': ['A', 'A', 'B', 'B', 'A', 'B'],
    'score': [95, np.nan, 80, np.nan, np.nan, 85]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)


Original DataFrame:
  group  score
0     A   95.0
1     A    NaN
2     B   80.0
3     B    NaN
4     A    NaN
5     B   85.0


In [33]:
# Question 2: Removing Outliers by Rescaling
# Description: Remove outliers by standardizing a numerical column using z-scores.



In [34]:
import pandas as pd
import numpy as np
from scipy import stats

# Step 1: Create a sample dataset
data = {
    'value': [10, 12, 13, 15, 14, 100, 13, 14, 15, 16, 1000]  # 100 and 1000 are likely outliers
}

df = pd.DataFrame(data)
print("Original Data:")
print(df)

# Step 2: Calculate z-scores
df['z_score'] = stats.zscore(df['value'])

# Step 3: Filter out rows where the absolute z-score is greater than 3 (commonly used threshold)
df_no_outliers = df[df['z_score'].abs() <= 3].copy()

# Step 4: Drop the z_score column if no longer needed
df_no_outliers.drop(columns='z_score', inplace=True)

print("\nData after Removing Outliers:")
print(df_no_outliers)


Original Data:
    value
0      10
1      12
2      13
3      15
4      14
5     100
6      13
7      14
8      15
9      16
10   1000

Data after Removing Outliers:
   value
0     10
1     12
2     13
3     15
4     14
5    100
6     13
7     14
8     15
9     16


In [35]:
# Question 3: Applying Data Type Conversion
# Description: Convert the 'Age' column to integers after filling missing values.



In [36]:
import pandas as pd
import numpy as np

# Step 1: Create a sample DataFrame with missing Age values
df = pd.DataFrame({
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, np.nan, 30, np.nan, 40]
})

print("Original Data:")
print(df)

# Step 2: Fill missing Age values (e.g., with the mean age)
df['Age'] = df['Age'].fillna(df['Age'].mean())

# Step 3: Convert Age to integers
df['Age'] = df['Age'].astype(int)

print("\nData After Filling and Converting Age to Integer:")
print(df)


Original Data:
      Name   Age
0    Alice  25.0
1      Bob   NaN
2  Charlie  30.0
3    David   NaN
4      Eve  40.0

Data After Filling and Converting Age to Integer:
      Name  Age
0    Alice   25
1      Bob   31
2  Charlie   30
3    David   31
4      Eve   40


In [37]:
# Question 4: Automating Data Cleaning with Functions
# Description: Create a function that automates the process of filling missing values, removing duplicates, and standardizing column names.



In [38]:
import pandas as pd

# Function to clean a DataFrame
def clean_data(df, fill_method='mean'):
    """
    Clean a DataFrame by:
    1. Filling missing numeric values
    2. Removing duplicate rows
    3. Standardizing column names
    
    Parameters:
    - df: pandas DataFrame
    - fill_method: 'mean', 'median', or 'zero'
    
    Returns:
    - Cleaned pandas DataFrame
    """
    # Standardize column names
    df.columns = [col.strip().lower().replace(' ', '_') for col in df.columns]

    # Fill missing values for numeric columns
    for col in df.select_dtypes(include=['number']).columns:
        if fill_method == 'mean':
            df[col] = df[col].fillna(df[col].mean())
        elif fill_method == 'median':
            df[col] = df[col].fillna(df[col].median())
        elif fill_method == 'zero':
            df[col] = df[col].fillna(0)

    # Remove duplicate rows
    df = df.drop_duplicates()

    return df


In [39]:
# Question 5: Complex Data Normalization
# Description: Normalize a numeric column to a range using min-max scaling.



In [40]:
import pandas as pd

# Sample DataFrame
df = pd.DataFrame({
    'salary': [30000, 50000, 70000, 100000, 120000]
})

# Min-Max Normalization
def min_max_normalize(column):
    min_val = column.min()
    max_val = column.max()
    return (column - min_val) / (max_val - min_val)

# Apply normalization to the 'salary' column
df['normalized_salary'] = min_max_normalize(df['salary'])

print(df)


   salary  normalized_salary
0   30000           0.000000
1   50000           0.222222
2   70000           0.444444
3  100000           0.777778
4  120000           1.000000
