In [156]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.



In [157]:
import pandas as pd

# Create a sample dataframe with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', None, 'Edward'],
    'Age': [25, 30, None, 22, 40],
    'City': ['New York', None, 'Los Angeles', 'Chicago', 'Houston']
}

df = pd.DataFrame(data)

# Save this dataframe as a CSV file
df.to_csv('sample_data.csv', index=False)

# Now load the CSV back (simulating your file)
df_loaded = pd.read_csv('sample_data.csv')

# Detect missing values
print("Missing values per column:")
print(df_loaded.isnull().sum())


Missing values per column:
Name    1
Age     1
City    1
dtype: int64


In [158]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.

In [159]:
import pandas as pd

# Sample data with missing values
data = {
    'Name': ['Alice', 'Bob', None, 'David'],
    'Age': [25, None, 30, 22],
    'City': ['New York', 'Los Angeles', 'Chicago', None]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Drop rows with any missing values
df_dropped = df.dropna()

print("\nDataFrame after dropping rows with missing values:")
print(df_dropped)


Original DataFrame:
    Name   Age         City
0  Alice  25.0     New York
1    Bob   NaN  Los Angeles
2   None  30.0      Chicago
3  David  22.0         None

DataFrame after dropping rows with missing values:
    Name   Age      City
0  Alice  25.0  New York


In [160]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.



In [161]:
import pandas as pd

# Sample data with missing values
data = {
    'Name': ['Alice', 'Bob', None, 'David'],
    'Age': [25, None, 30, 22],
    'City': ['New York', 'Los Angeles', 'Chicago', None]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Drop columns with any missing values
df_dropped_cols = df.dropna(axis=1)

print("\nDataFrame after dropping columns with missing values:")
print(df_dropped_cols)


Original DataFrame:
    Name   Age         City
0  Alice  25.0     New York
1    Bob   NaN  Los Angeles
2   None  30.0      Chicago
3  David  22.0         None

DataFrame after dropping columns with missing values:
Empty DataFrame
Columns: []
Index: [0, 1, 2, 3]


In [162]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.



In [163]:
import pandas as pd

# Sample data with missing numerical values
data = {
    'Age': [25, None, 30, 22, None],
    'Salary': [50000, 60000, None, 52000, 58000]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Calculate the mean of the 'Age' column (ignoring missing values)
mean_age = df['Age'].mean()

# Step 2: Fill missing values in 'Age' column with the mean
df['Age'] = df['Age'].fillna(mean_age)

print("\nDataFrame after mean imputation on 'Age':")
print(df)


Original DataFrame:
    Age   Salary
0  25.0  50000.0
1   NaN  60000.0
2  30.0      NaN
3  22.0  52000.0
4   NaN  58000.0

DataFrame after mean imputation on 'Age':
         Age   Salary
0  25.000000  50000.0
1  25.666667  60000.0
2  30.000000      NaN
3  22.000000  52000.0
4  25.666667  58000.0


In [164]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.



In [165]:
import pandas as pd

# Sample data with missing categorical values
data = {
    'City': ['New York', 'Los Angeles', None, 'Chicago', 'Los Angeles', None]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Calculate the mode of the 'City' column
mode_city = df['City'].mode()[0]  # mode() returns a Series, take the first value

# Step 2: Fill missing values in 'City' column with the mode
df['City'] = df['City'].fillna(mode_city)

print("\nDataFrame after mode imputation on 'City':")
print(df)


Original DataFrame:
          City
0     New York
1  Los Angeles
2         None
3      Chicago
4  Los Angeles
5         None

DataFrame after mode imputation on 'City':
          City
0     New York
1  Los Angeles
2  Los Angeles
3      Chicago
4  Los Angeles
5  Los Angeles


In [166]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.



In [167]:
import pandas as pd

# Sample data with missing values and skewed distribution
data = {
    'Income': [30000, 40000, 250000, None, 32000, None, 35000]
}

df = pd.DataFrame(data)

print("Original DataFrame:")
print(df)

# Step 1: Calculate median of the 'Income' column
median_income = df['Income'].median()

# Step 2: Fill missing values in 'Income' column with the median
df['Income'] = df['Income'].fillna(median_income)

print("\nDataFrame after median imputation on 'Income':")
print(df)


Original DataFrame:
     Income
0   30000.0
1   40000.0
2  250000.0
3       NaN
4   32000.0
5       NaN
6   35000.0

DataFrame after median imputation on 'Income':
     Income
0   30000.0
1   40000.0
2  250000.0
3   35000.0
4   32000.0
5   35000.0
6   35000.0


In [168]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install sklearn if not already installed.
# 2. KNN Imputer: Use KNNImputer to fill in missing values.



In [169]:
import pandas as pd
from sklearn.impute import KNNImputer

# Sample data with missing values
data = {
    'Feature1': [1.0, 2.0, None, 4.0, 5.0],
    'Feature2': [5.0, None, 7.0, 8.0, 10.0],
    'Feature3': [10.0, 15.0, 14.0, None, 18.0]
}

df = pd.DataFrame(data)
print("Original DataFrame:")
print(df)

# Step 1: Initialize the KNN imputer (default n_neighbors=5)
imputer = KNNImputer(n_neighbors=2)

# Step 2: Fit and transform the data to fill missing values
df_imputed = pd.DataFrame(imputer.fit_transform(df), columns=df.columns)

print("\nDataFrame after KNN imputation:")
print(df_imputed)


Original DataFrame:
   Feature1  Feature2  Feature3
0       1.0       5.0      10.0
1       2.0       NaN      15.0
2       NaN       7.0      14.0
3       4.0       8.0       NaN
4       5.0      10.0      18.0

DataFrame after KNN imputation:
   Feature1  Feature2  Feature3
0       1.0       5.0      10.0
1       2.0       7.5      15.0
2       3.0       7.0      14.0
3       4.0       8.0      16.0
4       5.0      10.0      18.0


In [170]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with next frequent category: Use the mode() method to choose the next frequent category.



In [171]:
import pandas as pd

# Sample dataset with missing categorical data
data = {
    'Category': ['Apple', 'Banana', 'Apple', None, 'Orange', 'Banana', None, 'Banana']
}

df = pd.DataFrame(data)

# Step 1: Identify missing values in the categorical column
missing = df['Category'].isnull()
print("Missing values in 'Category' column:")
print(missing)

# Step 2: Impute with the next most frequent category (2nd mode)
value_counts = df['Category'].value_counts()
print("\nValue counts:")
print(value_counts)

if len(value_counts) > 1:
    second_most_common = value_counts.index[1]  # second most frequent
else:
    second_most_common = value_counts.index[0]  # fallback to most frequent

# Fill missing values
df['Category'].fillna(second_most_common, inplace=True)

print("\nDataFrame after imputation:")
print(df)


Missing values in 'Category' column:
0    False
1    False
2    False
3     True
4    False
5    False
6     True
7    False
Name: Category, dtype: bool

Value counts:
Category
Banana    3
Apple     2
Orange    1
Name: count, dtype: int64

DataFrame after imputation:
  Category
0    Apple
1   Banana
2    Apple
3    Apple
4   Orange
5   Banana
6    Apple
7   Banana


In [172]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.




In [173]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Sample dataset
data = {
    'Salary': [50000, 60000, 52000, 58000, 61000, 57000, 62000, 49000],
    'EducationLevel': [1, 2, 1, 2, 3, 2, 3, 1],
    'Age': [25, 30, np.nan, 28, 35, np.nan, 40, 22]
}
df = pd.DataFrame(data)

# Step 1: Partition the data
df_known = df[df['Age'].notnull()]
df_missing = df[df['Age'].isnull()]

# Features and target
X_train = df_known.drop('Age', axis=1)
y_train = df_known['Age']
X_missing = df_missing.drop('Age', axis=1)

# Step 2: Train a regression model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 3: Predict and impute missing values
predicted_ages = model.predict(X_missing)
df.loc[df['Age'].isnull(), 'Age'] = predicted_ages

# Final DataFrame after imputation
print("DataFrame after predictive imputation:")
print(df)


DataFrame after predictive imputation:
   Salary  EducationLevel        Age
0   50000               1  25.000000
1   60000               2  30.000000
2   52000               1  22.931034
3   58000               2  28.000000
4   61000               3  35.000000
5   57000               2  29.982759
6   62000               3  40.000000
7   49000               1  22.000000


In [174]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Use fillna() with method parameter: Apply ffill() and bfill() for forward and backward fill.



In [None]:
import pandas as pd
import numpy as np

# Sample time series data with missing values
data = {
    'Date': pd.date_range(start='2024-01-01', periods=10, freq='D'),
    'Value': [10, np.nan, np.nan, 40, 50, np.nan, 70, np.nan, np.nan, 100]
}
df = pd.DataFrame(data)

# Step 1: Sort the data by Date
df = df.sort_values('Date')

# Step 2: Forward fill
df['Value_ffill'] = df['Value'].fillna(method='ffill')

# Step 3: Backward fill
df['Value_bfill'] = df['Value'].fillna(method='bfill')

# Step 4: Combine both for more complete fill (optional)
df['Value_combined'] = df['Value'].fillna(method='ffill').fillna(method='bfill')

# Display result
print(df)


        Date  Value  Value_ffill  Value_bfill  Value_combined
0 2024-01-01   10.0         10.0         10.0            10.0
1 2024-01-02    NaN         10.0         40.0            10.0
2 2024-01-03    NaN         10.0         40.0            10.0
3 2024-01-04   40.0         40.0         40.0            40.0
4 2024-01-05   50.0         50.0         50.0            50.0
5 2024-01-06    NaN         50.0         70.0            50.0
6 2024-01-07   70.0         70.0         70.0            70.0
7 2024-01-08    NaN         70.0        100.0            70.0
8 2024-01-09    NaN         70.0        100.0            70.0
9 2024-01-10  100.0        100.0        100.0           100.0


  df['Value_ffill'] = df['Value'].fillna(method='ffill')
  df['Value_bfill'] = df['Value'].fillna(method='bfill')
  df['Value_combined'] = df['Value'].fillna(method='ffill').fillna(method='bfill')
