In [14]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.



In [15]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.



In [16]:
import pandas as pd
import io

# 1. Create a sample CSV file content with some missing values
# In a real scenario, you would be loading an existing CSV file from your system.
# Blank fields (two adjacent commas, or a trailing comma) are the missing entries.
csv_data = """Name,Age,City,Occupation,Salary
Alice,24,New York,Engineer,70000
Bob,,San Francisco,Data Scientist,90000
Charlie,22,,Artist,45000
David,35,New York,Doctor,
Eve,28,Chicago,,55000
Frank,40,Houston,Manager,100000
Grace,,Boston,Developer,80000
"""

# Use io.StringIO to simulate reading a file from a string
# This is equivalent to having a file named 'sample_data_with_missing.csv'
# with the content above.
data_file = io.StringIO(csv_data)

# 2. Load the data: Use the pandas library to read a CSV file.
# Only the read itself is guarded, and only against pandas' own parsing errors.
# The error is reported and then re-raised: calling exit() here would end the
# whole kernel/interpreter session and hide the traceback from the reader.
try:
    df = pd.read_csv(data_file)
except (pd.errors.EmptyDataError, pd.errors.ParserError) as e:
    print(f"Error loading CSV file: {e}")
    raise

print("DataFrame loaded successfully:")
print(df.head()) # Display the first few rows to see the data and potential NaNs
print("\nDataFrame Info:")
df.info() # Get a summary of the DataFrame including non-null counts

# 3. Check for missing values: Use the isnull() method to find missing values.
# This returns a DataFrame of boolean values (True for missing, False for present)
missing_values_boolean = df.isnull()
print("\nBoolean DataFrame showing missing values (True indicates missing):")
print(missing_values_boolean.head())

# 4. Summarize missing data: Use the sum() function to count the number of missing values in each column.
# Summing the boolean mask counts the True entries per column; the mask computed
# above is reused instead of calling isnull() a second time.
missing_values_count = missing_values_boolean.sum()
print("\nNumber of missing values per column:")
print(missing_values_count)

print("\n--- Summary of Missing Data ---")
total_cells = df.size  # rows * columns
total_missing = missing_values_count.sum()
# Guard the division so a DataFrame with no cells reports 0% rather than NaN.
percentage_missing = (total_missing / total_cells) * 100 if total_cells else 0.0

print(f"Total cells in DataFrame: {total_cells}")
print(f"Total missing values across the DataFrame: {total_missing}")
print(f"Percentage of missing values in the DataFrame: {percentage_missing:.2f}%")

DataFrame loaded successfully:
      Name   Age           City      Occupation   Salary
0    Alice  24.0       New York        Engineer  70000.0
1      Bob   NaN  San Francisco  Data Scientist  90000.0
2  Charlie  22.0            NaN          Artist  45000.0
3    David  35.0       New York          Doctor      NaN
4      Eve  28.0        Chicago             NaN  55000.0

DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7 entries, 0 to 6
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        7 non-null      object 
 1   Age         5 non-null      float64
 2   City        6 non-null      object 
 3   Occupation  6 non-null      object 
 4   Salary      6 non-null      float64
dtypes: float64(2), object(3)
memory usage: 408.0+ bytes

Boolean DataFrame showing missing values (True indicates missing):
    Name    Age   City  Occupation  Salary
0  False  False  False       False   False
1  False   T

In [17]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.



In [18]:
import pandas as pd
import io

# Step 1: sample CSV text; empty fields between commas (or after the last comma)
# become missing values once parsed.
csv_data = (
    "Name,Age,City,Occupation,Salary,Experience\n"
    "Alice,24,New York,Engineer,70000,5\n"
    "Bob,,San Francisco,Data Scientist,90000,\n"
    "Charlie,22,,Artist,45000,2\n"
    "David,35,New York,Doctor,,10\n"
    "Eve,28,Chicago,,55000,\n"
    "Frank,40,Houston,Manager,100000,15\n"
    "Grace,,Boston,Developer,80000,7\n"
)

# Wrap the text in an in-memory buffer so read_csv can consume it like a file.
data_file = io.StringIO(csv_data)
df = pd.read_csv(data_file)

print("Original DataFrame (with missing values):", df, sep="\n")
print("\nMissing values before dropping columns:", df.isna().sum(), sep="\n")
print("-" * 50)

# Step 2: drop every column that holds at least one missing value.
# axis="columns" is the spelled-out form of axis=1; how="any" is the default rule.
df_cleaned_columns = df.dropna(axis="columns", how="any")

print("\nDataFrame after dropping columns with ANY missing values:", df_cleaned_columns, sep="\n")
print(
    "\nMissing values in the cleaned DataFrame (should be 0 for all columns):",
    df_cleaned_columns.isna().sum(),
    sep="\n",
)
print("-" * 50)

# Variation: how="all" removes a column only when every value in it is missing.
# To demonstrate, build a copy of the frame with one extra, entirely-missing column.
df_all_missing = df.assign(All_Missing=pd.NA)
print("\nDataFrame with an entirely missing column:", df_all_missing, sep="\n")
print(
    "\nMissing values before dropping columns (with 'All_Missing'):",
    df_all_missing.isna().sum(),
    sep="\n",
)

df_cleaned_columns_all = df_all_missing.dropna(axis="columns", how="all")
print(
    "\nDataFrame after dropping columns with ALL missing values (using how='all'):",
    df_cleaned_columns_all,
    sep="\n",
)
print(
    "\nMissing values in the 'how=all' cleaned DataFrame:",
    df_cleaned_columns_all.isna().sum(),
    sep="\n",
)

Original DataFrame (with missing values):
      Name   Age           City      Occupation    Salary  Experience
0    Alice  24.0       New York        Engineer   70000.0         5.0
1      Bob   NaN  San Francisco  Data Scientist   90000.0         NaN
2  Charlie  22.0            NaN          Artist   45000.0         2.0
3    David  35.0       New York          Doctor       NaN        10.0
4      Eve  28.0        Chicago             NaN   55000.0         NaN
5    Frank  40.0        Houston         Manager  100000.0        15.0
6    Grace   NaN         Boston       Developer   80000.0         7.0

Missing values before dropping columns:
Name          0
Age           2
City          1
Occupation    1
Salary        1
Experience    2
dtype: int64
--------------------------------------------------

DataFrame after dropping columns with ANY missing values:
      Name
0    Alice
1      Bob
2  Charlie
3    David
4      Eve
5    Frank
6    Grace

Missing values in the cleaned DataFrame (should b

In [19]:
import pandas as pd
import io

# 1. Create a sample CSV file content with a numerical column having missing values
# Blank fields in Price, UnitsSold and Rating are the missing entries.
csv_data = """ProductID,Price,UnitsSold,Rating
P001,10.50,100,4.5
P002,,150,3.8
P003,25.00,,4.2
P004,12.75,200,
P005,15.00,120,4.0
P006,,90,3.5
P007,8.25,180,
"""

# Use io.StringIO to simulate reading a file from a string
data_file = io.StringIO(csv_data)

# Load the data into a Pandas DataFrame
df = pd.read_csv(data_file)

print("Original DataFrame:")
print(df)
print("\nMissing values before imputation:")
print(df.isnull().sum())
print("-" * 50)

# Identify the numerical column for imputation (e.g., 'Price' and 'UnitsSold')
# Let's focus on 'Price' first

# Convert 'Price' and 'UnitsSold' columns to numeric, coercing errors will turn invalid parsing into NaN
df['Price'] = pd.to_numeric(df['Price'], errors='coerce')
df['UnitsSold'] = pd.to_numeric(df['UnitsSold'], errors='coerce')


# 2. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.

# Calculate the mean of the 'Price' column (mean() skips NaN by default)
mean_price = df['Price'].mean()
print(f"\nCalculated mean of 'Price' column: {mean_price:.2f}")

# Fill missing values in 'Price' with its mean.
# The result is assigned back to the column: calling
# df['Price'].fillna(..., inplace=True) acts on a selected column object, which
# is deprecated chained assignment and leaves df unchanged under Copy-on-Write.
df['Price'] = df['Price'].fillna(mean_price)

# Let's also impute 'UnitsSold' using its mean
mean_units_sold = df['UnitsSold'].mean()
print(f"Calculated mean of 'UnitsSold' column: {mean_units_sold:.2f}")
df['UnitsSold'] = df['UnitsSold'].fillna(mean_units_sold)


print("\nDataFrame after mean imputation for 'Price' and 'UnitsSold':")
print(df)
print("\nMissing values after imputation:")
print(df.isnull().sum())
print("-" * 50)

# Verify the imputation (e.g., check specific rows where values were missing)
print("\nVerification (check rows P002 and P006 for 'Price', P003 for 'UnitsSold'):")
print(df.loc[df['ProductID'].isin(['P002', 'P003', 'P006'])])

Original DataFrame:
  ProductID  Price  UnitsSold  Rating
0      P001  10.50      100.0     4.5
1      P002    NaN      150.0     3.8
2      P003  25.00        NaN     4.2
3      P004  12.75      200.0     NaN
4      P005  15.00      120.0     4.0
5      P006    NaN       90.0     3.5
6      P007   8.25      180.0     NaN

Missing values before imputation:
ProductID    0
Price        2
UnitsSold    1
Rating       2
dtype: int64
--------------------------------------------------

Calculated mean of 'Price' column: 14.30
Calculated mean of 'UnitsSold' column: 140.00

DataFrame after mean imputation for 'Price' and 'UnitsSold':
  ProductID  Price  UnitsSold  Rating
0      P001  10.50      100.0     4.5
1      P002  14.30      150.0     3.8
2      P003  25.00      140.0     4.2
3      P004  12.75      200.0     NaN
4      P005  15.00      120.0     4.0
5      P006  14.30       90.0     3.5
6      P007   8.25      180.0     NaN

Missing values after imputation:
ProductID    0
Price        0

In [20]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.



In [21]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.



In [22]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.



In [23]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install scikit-learn (imported as sklearn) if not already installed.
# 2. KNN Imputer: Use KNNImputer to fill in missing values.



In [24]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the most frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with the most frequent category: Use the mode() method to find the most frequent category.



In [25]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.




In [26]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Fill the gaps: Apply ffill() and bfill() for forward and backward fill (fillna() with the method parameter is deprecated in recent pandas versions).

