In [1]:
# Question: Introduction to Missing Data in a DataFrame
# Description: Load a simple CSV file into a DataFrame and identify missing values.

# Steps to follow:
# 1. Load the data: Use the pandas library to read a CSV file.
# 2. Check for missing values: Use the isnull() method to find missing values.
# 3. Summarize missing data: Use the sum() function to count the number of missing values in each column.

import pandas as pd

# Step 1: Load the data from a CSV file into a DataFrame
df = pd.read_csv('/workspaces/AI_DATA_ANALYSIS_/src/Module 3/common_data_errors_example.csv')

# Step 2: Check for missing values in the DataFrame
missing_values = df.isnull()

# Step 3: Summarize the missing data by counting the number of missing values in each column
missing_summary = df.isnull().sum()

print("Missing values in each column:")
print(missing_summary)


Missing values in each column:
customer_id          0
email               10
transaction_date    14
department          26
supplier_name        0
product_id           0
date_column          0
phone                0
state                0
month                0
revenue              0
quarter              0
engagement_score     0
price                0
dtype: int64


In [3]:
# Question: Dropping Rows with Missing Values
# Description: Practice the deletion method by removing rows with any missing values from a dataset.

# Steps to follow:
# 1. Use dropna() method: Use the dropna() method to remove rows with missing values.

import pandas as pd

# Load the CSV data into a DataFrame
df = pd.read_csv('/workspaces/AI_DATA_ANALYSIS_/src/Module 3/common_data_errors_example.csv')

# Drop rows that contain any missing values
df_cleaned = df.dropna()

print("DataFrame after dropping rows with missing values:")
print(df_cleaned)


DataFrame after dropping rows with missing values:
    customer_id               email transaction_date department supplier_name  \
0             1   user1@example.com       2024-07-22         IT    Supplier 0   
1             2   user2@example.com       2024-11-26         HR    Supplier 1   
2             3   user3@example.com       2024-04-05         HR    Supplier 2   
3             4   user4@example.com       2024-12-13         IT    Supplier 3   
4             5   user5@example.com       2024-11-15      Sales    Supplier 4   
7             8   user8@example.com       2024-05-28      Sales    Supplier 2   
8             9   user9@example.com       2024-09-02         HR    Supplier 3   
11           12  user12@example.com       2024-09-15         IT    Supplier 1   
12           13  user13@example.com       2024-08-04         HR    Supplier 2   
14           15  user15@example.com       2024-07-07         IT    Supplier 4   
15           16  user16@example.com       2024-08-15      

In [4]:
# Question: Dropping Columns with Missing Values
# Description: Practice deleting entire columns that contain missing values.

# Steps to follow:
# 1. Use dropna() with axis parameter: Set axis=1 in dropna() to remove columns with missing values.

import pandas as pd

# Load the CSV data into a DataFrame
df = pd.read_csv('/workspaces/AI_DATA_ANALYSIS_/src/Module 3/common_data_errors_example.csv')

# Drop columns that contain any missing values
df_cleaned = df.dropna(axis=1)

print("DataFrame after dropping columns with missing values:")
print(df_cleaned)


DataFrame after dropping columns with missing values:
    customer_id supplier_name product_id date_column           phone  \
0             1    Supplier 0         P1  06/23/2024  (555) 123-1000   
1             2    Supplier 1         P2  2024-06-27    555-123-1001   
2             3    Supplier 2         P3  2024-07-05    555-123-1002   
3             4    Supplier 3         P4  06/26/2024    555-123-1003   
4             5    Supplier 4         P5  2024-06-10  (555) 123-1004   
..          ...           ...        ...         ...             ...   
95           96    Supplier 0        P16  2024-12-01    555-123-1095   
96           97    Supplier 1        P17  06/12/2024  (555) 123-1096   
97           98    Supplier 2        P18  2024-12-17    555-123-1097   
98           99    Supplier 3        P19  2024-05-21    555-123-1098   
99          100    Supplier 4        P20  05/23/2024    555-123-1099   

         state month  revenue quarter  engagement_score       price  
0       Cal

In [14]:
# Question: Mean Imputation for Numerical Data
# Description: Fill missing values in a numerical column with the mean of that column.

# Steps to follow:
# 1. Calculate mean and fill NA: Use mean() to calculate and fillna() to fill the missing values.

import pandas as pd

# Load the CSV data into a DataFrame
df = pd.read_csv('/workspaces/AI_DATA_ANALYSIS_/src/Module 3/Techniques to Improve Data Quality/employees.csv')

# Calculate the mean of the numerical column 'Sales'
mean_value = df['salary'].mean()

# Fill missing values in the 'Sales' column with the mean value
df['salary'].fillna(mean_value, inplace=True)

print("DataFrame after mean imputation on 'Sales' column:")
print(df)


DataFrame after mean imputation on 'Sales' column:
   employee_id             name   age department        salary  \
0            1         John Doe  28.0      Sales  55000.000000   
1            2       Jane Smith  34.0        NaN  62000.000000   
2            3      Bob Johnson   NaN  Marketing  58000.000000   
3            4   Alice Williams  29.0  Marketing  62666.666667   
4            5      Chris Evans  45.0      Sales  70000.000000   
5            6      Emily Davis  38.0         HR  67000.000000   
6            7              NaN  41.0         HR  65000.000000   
7            8       Mary Clark  30.0      Sales  59000.000000   
8            9      James Lewis  37.0        NaN  62000.000000   
9           10  Patricia Taylor  32.0         HR  66000.000000   

                         email  
0         john.doe@example.com  
1       jane.smith@example.com  
2      bob.johnson@example.com  
3   alice.williams@example.com  
4                          NaN  
5      emily.davis@examp

In [10]:
print(df.columns)


Index(['sales', 'customer_id'], dtype='object')


In [15]:
# Question: Mode Imputation for Categorical Data
# Description: Fill missing values in a categorical column with the mode of that column.

# Steps to follow:
# 1. Calculate mode and fill NA: Use mode() to find the most frequent value and fillna() to fill the missing values.

import pandas as pd
from io import StringIO

# Sample CSV data as a string (simulate reading from a file)
data = """
employee_id,name,age,department,salary,email
1,John Doe,28,Sales,55000,john.doe@example.com
2,Jane Smith,34,,62000,jane.smith@example.com
3,Bob Johnson,,Marketing,58000,bob.johnson@example.com
4,Alice Williams,29,Marketing,,alice.williams@example.com
5,Chris Evans,45,Sales,70000,
6,Emily Davis,38,HR,67000,emily.davis@example.com
7,,41,HR,65000,michael.brown@example.com
8,Mary Clark,30,Sales,59000,mary.clark@example.com
9,James Lewis,37,,62000,james.lewis@example.com
10,Patricia Taylor,32,HR,66000,patricia.taylor@example.com
"""

# Read the data into a DataFrame
df = pd.read_csv(StringIO(data))

# Check the columns and missing values in 'department'
print("Missing values in 'department' before imputation:", df['department'].isnull().sum())

# Calculate the mode of the 'department' column
mode_department = df['department'].mode()[0]
print("Mode of 'department':", mode_department)

# Fill missing values in 'department' with the mode
df['department'].fillna(mode_department, inplace=True)

print("\nMissing values in 'department' after imputation:", df['department'].isnull().sum())
print("\nDataFrame after mode imputation on 'department':")
print(df)


Missing values in 'department' before imputation: 2
Mode of 'department': HR

Missing values in 'department' after imputation: 0

DataFrame after mode imputation on 'department':
   employee_id             name   age department   salary  \
0            1         John Doe  28.0      Sales  55000.0   
1            2       Jane Smith  34.0         HR  62000.0   
2            3      Bob Johnson   NaN  Marketing  58000.0   
3            4   Alice Williams  29.0  Marketing      NaN   
4            5      Chris Evans  45.0      Sales  70000.0   
5            6      Emily Davis  38.0         HR  67000.0   
6            7              NaN  41.0         HR  65000.0   
7            8       Mary Clark  30.0      Sales  59000.0   
8            9      James Lewis  37.0         HR  62000.0   
9           10  Patricia Taylor  32.0         HR  66000.0   

                         email  
0         john.doe@example.com  
1       jane.smith@example.com  
2      bob.johnson@example.com  
3   alice.william

In [17]:
# Question: Median Imputation for Skewed Data
# Description: Handle missing values in columns with a skewed distribution using the median.

# Steps to follow:
# 1. Calculate median and fill NA: Use median() for skewed data and fillna() to handle missing values.

import pandas as pd
from io import StringIO

# CSV data as a string (simulate reading from a file)
data = """
employee_id,name,age,department,salary,email
1,John Doe,28,Sales,55000,john.doe@example.com
2,Jane Smith,34,,62000,jane.smith@example.com
3,Bob Johnson,,Marketing,58000,bob.johnson@example.com
4,Alice Williams,29,Marketing,,alice.williams@example.com
5,Chris Evans,45,Sales,70000,
6,Emily Davis,38,HR,67000,emily.davis@example.com
7,,41,HR,65000,michael.brown@example.com
8,Mary Clark,30,Sales,59000,mary.clark@example.com
9,James Lewis,37,,62000,james.lewis@example.com
10,Patricia Taylor,32,HR,66000,patricia.taylor@example.com
"""

# Read the data into DataFrame
df = pd.read_csv(StringIO(data))

# Check missing values before imputation
print("Missing values in 'salary' before imputation:", df['salary'].isnull().sum())

# Calculate the median of 'salary' column
median_salary = df['salary'].median()
print("Median salary:", median_salary)

# Fill missing values in 'salary' column with median
df['salary'].fillna(median_salary, inplace=True)

# Check missing values after imputation
print("Missing values in 'salary' after imputation:", df['salary'].isnull().sum())

print("\nDataFrame after median imputation on 'salary':")
print(df)


Missing values in 'salary' before imputation: 1
Median salary: 62000.0
Missing values in 'salary' after imputation: 0

DataFrame after median imputation on 'salary':
   employee_id             name   age department   salary  \
0            1         John Doe  28.0      Sales  55000.0   
1            2       Jane Smith  34.0        NaN  62000.0   
2            3      Bob Johnson   NaN  Marketing  58000.0   
3            4   Alice Williams  29.0  Marketing  62000.0   
4            5      Chris Evans  45.0      Sales  70000.0   
5            6      Emily Davis  38.0         HR  67000.0   
6            7              NaN  41.0         HR  65000.0   
7            8       Mary Clark  30.0      Sales  59000.0   
8            9      James Lewis  37.0        NaN  62000.0   
9           10  Patricia Taylor  32.0         HR  66000.0   

                         email  
0         john.doe@example.com  
1       jane.smith@example.com  
2      bob.johnson@example.com  
3   alice.williams@example.com

In [18]:
# Question: KNN Imputation
# Description: Use K-Nearest Neighbors to impute missing values in a dataset.

# Steps to follow:
# 1. Install and import required libraries: Use pip install sklearn if not already installed.
# 2. KNN Imputer: Use KNNImputer to fill in missing values.

import pandas as pd
from sklearn.impute import KNNImputer
from io import StringIO

# Sample CSV data as string (your dataset)
data = """
employee_id,name,age,department,salary,email
1,John Doe,28,Sales,55000,john.doe@example.com
2,Jane Smith,34,,62000,jane.smith@example.com
3,Bob Johnson,,Marketing,58000,bob.johnson@example.com
4,Alice Williams,29,Marketing,,alice.williams@example.com
5,Chris Evans,45,Sales,70000,
6,Emily Davis,38,HR,67000,emily.davis@example.com
7,,41,HR,65000,michael.brown@example.com
8,Mary Clark,30,Sales,59000,mary.clark@example.com
9,James Lewis,37,,62000,james.lewis@example.com
10,Patricia Taylor,32,HR,66000,patricia.taylor@example.com
"""

# Read data into DataFrame
df = pd.read_csv(StringIO(data))

# For KNN Imputer, we need to use only numeric columns or encode categorical ones
# We'll impute 'age' and 'salary' here (numerical columns)

# Extract numerical columns for imputation
num_cols = ['age', 'salary']
df_num = df[num_cols]

# Initialize KNN Imputer
imputer = KNNImputer(n_neighbors=3)

# Fit and transform the numerical data
df_num_imputed = imputer.fit_transform(df_num)

# Replace original numerical columns with imputed data
df[num_cols] = df_num_imputed

print("DataFrame after KNN imputation on 'age' and 'salary':")
print(df)


DataFrame after KNN imputation on 'age' and 'salary':
   employee_id             name        age department   salary  \
0            1         John Doe  28.000000      Sales  55000.0   
1            2       Jane Smith  34.000000        NaN  62000.0   
2            3      Bob Johnson  30.666667  Marketing  58000.0   
3            4   Alice Williams  29.000000  Marketing  60000.0   
4            5      Chris Evans  45.000000      Sales  70000.0   
5            6      Emily Davis  38.000000         HR  67000.0   
6            7              NaN  41.000000         HR  65000.0   
7            8       Mary Clark  30.000000      Sales  59000.0   
8            9      James Lewis  37.000000        NaN  62000.0   
9           10  Patricia Taylor  32.000000         HR  66000.0   

                         email  
0         john.doe@example.com  
1       jane.smith@example.com  
2      bob.johnson@example.com  
3   alice.williams@example.com  
4                          NaN  
5      emily.davis@ex

In [19]:
# Question: Detecting and Handling Missing Categorical Data
# Description: Detect missing categorical data and handle it by filling with the next frequent category.

# Steps to follow:
# 1. Identify missing values in categorical data: Use the isnull() method on categorical columns.
# 2. Impute with next frequent category: Use the mode() method to choose the next frequent category.

import pandas as pd

# Sample data
data = {
    'department': ['Sales', 'Marketing', 'Sales', 'HR', None, 'Marketing', None, 'HR', 'Sales', 'HR']
}
df = pd.DataFrame(data)

# Step 1: Detect missing values
print(f"Missing values in 'department': {df['department'].isnull().sum()}")

# Step 2: Find the next frequent category
freq_counts = df['department'].value_counts()
second_most_frequent = freq_counts.index[1]

print(f"Next frequent category to fill missing values: {second_most_frequent}")

# Step 3: Fill missing values with next frequent category
df['department'].fillna(second_most_frequent, inplace=True)

print("\nDataFrame after imputation:")
print(df)


Missing values in 'department': 2
Next frequent category to fill missing values: HR

DataFrame after imputation:
  department
0      Sales
1  Marketing
2      Sales
3         HR
4         HR
5  Marketing
6         HR
7         HR
8      Sales
9         HR


In [24]:
# Question: Predictive Modeling for Imputation
# Description: Use a predictive model to impute missing values for a particular feature using other features.

# Steps to follow:
# 1. Partition the data: Split the dataset into train and test based on the presence of missing values.
# 2. Train a model: Use a regression model to predict missing values.
# 3. Impute missing values with predictions.



In [25]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import numpy as np

# Sample data (replace with your actual CSV or DataFrame)
data = {
    'age': [25, 30, np.nan, 35, 40, np.nan, 50],
    'salary': [50000, 60000, 55000, 65000, 70000, 62000, 72000],
    'department': ['Sales', 'HR', 'HR', 'Sales', 'IT', 'IT', 'Sales']
}
df = pd.DataFrame(data)

# Step 1: Split into train (non-missing age) and test (missing age)
train_df = df[df['age'].notnull()]
test_df = df[df['age'].isnull()]

# Features and target for training
X_train = train_df.drop(columns=['age'])
y_train = train_df['age']

# Features to predict missing ages
X_test = test_df.drop(columns=['age'])

# Identify categorical and numerical columns
categorical_cols = X_train.select_dtypes(include=['object']).columns.tolist()
numerical_cols = X_train.select_dtypes(include=[np.number]).columns.tolist()

# Preprocessing pipelines for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ])

# Step 2: Create pipeline with preprocessing and model
model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=100, random_state=42))
])

# Train the model
model.fit(X_train, y_train)

# Step 3: Predict missing 'age' values
predicted_ages = model.predict(X_test)

# Impute the missing values in original dataframe
df.loc[df['age'].isnull(), 'age'] = predicted_ages

print("DataFrame after predictive imputation of 'age':")
print(df)


DataFrame after predictive imputation of 'age':
     age  salary department
0  25.00   50000      Sales
1  30.00   60000         HR
2  28.00   55000         HR
3  35.00   65000      Sales
4  40.00   70000         IT
5  33.35   62000         IT
6  50.00   72000      Sales


In [23]:
# Question: Handling Time Series Data with Forward and Backward Fill
# Description: Impute missing values in a time series dataset using forward and backward fill methods.

# Steps to follow:
# 1. Sort the data: Ensure the dataset is sorted by dates.
# 2. Use fillna() with method parameter: Apply ffill() and bfill() for forward and backward fill.

import pandas as pd

# Sample time series data with missing values
data = {
    'date': ['2023-01-01', '2023-01-02', '2023-01-03', '2023-01-04', '2023-01-05'],
    'value': [10, None, None, 25, None]
}

# Create DataFrame
df = pd.DataFrame(data)

# Convert 'date' column to datetime type
df['date'] = pd.to_datetime(df['date'])

# Step 1: Sort the data by date (important for time series)
df = df.sort_values(by='date')

print("Original Data:")
print(df)

# Step 2a: Forward fill missing values
df['value_ffill'] = df['value'].fillna(method='ffill')

# Step 2b: Backward fill missing values
df['value_bfill'] = df['value'].fillna(method='bfill')

print("\nData after Forward Fill and Backward Fill:")
print(df)


Original Data:
        date  value
0 2023-01-01   10.0
1 2023-01-02    NaN
2 2023-01-03    NaN
3 2023-01-04   25.0
4 2023-01-05    NaN

Data after Forward Fill and Backward Fill:
        date  value  value_ffill  value_bfill
0 2023-01-01   10.0         10.0         10.0
1 2023-01-02    NaN         10.0         25.0
2 2023-01-03    NaN         10.0         25.0
3 2023-01-04   25.0         25.0         25.0
4 2023-01-05    NaN         25.0          NaN


  df['value_ffill'] = df['value'].fillna(method='ffill')
  df['value_bfill'] = df['value'].fillna(method='bfill')
