In [40]:
# Pinned dependencies for this notebook. Uncomment to install them into the kernel's environment:
# %pip install miceforest==5.7.0 pyproject-toml==0.0.10 lightgbm==4.3.0 pandas==2.2.1 scikit-learn==1.4.1.post1


In [72]:
import os
import re

import miceforest as mf
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder

Loading Data

In [73]:
# Location of the attendance CSV. Set the ATTENDANCE_CSV environment variable
# to point at a different copy; when it is unset, the original local path is
# used so existing runs behave exactly as before.
DATA_PATH = os.environ.get(
    "ATTENDANCE_CSV",
    "/Users/sanjju/projects/datasets/Students-attendance-data.csv",
)

df = pd.read_csv(DATA_PATH)

Data Preprocessing

In [74]:
# Coerce the count columns to numeric; entries that cannot be parsed become
# NaN (errors='coerce') instead of raising.
count_cols = ['Enrolled', 'Absent', 'Present', 'Released']
df[count_cols] = df[count_cols].apply(pd.to_numeric, errors='coerce')

# Probability that any single cell is blanked out to simulate missing data.
nan_prob = 0.05  # Adjust as needed

# Seeded generator so the injected missingness pattern is identical on every
# Restart & Run All. The unseeded global np.random state produced a different
# set of NaNs (and therefore different imputations) on each run.
nan_seed = 1991
rng = np.random.default_rng(nan_seed)

# Skip the first column so it is never blanked out.
for col in df.columns[1:]:
    # Integer columns cannot hold NaN. Cast them to float explicitly rather
    # than relying on silent upcasting during assignment, which pandas 2.1+
    # reports as deprecated.
    if pd.api.types.is_integer_dtype(df[col]):
        df[col] = df[col].astype('float64')
    mask = rng.random(len(df)) < nan_prob
    df.loc[mask, col] = np.nan

print(df)

       School DBN        Date  Enrolled  Absent  Present  Released
0          01M015  20180905.0     172.0    19.0    153.0       0.0
1          01M015  20180906.0     171.0    17.0    154.0       0.0
2          01M015  20180907.0     172.0    14.0    158.0       0.0
3          01M015  20180912.0     173.0     7.0    166.0       0.0
4          01M015  20180913.0     173.0     9.0    164.0       0.0
...           ...         ...       ...     ...      ...       ...
277148     79X695  20190620.0     230.0    46.0    136.0      48.0
277149     79X695  20190621.0     226.0    53.0    128.0      45.0
277150     79X695  20190624.0     226.0    42.0    130.0      54.0
277151     79X695  20190625.0     226.0    56.0    127.0      43.0
277152     79X695  20190626.0     216.0    27.0     68.0     121.0

[277153 rows x 6 columns]


In [75]:
# Number of missing entries in each column of df.
df.isna().sum()

School DBN        0
Date          13852
Enrolled      13931
Absent        14003
Present       13855
Released      13852
dtype: int64

In [76]:
# Show the dtype of each column of df.
df.dtypes


School DBN     object
Date          float64
Enrolled      float64
Absent        float64
Present       float64
Released      float64
dtype: object

MICE Imputation

In [77]:
# Everything after the first column is handed to the imputer; the first
# column is left out of the imputation input.
attendance_features = df.iloc[:, 1:]

# Build the imputation kernel with a fixed seed, keeping the imputed values
# from every iteration.
kernel = mf.ImputationKernel(
    attendance_features,
    save_all_iterations=True,
    random_state=1991,
)

# Run two MICE iterations.
kernel.mice(2)

# Extract the completed dataset from the kernel.
imputed_df = kernel.complete_data()


Combining Data

In [78]:
# Prepend the 'School DBN' column to the imputed values.
#
# Any 'School DBN' column already present in imputed_df is dropped first, so
# re-running this cell yields the same six-column frame. The original concat
# reassigned imputed_df from itself and added one more identifier column on
# every run.
#
# NOTE(review): concat with axis=1 aligns on the row index; this assumes
# imputed_df still carries the same index as df. Confirm if either frame is
# ever filtered or re-indexed upstream.
school_ids = df[['School DBN']]
imputed_df = pd.concat(
    [school_ids, imputed_df.drop(columns=school_ids.columns, errors='ignore')],
    axis=1,
)

In [79]:
# Display imputed_df as the cell output.
imputed_df

Unnamed: 0,School DBN,Date,Enrolled,Absent,Present,Released
0,01M015,20180905.0,172.0,19.0,153.0,0.0
1,01M015,20180906.0,171.0,17.0,154.0,0.0
2,01M015,20180907.0,172.0,14.0,158.0,0.0
3,01M015,20180912.0,173.0,7.0,166.0,0.0
4,01M015,20180913.0,173.0,9.0,164.0,0.0
...,...,...,...,...,...,...
277148,79X695,20190620.0,230.0,46.0,136.0,48.0
277149,79X695,20190621.0,226.0,53.0,128.0,45.0
277150,79X695,20190624.0,226.0,42.0,130.0,54.0
277151,79X695,20190625.0,226.0,56.0,127.0,43.0


In [80]:
# Number of missing entries remaining in each column of imputed_df.
imputed_df.isna().sum()

School DBN    0
Date          0
Enrolled      0
Absent        0
Present       0
Released      0
dtype: int64