In [None]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator``, or 0.0 when the denominator is zero."""
    return numerator / denominator if denominator else 0.0


def miss_forest(df, startIndex=0, numCols=None, partitions=10, rounds=5):
    """Impute missing values column by column with random forests.

    Every column is first filled with its mean (numeric columns) or its mode
    (all other columns). The index range ``[startIndex, max(df.index)]`` is
    then cut into ``partitions`` equal-width intervals. For each column that
    has missing values and each interval, a forest is fitted on the rows whose
    index label lies *before* the interval and where the column is observed,
    and its predictions replace the missing cells inside the interval.
    Missing cells located before ``startIndex`` keep their mean/mode fill.

    Passes are repeated while the change score of the previous pass is
    ``<= 0`` and the pass number is ``<= rounds`` (so at most ``rounds + 1``
    passes). The change score is the product of a categorical ratio and a
    numeric ratio; a ratio whose denominator is zero counts as 0, which means
    that data with only one kind of imputed column runs until the ``rounds``
    cap. A pass that imputes no cell ends the loop immediately, because the
    row selection depends only on ``df`` and would be the same in every
    later pass.

    Args:
        df: DataFrame to impute; it is not modified. The index labels must be
            comparable with ``startIndex`` and with the numeric interval
            bounds. Presumably all feature columns are already numerically
            encoded, since they are passed to scikit-learn as-is -- confirm
            with the caller.
        startIndex: Index label where the partitioned range begins.
        numCols: Names of the columns treated as numeric (mean fill,
            ``RandomForestRegressor``). All other columns are treated as
            categorical (mode fill, ``RandomForestClassifier``). ``None`` is
            the same as an empty list.
        partitions: Number of intervals the index range is split into.
        rounds: Upper bound for the pass counter (passes are numbered from 0).

    Returns:
        A tuple ``(X_new, evals)``. ``X_new`` is the imputed copy of ``df``.
        ``evals[pass_no][kind][column]`` with ``kind`` in ``{'cat', 'num'}``
        is a dict with the lists ``'true'`` and ``'pred'``: one entry per
        processed partition that contains observed rows, holding the observed
        values and the forest output for those rows (``predict`` for numeric
        columns, ``predict_proba`` for categorical ones).
    """
    numCols = [] if numCols is None else list(numCols)
    catCols = [col for col in df.columns if col not in numCols]
    endIndex = max(df.index)

    # Initial fill: column mean for numeric columns, first mode otherwise.
    X_new = df.copy()
    if numCols:
        X_new.loc[:, numCols] = X_new[numCols].fillna(X_new[numCols].mean())
    if catCols:
        modes = X_new[catCols].mode()
        # mode() has no rows when every categorical column is entirely NaN.
        if not modes.empty:
            X_new.loc[:, catCols] = X_new[catCols].fillna(modes.iloc[0].to_dict())

    evals = {}
    change = 0
    round_no = 0
    # Columns ordered by how many values are missing from startIndex onwards,
    # fewest first.
    columns_by_missing = df.loc[startIndex:].isna().sum().sort_values().index

    while change <= 0 and round_no <= rounds:
        evals[round_no] = {'cat': {}, 'num': {}}
        cat_num = 0
        cat_den = 0
        num_num = 0
        num_den = 0
        imputed_cells = 0
        # Features and previous estimates are read from the state at the
        # start of the pass; predictions are written into X_new.
        X_old = X_new.copy()

        for s in columns_by_missing:
            is_numeric = s in numCols
            category = 'num' if is_numeric else 'cat'
            col_eval = {'true': [], 'pred': []}
            evals[round_no][category][s] = col_eval

            obsCols = X_old.columns[X_old.columns != s]
            obsIndex = df[df[s].notna()].index
            misIndex = df[df[s].isna()].index
            if len(misIndex) <= 0 or len(obsIndex) <= 0:
                continue
            print(f"{s} ({category}): obs{len(obsIndex)} mis{len(misIndex)}")

            # NOTE(review): both interval bounds are treated as inclusive
            # below, so a row whose label equals an interior break point is
            # handled in two consecutive partitions -- confirm this is wanted.
            for interval in pd.interval_range(start=startIndex, end=endIndex, periods=partitions):
                print(".", end="")
                in_interval_mis = (misIndex <= interval.right) & (misIndex >= interval.left)
                in_interval_obs = (obsIndex <= interval.right) & (obsIndex >= interval.left)
                i_obs = obsIndex[obsIndex < interval.left]  # training rows: strictly earlier
                i_mis = misIndex[in_interval_mis]
                i_eval = obsIndex[in_interval_obs]
                assert df.loc[i_obs, s].isna().sum() == 0
                assert df.loc[i_eval, s].isna().sum() == 0
                assert df.loc[i_mis, s].isna().sum() == len(df.loc[i_mis, s])

                if len(i_obs) <= 0 or len(i_mis) <= 0:
                    continue

                forest_cls = RandomForestRegressor if is_numeric else RandomForestClassifier
                estimator = forest_cls(
                    n_estimators=100,
                    random_state=0,
                    max_depth=4,
                    oob_score=False,
                    n_jobs=-1,
                    warm_start=False,
                )
                estimator.fit(X_old.loc[i_obs, obsCols], X_old.loc[i_obs, s])
                X_new.loc[i_mis, s] = estimator.predict(X_old.loc[i_mis, obsCols])
                imputed_cells += len(i_mis)

                previous = X_old.loc[i_mis, s]
                current = X_new.loc[i_mis, s]
                if is_numeric:
                    # NOTE(review): the squared change is divided element-wise
                    # by the squared previous value and the total is later
                    # divided again by the summed squared previous values;
                    # kept as found -- confirm this is the intended score.
                    num_num += sum((current - previous) ** 2 / previous ** 2)
                    num_den += sum(previous ** 2)
                else:
                    cat_num += sum(current == previous)
                    cat_den += len(i_mis)

                # Only evaluate when the partition has observed rows; the
                # estimators are not called with an empty feature matrix.
                if len(i_eval) > 0:
                    col_eval['true'].append(X_old.loc[i_eval, s])
                    if is_numeric:
                        col_eval['pred'].append(estimator.predict(X_old.loc[i_eval, obsCols]))
                    else:
                        col_eval['pred'].append(estimator.predict_proba(X_old.loc[i_eval, obsCols]))
            print(":")

        if imputed_cells == 0:
            # Nothing could be imputed; later passes would be identical.
            break
        change = _safe_ratio(cat_num, cat_den) * _safe_ratio(num_num, num_den)
        round_no += 1
    return X_new, evals