In [1]:
import copy
import pandas as pd
import numpy as np
import statsmodels.api as sm
import xgboost as xgb
from scipy.stats import chi2, poisson, norm, bernoulli
from sklearn.metrics import mean_absolute_percentage_error

# Missingness patterns

In [2]:
def generate_MAR(df, column_name, depend_column, prob=0.5):
    scaled_depend = (df[depend_column] - df[depend_column].min()) / (df[depend_column].max() - df[depend_column].min())
    prob = scaled_depend * (1 - prob)
    mask = np.random.rand(len(df)) < prob
    df[column_name] = df[column_name].mask(mask)
    return df

def generate_MCAR(df, column_name, prob=0.5):
    mask = np.random.rand(len(df)) < prob
    df[column_name] = df[column_name].mask(mask)
    return df


def generate_MNAR(df, column_name, prob=0.5):
    mean = df[column_name].mean()
    mask = np.random.rand(len(df)) < (prob * (df[column_name] > mean))
    df[column_name] = df[column_name].mask(mask)
    return df

#Imputation methods


In [3]:
import numpy as np
import statsmodels.api as sm

def regression_impute(sample, predictors, response, weight_col=None):
    observed = sample.dropna(subset=[response])
    missing = sample[sample[response].isna()]

    X_obs = sm.add_constant(observed[predictors].values)
    y_obs = observed[response].values

    if weight_col and weight_col in sample.columns:
        w_obs = observed[weight_col].values
        model = sm.WLS(y_obs, X_obs, weights=w_obs).fit()
    else:
        model = sm.OLS(y_obs, X_obs).fit()

    X_missing = sm.add_constant(missing[predictors].values)
    y_imputed = model.predict(X_missing)
    sample.loc[missing.index, response] = y_imputed

def knn_impute(sample, predictors, response, weight_col=None, k=5):
    observed = sample.dropna(subset=[response])
    missing = sample[sample[response].isna()]

    X_obs = observed[predictors].values
    y_obs = observed[response].values

    if weight_col and weight_col in sample.columns:
        w_obs = observed[weight_col].values
    else:
        w_obs = np.ones(len(y_obs))

    for idx, row in missing.iterrows():
        X_missing = row[predictors].values
        dists = np.linalg.norm(X_obs - X_missing, axis=1)
        nearest_indices = np.argsort(dists)[:k]

        nearest_y = y_obs[nearest_indices]
        nearest_weights = w_obs[nearest_indices]

        y_imputed = np.average(nearest_y, weights=nearest_weights)

        sample.at[idx, response] = y_imputed



def xgboost_impute(sample, response, weight_col=None,
                   n_estimators=50, max_depth=3, learning_rate=0.1):
    observed = sample.dropna(subset=[response])
    missing = sample[sample[response].isna()]

    X_obs = observed.drop(columns=[response])
    y_obs = observed[response]

    if weight_col and weight_col in observed.columns:
        w_obs = observed[weight_col]
    else:
        w_obs = None

    model = xgb.XGBRegressor(
        objective="reg:squarederror",
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate
    )

    model.fit(X_obs, y_obs, sample_weight=w_obs)

    X_missing = missing.drop(columns=[response])
    imputed_values = model.predict(X_missing)

    sample.loc[sample[response].isna(), response] = imputed_values
    return sample

#Study simulation

In [4]:
import numpy as np
import pandas as pd
from scipy.stats import beta, gamma, poisson, norm, chi2, expon, lognorm, uniform, binom

np.random.seed(1)

population_size = 10000
num_populations = 10
sample_size = 5000

populations = []
samples = []
MCAR_samples = []
MAR_samples = []
MNAR_samples = []

for pop in range(num_populations):
    X1 = np.random.normal(1, 1.5, population_size)
    X2 = beta.rvs(2, 2, size=population_size)
    X3 = 1.5 * gamma.rvs(4, scale=1.5, size=population_size)
    X4 = poisson.rvs(0.6, size=population_size)
    X5 = np.random.normal(1, 2, population_size)
    X6 = chi2.rvs(df=3, size=population_size)
    X7 = expon.rvs(scale=1.0, size=population_size)
    X8 = lognorm.rvs(s=0.5, scale=np.exp(0), size=population_size)
    X9 = uniform.rvs(loc=0, scale=1, size=population_size)
    X10 = binom.rvs(n=1, p=0.3, size=population_size)

    # Standardize all predictors
    predictors_raw = [X1, X2, X3, X4, X5, X6, X7, X8, X9, X10]
    predictors_std = [(x - x.mean()) / x.std() for x in predictors_raw]
    X1, X2, X3, X4, X5, X6, X7, X8, X9, X10 = predictors_std

    # Generate continuous survey variables Y1 and Y2 using all predictors
    Y1 = (
        1
        + 1.5*X1 * 1.2*X2 + 1.8*X3**2 + 1.0*X4 + 0.5*X5
        + 1.1*X6 + 0.9*X7**3 + 1.3*X8 + 0.8*X9 + 0.6*X10
        + norm.rvs(0, 1, size=population_size)
    )

    Y2 = (
        2
        + 1.0*X1 + 1.4*X2 + 1.7*X3 + 0.6*X4 + 0.7*X5
        + 1.0*X6 + 1.2*X7 + 1.1*X8 + 0.9*X9 + 0.5*X10
        + norm.rvs(0, 1, size=population_size)
    )

    # Create DataFrame
    population = pd.DataFrame({
        "X1": X1, "X2": X2, "X3": X3, "X4": X4, "X5": X5,
        "X6": X6, "X7": X7, "X8": X8, "X9": X9, "X10": X10,
        "Y1": Y1, "Y2": Y2
    })
    population_totals = population[["Y1", "Y2"]].sum(axis=0)
    populations.append(population)

    # Simple random sampling without replacement
    sample_indices = np.random.choice(population.index, size=sample_size, replace=False)
    sample = population.loc[sample_indices].copy()

    # Store the original sample
    samples.append(sample)

# Updated predictor and response lists
predictors = ['X1', 'X2', 'X3', 'X4', 'X5', 'X6', 'X7', 'X8', 'X9', 'X10']
responses = ['Y1', 'Y2']


# Generate MAR versions of the samples
for sample in samples:
    sample_mcar = sample.copy()
    sample_mar = sample.copy()
    sample_mnar = sample.copy()
    for Y in responses:
        sample_mcar = generate_MCAR(sample_mcar, Y, prob = 0.5)
        sample_mar = generate_MAR(sample_mar, Y, predictors[0], prob = 0.3)
        sample_mnar = generate_MNAR(sample_mnar, Y, prob = 0.3)
    MCAR_samples.append(sample_mcar)
    MAR_samples.append(sample_mar)
    MNAR_samples.append(sample_mnar)

MNAR_samples[0]

Unnamed: 0,X1,X2,X3,X4,X5,X6,X7,X8,X9,X10,Y1,Y2
8001,-1.245272,0.221468,-0.316009,1.790383,-0.944173,0.416511,0.360427,-0.338156,-1.519109,-0.670750,-0.062315,-1.449466
8376,0.519689,0.063045,-0.196507,-0.764752,-1.657840,1.417921,-0.350775,-0.194769,-0.278229,1.490868,3.180516,2.767672
31,-0.407020,1.880996,-1.184623,0.512816,0.275122,-0.370714,0.022868,-0.868905,-1.176304,1.490868,-0.721412,-0.195494
6807,-1.272893,0.006901,0.959576,0.512816,1.138544,-0.709638,-0.942857,-1.453623,-1.651033,-0.670750,-0.539831,-1.461670
3807,-1.157754,0.063455,-1.109976,0.512816,-0.407259,-0.901299,0.361380,-0.674990,0.426234,-0.670750,1.963904,-1.953041
...,...,...,...,...,...,...,...,...,...,...,...,...
639,-0.417532,0.169174,-0.928346,0.512816,0.165602,0.165496,0.657754,-0.608932,-0.022926,1.490868,6.387652,0.824746
3782,-0.614073,-1.590622,0.052474,1.790383,-1.006717,-0.866121,-0.439792,-0.868051,0.982137,-0.670750,-0.106673,-3.802211
8476,1.004606,0.139095,-1.354475,1.790383,0.690187,0.830124,0.310248,1.970981,-0.419749,-0.670750,9.676107,5.755768
8298,-0.439296,-0.365211,-0.701570,-0.764752,-0.196449,-0.289082,-0.726256,-0.914082,-0.604336,-0.670750,-1.308972,-3.859981


##MCAR

In [9]:
mcar_im_samples = []
len(mcar_im_samples)

0

###Regression Imputation

In [10]:
regression_samples = copy.deepcopy(MCAR_samples)
for sample in regression_samples:
    for response in responses:
        regression_impute(sample, predictors, response)
mcar_im_samples.append(regression_samples)

###KNN1 imputation

In [11]:
knn1_samples = copy.deepcopy(MCAR_samples)
import copy
from sklearn.impute import KNNImputer
import pandas as pd

knn1_imputed_samples = []
for sample in knn1_samples:
    imputer = KNNImputer(n_neighbors=2)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn1_imputed_samples.append(imputed_df)

mcar_im_samples.append(knn1_imputed_samples)

###KNN2 imputation

In [12]:
knn2_samples = copy.deepcopy(MCAR_samples)

knn2_imputed_samples = []
for sample in knn2_samples:
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn2_imputed_samples.append(imputed_df)

mcar_im_samples.append(knn2_imputed_samples)

###XGboost1 imputation

In [13]:
xgboost1_samples = copy.deepcopy(MCAR_samples)
for sample in xgboost1_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=50, max_depth=3, learning_rate=0.3)
mcar_im_samples.append(xgboost1_samples)

###XGboost2 imputation

In [14]:
xgboost2_samples = copy.deepcopy(MCAR_samples)
for sample in xgboost2_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=75, max_depth=10, learning_rate=0.2)
mcar_im_samples.append(xgboost2_samples)

###XGboost3 imputation

In [15]:
xgboost3_samples = copy.deepcopy(MCAR_samples)
for sample in xgboost3_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=100, max_depth=20, learning_rate=0.1)
mcar_im_samples.append(xgboost3_samples)

##MAR

In [16]:
mar_im_samples = []
len(mar_im_samples)

0

###Regression Imputation

In [17]:
regression_samples = copy.deepcopy(MAR_samples)
for sample in regression_samples:
    for response in responses:
        regression_impute(sample, predictors, response)
mar_im_samples.append(regression_samples)

###KNN1 imputation

In [18]:
knn1_samples = copy.deepcopy(MAR_samples)
import copy
from sklearn.impute import KNNImputer
import pandas as pd

knn1_imputed_samples = []
for sample in knn1_samples:
    imputer = KNNImputer(n_neighbors=2)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn1_imputed_samples.append(imputed_df)

mar_im_samples.append(knn1_imputed_samples)

###KNN2 imputation

In [19]:
knn2_samples = copy.deepcopy(MCAR_samples)
knn2_imputed_samples = []

for sample in knn2_samples:
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn2_imputed_samples.append(imputed_df)

mar_im_samples.append(knn2_imputed_samples)

###XGboost1 imputation

In [20]:
xgboost1_samples = copy.deepcopy(MAR_samples)
for sample in xgboost1_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=50, max_depth=3, learning_rate=0.3)
mar_im_samples.append(xgboost1_samples)

###XGboost2 imputation

In [21]:
xgboost2_samples = copy.deepcopy(MAR_samples)
for sample in xgboost2_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=75, max_depth=10, learning_rate=0.2)
mar_im_samples.append(xgboost2_samples)

###XGboost3 imputation

In [22]:
xgboost3_samples = copy.deepcopy(MAR_samples)
for sample in xgboost3_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=100, max_depth=50, learning_rate=0.1)
mar_im_samples.append(xgboost3_samples)

##MNAR

In [23]:
mnar_im_samples = []

###Regression Imputation

In [24]:
knn_samples = copy.deepcopy(MNAR_samples)
knn_imputed_samples = []

for sample in knn_samples:
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn_imputed_samples.append(imputed_df)

mnar_im_samples.append(knn_imputed_samples)

###KNN1 imputation

In [25]:
knn1_samples = copy.deepcopy(MNAR_samples)
import copy
from sklearn.impute import KNNImputer
import pandas as pd

knn1_imputed_samples = []
for sample in knn_samples:
    imputer = KNNImputer(n_neighbors=2)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn1_imputed_samples.append(imputed_df)

mnar_im_samples.append(knn1_imputed_samples)

###KNN2 imputation

In [26]:
knn2_samples = copy.deepcopy(MNAR_samples)

knn2_imputed_samples = []
for sample in knn_samples:
    imputer = KNNImputer(n_neighbors=5)
    imputed = imputer.fit_transform(sample)
    imputed_df = pd.DataFrame(imputed, columns=sample.columns, index=sample.index)
    knn2_imputed_samples.append(imputed_df)

mnar_im_samples.append(knn2_imputed_samples)

###XGboost1 imputation

In [27]:
xgboost1_samples = copy.deepcopy(MNAR_samples)
for sample in xgboost1_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=50, max_depth=3, learning_rate=0.3)
mnar_im_samples.append(xgboost1_samples)

###XGboost2 imputation

In [28]:
xgboost2_samples = copy.deepcopy(MNAR_samples)
for sample in xgboost2_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=75, max_depth=10, learning_rate=0.2)
mnar_im_samples.append(xgboost2_samples)

###XGboost3 imputation

In [29]:
xgboost3_samples = copy.deepcopy(MNAR_samples)
for sample in xgboost3_samples:
    for response in responses:
        xgboost_impute(sample, response, n_estimators=100, max_depth=20, learning_rate=0.1)
mnar_im_samples.append(xgboost3_samples)

##Results

###Relative bias(RB)

In [31]:
import numpy as np
import pandas as pd

def compute_RB(true_value, imputed_total_estim):
    RB = np.abs((imputed_total_estim - true_value) / true_value) * 100
    return RB

# Store detailed stats per pattern
detailed_stats = {
    "MCAR": {},
    "MAR": {},
    "MNAR": {}
}

# Initialize results dictionary for final summary
results = {
    "Regression": [],
    "K-NN1": [],
    "K-NN2": [],
    "XGboost1": [],
    "XGboost2": [],
    "XGboost3": []
}

# Iterate through missing data patterns
for missing_pattern, m_im_samples in zip(["MCAR", "MAR", "MNAR"],
                                         [mcar_im_samples, mar_im_samples, mnar_im_samples]):
    pattern_stats = {}
    for method_name, imputed_samples in zip(["Regression", "K-NN1", "K-NN2", "XGboost1", "XGboost2", "XGboost3"], m_im_samples):
        RB_values = []
        for sample, imputed_sample in zip(samples, imputed_samples):
            for response in responses:
                true_value = population_totals[response]
                imputed_values = imputed_sample[response].values
                imputed_total_estim = sum(imputed_values) * population_size / sample_size
                RB_values.append(compute_RB(true_value, imputed_total_estim))

        # Save summary stats for this method
        RB_series = pd.Series(RB_values)
        pattern_stats[method_name] = {
            "Min": RB_series.min(),
            "25%": RB_series.quantile(0.25),
            "Mean": RB_series.mean(),
            "75%": RB_series.quantile(0.75),
            "Max": RB_series.max()
        }

        # Also add mean to final summary table
        results[method_name].append(RB_series.mean())

    # Convert pattern stats to DataFrame and store
    detailed_stats[missing_pattern] = pd.DataFrame(pattern_stats).T[
        ["Min", "25%", "Mean", "75%", "Max"]
    ]
    detailed_stats[missing_pattern].index.name = "Method"
    detailed_stats[missing_pattern].columns.name = f"{missing_pattern} RB Summary (%)"

# Create final mean-only table
df_summary = pd.DataFrame(results, index=["MCAR", "MAR", "MNAR"]).T
df_summary.columns.name = "Missing Pattern"
df_summary.index.name = "Method"

# --- Display tables ---
for pattern in ["MCAR", "MAR", "MNAR"]:
    print(f"\nDetailed RB Summary for {pattern}:")
    print(detailed_stats[pattern].round(2))

print("\nMC-APRB (%):")
print(df_summary.round(2))



Detailed RB Summary for MCAR:
MCAR RB Summary (%)   Min   25%   Mean    75%    Max
Method                                              
Regression           0.12  0.81   3.54   4.39  11.31
K-NN1                5.63  8.65  11.14  12.99  21.99
K-NN2                7.52  9.96  12.71  14.73  23.93
XGboost1             0.19  1.02   3.14   3.48  15.44
XGboost2             0.05  0.46   3.15   4.02  17.76
XGboost3             0.01  0.42   3.06   3.60  19.32

Detailed RB Summary for MAR:
MAR RB Summary (%)   Min   25%   Mean    75%    Max
Method                                             
Regression          0.08  0.76   2.12   2.97   8.72
K-NN1               4.17  7.63   8.95   9.67  16.80
K-NN2               7.52  9.96  12.71  14.73  23.93
XGboost1            0.43  1.36   2.72   3.59   7.70
XGboost2            0.12  1.10   2.43   2.89   7.54
XGboost3            0.70  1.32   2.38   3.09   6.62

Detailed RB Summary for MNAR:
MNAR RB Summary (%)   Min   25%   Mean    75%    Max
Method         

###Root Mean Squared Error (RMSE)

In [32]:
import numpy as np
import pandas as pd

def compute_RMSE(true_value, imputed_total_estim):
    RMSE = np.sqrt((imputed_total_estim - true_value) ** 2)
    return RMSE

# Store detailed stats per pattern
detailed_rmse_stats = {
    "MCAR": {},
    "MAR": {},
    "MNAR": {}
}

# Initialize results dictionary for final summary
rmse_results = {
    "Regression": [],
    "K-NN1": [],
    "K-NN2": [],
    "XGboost1": [],
    "XGboost2": [],
    "XGboost3": []
}

# Iterate through missing data patterns
for missing_pattern, m_im_samples in zip(["MCAR", "MAR", "MNAR"],
                                         [mcar_im_samples, mar_im_samples, mnar_im_samples]):
    pattern_stats = {}
    for method_name, imputed_samples in zip(["Regression", "K-NN1", "K-NN2", "XGboost1", "XGboost2", "XGboost3"], m_im_samples):
        RMSE_values = []
        for sample, imputed_sample in zip(samples, imputed_samples):
            for response in responses:
                true_value = population_totals[response]
                imputed_values = imputed_sample[response].values
                imputed_total_estim = sum(imputed_values) * population_size / sample_size
                RMSE_values.append(compute_RMSE(true_value, imputed_total_estim))

        # Save summary stats for this method
        RMSE_series = pd.Series(RMSE_values)
        pattern_stats[method_name] = {
            "Min": RMSE_series.min(),
            "25%": RMSE_series.quantile(0.25),
            "Mean": RMSE_series.mean(),
            "75%": RMSE_series.quantile(0.75),
            "Max": RMSE_series.max()
        }

        # Also add mean to final summary table
        rmse_results[method_name].append(RMSE_series.mean())

    # Convert pattern stats to DataFrame and store
    detailed_rmse_stats[missing_pattern] = pd.DataFrame(pattern_stats).T[
        ["Min", "25%", "Mean", "75%", "Max"]
    ]
    detailed_rmse_stats[missing_pattern].index.name = "Method"
    detailed_rmse_stats[missing_pattern].columns.name = f"{missing_pattern} RMSE Summary"

# Create final mean-only table
rmse_summary = pd.DataFrame(rmse_results, index=["MCAR", "MAR", "MNAR"]).T
rmse_summary.columns.name = "Missing Pattern"
rmse_summary.index.name = "Method"

# --- Display tables ---
for pattern in ["MCAR", "MAR", "MNAR"]:
    print(f"\nDetailed RMSE Summary for {pattern}:")
    print(detailed_rmse_stats[pattern].round(2))

print("\nMean RMSE Summary:")
print(rmse_summary.round(2))



Detailed RMSE Summary for MCAR:
MCAR RMSE Summary      Min      25%     Mean      75%       Max
Method                                                         
Regression           52.07   294.08  1368.56  1996.61   5148.46
K-NN1              1132.02  1924.31  3902.60  5509.39  10011.60
K-NN2              1513.02  2142.89  4465.46  6706.57  10893.55
XGboost1             37.68   348.99  1198.62  1339.39   7028.85
XGboost2             17.28   103.79  1256.73  1792.12   8084.81
XGboost3              5.20    84.06  1245.05  1243.73   8794.90

Detailed RMSE Summary for MAR:
MAR RMSE Summary      Min      25%     Mean      75%       Max
Method                                                        
Regression          36.99   232.15   748.49   722.31   3970.83
K-NN1              838.22  1712.03  3038.58  4268.34   7646.40
K-NN2             1513.02  2142.89  4465.46  6706.57  10893.55
XGboost1            86.93   404.49   950.59  1005.34   3504.11
XGboost2            24.72   321.75   851.59  

###Boxplot

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Prepare data for plotting
rmse_plot_data = []

# Loop again through the same logic to gather RMSEs for each method and pattern
for pattern_name, m_im_samples in zip(["MCAR", "MAR", "MNAR"],
                                      [mcar_im_samples, mar_im_samples, mnar_im_samples]):
    for method_name, imputed_samples in zip(["Regression", "K-NN1", "K-NN2", "XGboost1", "XGboost2", "XGboost3"], m_im_samples):
        for sample, imputed_sample in zip(samples, imputed_samples):
            for response in responses:
                true_value = population_totals[response]
                imputed_values = imputed_sample[response].values
                imputed_total_estim = sum(imputed_values) * population_size / sample_size
                rmse = compute_RMSE(true_value, imputed_total_estim)
                rmse_plot_data.append({
                    "Method": method_name,
                    "RMSE": rmse,
                    "Pattern": pattern_name
                })

# Convert to DataFrame for plotting
rmse_df = pd.DataFrame(rmse_plot_data)

# Set seaborn style
sns.set(style="whitegrid")

# Create the boxplot
plt.figure(figsize=(12, 6))
sns.boxplot(data=rmse_df, x="Method", y="RMSE", hue="Pattern", palette="Set2")

# Improve plot formatting
plt.title("RMSE by Imputation Method and Missingness Pattern")
plt.xticks(rotation=45)
plt.ylabel("RMSE")
plt.xlabel("Imputation Method")
plt.legend(title="Missingness Pattern")
plt.tight_layout()

plt.show()


#Real data

In [None]:
from google.colab import drive
drive.mount('/content/drive')

MessageError: Error: credential propagation was unsuccessful

In [None]:
import pandas as pd

farming_df = pd.read_csv('/content/drive/My Drive/Диплом/farming.csv', sep=';',dtype= str)
farming_df = farming_df.replace({',': '.'}, regex=True)
farming_df = farming_df.apply(pd.to_numeric, errors='coerce')
predictors = ['EFF10', 'FOAGR', 'TACF1']
response = 'RESFI'
farming_df = farming_df[predictors + [response]]

farming_df.head()

Unnamed: 0,EFF10,FOAGR,TACF1,RESFI
0,0,0,592548.09,269.41
1,0,2,333902.0,1698.0
2,0,2,1436725.0,-9330.0
3,0,3,292576.2,124.84
4,5,1,2531437.0,-12046.0


In [None]:
farming_df = generate_MCAR(farming_df, response, prob=0.5)
farming_df

Unnamed: 0,EFF10,FOAGR,TACF1,RESFI
0,0,0,592548.09,
1,0,2,333902.00,1698.00
2,0,2,1436725.00,
3,0,3,292576.20,
4,5,1,2531437.00,
...,...,...,...,...
7317,0,2,543107.14,
7318,0,2,225292.53,
7319,1,2,202261.71,-1102.03
7320,4,3,670874.58,-4258.09


In [None]:
xgboost_impute(farming_df, response)

Unnamed: 0,EFF10,FOAGR,TACF1,RESFI
0,0,0,592548.09,-649.916077
1,0,2,333902.00,1698.000000
2,0,2,1436725.00,-9051.319336
3,0,3,292576.20,-1182.251831
4,5,1,2531437.00,-13672.304688
...,...,...,...,...
7317,0,2,543107.14,-936.613220
7318,0,2,225292.53,-743.067078
7319,1,2,202261.71,-1102.030000
7320,4,3,670874.58,-4258.090000


##Results