In [4]:
import pandas as pd
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

# Load your datasets (replace with actual file paths)
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Convert continuous data into categorical by discretizing into bins
# You may need to adjust the number of bins and their widths based on your data
num_bins = 10
bin_width = (df1.max() - df1.min()) / num_bins
bins = [df1.min() + i * bin_width for i in range(num_bins + 1)]

# Print the bins before sorting to debug
print("Bins before sorting:", bins)

# Ensure that the bins are sorted in ascending order
bins = sorted(bins)

# Print the bins after sorting to debug
print("Bins after sorting:", bins)

# Discretize the data
df1_discretized = pd.DataFrame()
df2_discretized = pd.DataFrame()

for col in df1.columns:
    df1_discretized[col] = pd.cut(df1[col], bins=bins, labels=[i for i in range(num_bins)])
    df2_discretized[col] = pd.cut(df2[col], bins=bins, labels=[i for i in range(num_bins)])

# Create a Naive Bayes classifier
classifier = GaussianNB()

# Impute missing values using Naive Bayes
for col in df2_discretized.columns:
    missing_indices = df2_discretized[col].isnull()
    if missing_indices.any():
        # Fit the classifier on complete data
        classifier.fit(df1_discretized[~missing_indices], df2_discretized[col][~missing_indices])
        # Predict missing values
        df2_discretized[col][missing_indices] = classifier.predict(df1_discretized[missing_indices])

# Evaluate performance using accuracy (optional for imputation)
# Here, we are comparing the imputed values with the original missing values
accuracy = accuracy_score(df2_discretized[df2.isnull()].values.flatten(), df2[df2.isnull()].values.flatten())

# Print the imputed dataframe
print("Imputed DataFrame (df2_imputed):")
print(df2_discretized)

# Print accuracy (optional for imputation)
print("Accuracy:", accuracy)



Bins before sorting: [2017    11.0
2018     5.0
2019     9.0
dtype: float64, 2017    19237.3
2018    20123.6
2019    26603.5
dtype: float64, 2017    38463.6
2018    40242.2
2019    53198.0
dtype: float64, 2017    57689.9
2018    60360.8
2019    79792.5
dtype: float64, 2017     76916.2
2018     80479.4
2019    106387.0
dtype: float64, 2017     96142.5
2018    100598.0
2019    132981.5
dtype: float64, 2017    115368.8
2018    120716.6
2019    159576.0
dtype: float64, 2017    134595.1
2018    140835.2
2019    186170.5
dtype: float64, 2017    153821.4
2018    160953.8
2019    212765.0
dtype: float64, 2017    173047.7
2018    181072.4
2019    239359.5
dtype: float64, 2017    192274.0
2018    201191.0
2019    265954.0
dtype: float64]


ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().