In [2]:
import pandas as pd
from sklearn.impute import KNNImputer
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets 
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes to switch rows and columns
df1_t = df1.T
df2_t = df2.T

# Initialize the KNNImputer with desired number of neighbors
imputer = KNNImputer(n_neighbors=3)

# Perform the imputation row-wise by applying KNNImputer to the transposed dataframe
df2_imputed_t = pd.DataFrame(imputer.fit_transform(df2_t), columns=df2_t.columns, index=df2_t.index)

# Transpose back to the original orientation
df2_imputed = df2_imputed_t.T

# Ensure the columns and index are in the correct order after transposition
df2_imputed.columns = df2.columns
df2_imputed.index = df2.index


# Print the original, missing, and imputed dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nImputed DataFrame (df2_imputed):")
print(df2_imputed)

# Measure the performance for continuous data
mse = mean_squared_error(df1, df2_imputed)
mae = mean_absolute_error(df1, df2_imputed)
r2 = r2_score(df1.values.flatten(), df2_imputed.values.flatten())

print("Mean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared (coefficient of determination):", r2)



Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
13      11       7.0       9.0

Imputed DataFrame (df2_imputed):
        2017      2018      201

In [6]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets 
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes to switch rows and columns
df1_t = df1.T
df2_t = df2.T

# Perform forward fill imputation on the transposed DataFrame
df2_t_filled = df2_t.ffill(axis=0)

# Transpose the DataFrames back to their original form
df1_filled = df1_t.T
df2_filled = df2_t_filled.T

# Print the original, missing, and forward-filled dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nForward-Filled DataFrame (df2_filled):")
print(df2_filled)

# Measure the performance for continuous data
mse_ffill = mean_squared_error(df1, df2_filled)
mae_ffill = mean_absolute_error(df1, df2_filled)
r2_ffill = r2_score(df1.values.flatten(), df2_filled.values.flatten())

print("Mean Squared Error (Forward Fill):", mse_ffill)
print("Mean Absolute Error (Forward Fill):", mae_ffill)
print("R-squared (coefficient of determination) (Forward Fill):", r2_ffill)



Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
13      11       7.0       9.0

Forward-Filled DataFrame (df2_filled):
        2017      2018   

In [7]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets 
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes to switch rows and columns
df1_t = df1.T
df2_t = df2.T

# Perform back fill imputation on the transposed DataFrame
df2_t_filled = df2_t.bfill(axis=0)

# Transpose the DataFrames back to their original form
df1_filled = df1_t.T
df2_filled = df2_t_filled.T

# Print the original, missing, and back-filled dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nBack-Filled DataFrame (df2_filled):")
print(df2_filled)

# Measure the performance for continuous data
mse_bfill = mean_squared_error(df1, df2_filled)
mae_bfill = mean_absolute_error(df1, df2_filled)
r2_bfill = r2_score(df1.values.flatten(), df2_filled.values.flatten())

print("Mean Squared Error (Back Fill):", mse_bfill)
print("Mean Absolute Error (Back Fill):", mae_bfill)
print("R-squared (coefficient of determination) (Back Fill):", r2_bfill)



Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
13      11       7.0       9.0

Back-Filled DataFrame (df2_filled):
        2017      2018      

ValueError: Input contains NaN.

In [8]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load your datasets (replace with actual file paths)
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes to switch rows and columns
df1_t = df1.T
df2_t = df2.T

# Perform back fill imputation on the transposed DataFrame
df2_t_filled = df2_t.bfill(axis=0)

# Ensure no NaN values remain after backfill
df2_t_filled = df2_t_filled.fillna(method='ffill', axis=0)

# Transpose the DataFrames back to their original form
df2_filled = df2_t_filled.T

# Print the original, missing, and back-filled dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nBack-Filled DataFrame (df2_filled):")
print(df2_filled)

# Measure the performance for continuous data
mse_bfill = mean_squared_error(df1, df2_filled)
mae_bfill = mean_absolute_error(df1, df2_filled)
r2_bfill = r2_score(df1.values.flatten(), df2_filled.values.flatten())

print("Mean Squared Error (Back Fill):", mse_bfill)
print("Mean Absolute Error (Back Fill):", mae_bfill)
print("R-squared (coefficient of determination) (Back Fill):", r2_bfill)



Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
13      11       7.0       9.0

Back-Filled DataFrame (df2_filled):
        2017      2018      

  df2_t_filled = df2_t_filled.fillna(method='ffill', axis=0)


In [9]:
#Ensure No NaN Values: After backfilling, use fillna with method='ffill' to ensure that any remaining NaN values (which would occur if a column starts with NaNs) are filled using forward fill.

In [10]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load the datasets 
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes to switch rows and columns
df1_t = df1.T
df2_t = df2.T

# Perform mean imputation
df2_t_filled = df2_t.fillna(df2_t.mean())

# Transpose the DataFrames back to their original form
df2_filled = df2_t_filled.T

# Print the original, missing, and mean-filled dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nMean-Filled DataFrame (df2_filled):")
print(df2_filled)

# Measure the performance for continuous data
mse_mean = mean_squared_error(df1, df2_filled)
mae_mean = mean_absolute_error(df1, df2_filled)
r2_mean = r2_score(df1.values.flatten(), df2_filled.values.flatten())

print("Mean Squared Error (Mean Imputation):", mse_mean)
print("Mean Absolute Error (Mean Imputation):", mae_mean)
print("R-squared (coefficient of determination) (Mean Imputation):", r2_mean)



Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
13      11       7.0       9.0

Mean-Filled DataFrame (df2_filled):
        2017      2018      

In [11]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load your datasets (replace with actual file paths)
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes to switch rows and columns
df1_t = df1.T
df2_t = df2.T

# Perform median imputation
df2_t_filled = df2_t.fillna(df2_t.median())

# Transpose the DataFrames back to their original form
df2_filled = df2_t_filled.T

# Print the original, missing, and median-filled dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nMedian-Filled DataFrame (df2_filled):")
print(df2_filled)

# Measure the performance for continuous data
mse_median = mean_squared_error(df1, df2_filled)
mae_median = mean_absolute_error(df1, df2_filled)
r2_median = r2_score(df1.values.flatten(), df2_filled.values.flatten())

print("Mean Squared Error (Median Imputation):", mse_median)
print("Mean Absolute Error (Median Imputation):", mae_median)
print("R-squared (coefficient of determination) (Median Imputation):", r2_median)



Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
13      11       7.0       9.0

Median-Filled DataFrame (df2_filled):
        2017      2018    