In [1]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load your datasets (replace with actual file paths)
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes so that every original row becomes a column.
# Column-wise statistics on df2_t are therefore per-original-row statistics
# (i.e. computed across the year columns of a single record).
df1_t = df1.T  # not used further in this cell; kept so the kernel state is unchanged
df2_t = df2.T

# Check for missing values
missing_columns = df2_t.columns[df2_t.isnull().any()]
if missing_columns.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:", missing_columns)

    # Mean of every affected column (NaN values are skipped by default).
    column_means = df2_t[missing_columns].mean()

    # Fill each column with its own mean and assign the result back.
    # The previous `df2_t[col].fillna(..., inplace=True)` acted on a chained
    # selection: pandas >= 2.2 warns about it and, with Copy-on-Write enabled
    # (the default from pandas 3.0), it leaves df2_t unmodified.
    # Passing a Series to DataFrame.fillna fills column-by-column, matching
    # the Series index against the column labels.
    df2_t = df2_t.fillna(column_means)

# Transpose back to the original orientation
df2_imputed = df2_t.T

# Ensure the columns and index are in the correct order after transposition
df2_imputed.columns = df2.columns
df2_imputed.index = df2.index

# Round the values in the imputed DataFrame to not show decimal places
df2_imputed = df2_imputed.round(0)  # Change the argument to adjust decimal places

# Measure the performance for continuous data.
# NOTE(review): the metrics are computed over every cell of the frames,
# including cells that were never missing, not only over the imputed cells.
mse = mean_squared_error(df1, df2_imputed)
mae = mean_absolute_error(df1, df2_imputed)
r2 = r2_score(df1.values.flatten(), df2_imputed.values.flatten())

print("\nMean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared (coefficient of determination):", r2)

# Print the original, missing, and imputed dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nImputed DataFrame (df2_imputed):")
print(df2_imputed)


Columns with missing values: Index([3, 4, 7, 9, 12], dtype='int64')

Mean Squared Error: 202350.7380952381
Mean Absolute Error: 141.11904761904762
R-squared (coefficient of determination): 0.999936088872679

Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141   

In [2]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Load your datasets (replace with actual file paths)
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Transpose the dataframes so that every original row becomes a column.
# Column-wise statistics on df2_t are therefore per-original-row statistics
# (i.e. computed across the year columns of a single record).
df1_t = df1.T  # not used further in this cell; kept so the kernel state is unchanged
df2_t = df2.T

# Check for missing values
missing_columns = df2_t.columns[df2_t.isnull().any()]
if missing_columns.empty:
    print("No missing values found.")
else:
    print("Columns with missing values:", missing_columns)

    # Median of every affected column (NaN values are skipped by default).
    column_medians = df2_t[missing_columns].median()

    # Fill each column with its own median and assign the result back.
    # The previous `df2_t[col].fillna(..., inplace=True)` acted on a chained
    # selection: pandas >= 2.2 warns about it and, with Copy-on-Write enabled
    # (the default from pandas 3.0), it leaves df2_t unmodified.
    # Passing a Series to DataFrame.fillna fills column-by-column, matching
    # the Series index against the column labels.
    df2_t = df2_t.fillna(column_medians)

# Transpose back to the original orientation
df2_imputed = df2_t.T

# Ensure the columns and index are in the correct order after transposition
df2_imputed.columns = df2.columns
df2_imputed.index = df2.index

# Round the values in the imputed DataFrame to not show decimal places
df2_imputed = df2_imputed.round(0)  # Change the argument to adjust decimal places

# Measure the performance for continuous data.
# NOTE(review): the metrics are computed over every cell of the frames,
# including cells that were never missing, not only over the imputed cells.
mse = mean_squared_error(df1, df2_imputed)
mae = mean_absolute_error(df1, df2_imputed)
r2 = r2_score(df1.values.flatten(), df2_imputed.values.flatten())

print("\nMean Squared Error:", mse)
print("Mean Absolute Error:", mae)
print("R-squared (coefficient of determination):", r2)

# Print the original, missing, and imputed dataframes
print("\nOriginal DataFrame (df1):")
print(df1)

print("\nDataFrame with Missing Values (df2):")
print(df2)

print("\nImputed DataFrame (df2_imputed):")
print(df2_imputed)


Columns with missing values: Index([3, 4, 7, 9, 12], dtype='int64')

Mean Squared Error: 202350.7380952381
Mean Absolute Error: 141.11904761904762
R-squared (coefficient of determination): 0.999936088872679

Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141   

In [4]:
import pandas as pd
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Read the reference data and the copy containing gaps
# (replace with actual file paths)
df1 = pd.read_csv('data.csv')
df2 = pd.read_csv('missing_value.csv')

# Baseline strategy: discard every row of df2 that holds at least one NaN
df2_dropped = df2.dropna(axis=0, how='any')

# Restrict the reference frame to the rows that survived, selected by index label
surviving_rows = df2_dropped.index
df1_aligned = df1.loc[surviving_rows]

# Score the surviving rows against the reference.
# NOTE(review): only rows without any NaN are compared. If missing_value.csv
# is data.csv with some cells blanked (as the recorded output suggests), the
# compared rows are identical and these metrics are trivially perfect.
mae = mean_absolute_error(df1_aligned, df2_dropped)
mse = mean_squared_error(df1_aligned, df2_dropped)
r2 = r2_score(df1_aligned.to_numpy().ravel(), df2_dropped.to_numpy().ravel())

for label, value in (
    ("\nMean Squared Error:", mse),
    ("Mean Absolute Error:", mae),
    ("R-squared (coefficient of determination):", r2),
):
    print(label, value)

# Show the original frame, the frame with gaps, and the frame after row removal
for heading, frame in (
    ("\nOriginal DataFrame (df1):", df1),
    ("\nDataFrame with Missing Values (df2):", df2),
    ("\nDataFrame after Dropping Rows with Missing Values (df2_dropped):", df2_dropped),
):
    print(heading)
    print(frame)



Mean Squared Error: 0.0
Mean Absolute Error: 0.0
R-squared (coefficient of determination): 1.0

Original DataFrame (df1):
      2017    2018    2019
0   192274  201191  265954
1    25181   28523   26735
2     2024    2113    1904
3     4716    5544    8956
4     4105    4307    4481
5    16569   15405    4181
6     1939    2166    1622
7     2670    4952    5212
8    17937   18991   19968
9     4141    3139    2195
10      22      14      31
11      99     101     369
12      19       5      12
13      11       7       9

DataFrame with Missing Values (df2):
      2017      2018      2019
0   192274  201191.0  265954.0
1    25181   28523.0   26735.0
2     2024    2113.0    1904.0
3     4716       NaN    8956.0
4     4105    4307.0       NaN
5    16569   15405.0    4181.0
6     1939    2166.0    1622.0
7     2670    4952.0       NaN
8    17937   18991.0   19968.0
9     4141       NaN       NaN
10      22      14.0      31.0
11      99     101.0     369.0
12      19       NaN      12.0
