In [31]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer
from sklearn import preprocessing

In [32]:
# Load the merged dataset that the rest of the notebook cleans and normalizes.
# NOTE(review): the null-count outputs below list an `Unnamed: 0` column, which
# suggests the CSV was written with its index; consider `index_col=0` — confirm.
MERGED_CSV_PATH = "df_merged.csv"
df = pd.read_csv(MERGED_CSV_PATH)

## Data Cleaning

### KNN Imputations

In [33]:
# Single KNN imputer instance; it is re-fitted on each column pair imputed below.
N_NEIGHBORS = 5
imputer = KNNImputer(n_neighbors=N_NEIGHBORS)

In [34]:
# Baseline: number of missing values per column before any imputation.
missing_before_imputation = df.isna().sum()
missing_before_imputation

Unnamed: 0                 0
country                    0
code                       0
year                       0
stress                     0
agriculture_water          0
industry_water            17
domestic_water             0
water_per_capita          16
urban_population           0
state_capacity             0
cattle                    16
total_withdrawals          0
cereal_yield              48
control_of_corruption    139
hydro_electricity        717
permanent_cropland         0
population_density         0
dtype: int64

In [35]:
# KNN-impute the missing `cattle` values; `total_withdrawals` is included in
# the fit alongside it (it shows no missing values in the count above).
cattle_knn_columns = ['cattle', 'total_withdrawals']
imputed_values = imputer.fit_transform(df[cattle_knn_columns])
df[cattle_knn_columns] = imputed_values

In [36]:
# Re-count missing values to confirm the `cattle` imputation took effect.
missing_after_cattle = df.isna().sum()
missing_after_cattle

Unnamed: 0                 0
country                    0
code                       0
year                       0
stress                     0
agriculture_water          0
industry_water            17
domestic_water             0
water_per_capita          16
urban_population           0
state_capacity             0
cattle                     0
total_withdrawals          0
cereal_yield              48
control_of_corruption    139
hydro_electricity        717
permanent_cropland         0
population_density         0
dtype: int64

In [37]:
# KNN-impute the missing `control_of_corruption` values; `state_capacity` is
# included in the fit alongside it (it shows no missing values above).
governance_knn_columns = ['state_capacity', 'control_of_corruption']
imputed_values = imputer.fit_transform(df[governance_knn_columns])
df[governance_knn_columns] = imputed_values

In [38]:
# Re-count missing values to confirm the `control_of_corruption` imputation.
missing_after_corruption = df.isna().sum()
missing_after_corruption

Unnamed: 0                 0
country                    0
code                       0
year                       0
stress                     0
agriculture_water          0
industry_water            17
domestic_water             0
water_per_capita          16
urban_population           0
state_capacity             0
cattle                     0
total_withdrawals          0
cereal_yield              48
control_of_corruption      0
hydro_electricity        717
permanent_cropland         0
population_density         0
dtype: int64

In [39]:
# KNN-impute the missing `industry_water` values; `agriculture_water` is
# included in the fit alongside it (it shows no missing values above).
water_use_knn_columns = ['agriculture_water', 'industry_water']
imputed_values = imputer.fit_transform(df[water_use_knn_columns])
df[water_use_knn_columns] = imputed_values

### Column Mean Imputations

In [40]:
# Any value still missing in these numeric columns after the KNN steps is
# replaced by that column's mean. Columns outside this list (`Unnamed: 0`,
# `country`, `code`, `year`) are not touched by the fill.
mean_fill_columns = [
    'stress',
    'agriculture_water',
    'industry_water',
    'domestic_water',
    'water_per_capita',
    'urban_population',
    'state_capacity',
    'cattle',
    'total_withdrawals',
    'cereal_yield',
    'control_of_corruption',
    'hydro_electricity',
    'permanent_cropland',
    'population_density',
]
column_means = df[mean_fill_columns].mean()
# Filling with a Series matches on column name, so each column gets its own mean.
df.fillna(column_means, inplace=True)

In [41]:
# Final check: every column should now report zero missing values.
missing_after_mean_fill = df.isna().sum()
missing_after_mean_fill

Unnamed: 0               0
country                  0
code                     0
year                     0
stress                   0
agriculture_water        0
industry_water           0
domestic_water           0
water_per_capita         0
urban_population         0
state_capacity           0
cattle                   0
total_withdrawals        0
cereal_yield             0
control_of_corruption    0
hydro_electricity        0
permanent_cropland       0
population_density       0
dtype: int64

## Data Preparation

In [42]:
# Columns that make up the features matrix. `Unnamed: 0`, `country`, `code`,
# `year` and `stress` are not included (`stress` is presumably the target —
# confirm).
feature_columns = [
    'agriculture_water',
    'industry_water',
    'domestic_water',
    'water_per_capita',
    'urban_population',
    'state_capacity',
    'cattle',
    'total_withdrawals',
    'cereal_yield',
    'control_of_corruption',
    'hydro_electricity',
    'permanent_cropland',
    'population_density',
]
features_matrix = df[feature_columns]

In [43]:
# Optional export: uncomment to save the imputed data before normalization.
# df.to_csv("df_final_unnormalized.csv")

### Normalization

In [44]:
# Min-max scale the feature columns and write the scaled values back into `df`.
# MinMaxScaler is used with its default settings (feature_range=(0, 1) per the
# scikit-learn documentation).
# The target columns are taken from `features_matrix` so the write-back uses
# exactly the columns (and order) that were scaled.
scaled_columns = features_matrix.columns.tolist()
x = features_matrix.values
min_max_scaler = preprocessing.MinMaxScaler()
x_scaled = min_max_scaler.fit_transform(x)
df[scaled_columns] = x_scaled

In [45]:
# Optional export: uncomment to save the imputed and normalized data.
# df.to_csv("df_final.csv")