#**KNN imputer for quantitative or numerical data**

##**Import necessary libraries**

In [1]:
import numpy as np
import pandas as pd

In [2]:
#from google.colab import drive
#drive.mount('/drive')

##**Import the KNNimputer**

In [3]:
from sklearn.impute import KNNImputer

##**Create dataset for marks of students**

In [4]:
# Marks of four students in four subjects; np.nan marks a missing score.
# NOTE: the name `dict` shadows Python's built-in dict type from here on;
# it is kept because the DataFrame cell below reads it.
dict = {
    'Maths': [85, 91, np.nan, 91],
    'Chemistry': [72, 75, 74, np.nan],
    'Physics': [np.nan, 68, 82, 81],
    'Biology': [81, 81, 72, np.nan],
}

## **Creating a data frame from the dictionary**

---



In [5]:
# Each dictionary key becomes a column; rows get the default 0..3 index.
Before_imputation = pd.DataFrame(data=dict)
Before_imputation

Unnamed: 0,Maths,Chemistry,Physics,Biology
0,85.0,72.0,,81.0
1,91.0,75.0,68.0,81.0
2,,74.0,82.0,72.0
3,91.0,,81.0,


##**Replace the missing values with KNNImputer**

In [6]:
# Impute every missing mark from the 2 nearest rows (KNNImputer averages the
# neighbours' values for that column).
imputer = KNNImputer(n_neighbors=2)
After_imputation = imputer.fit(Before_imputation).transform(Before_imputation)

# After transforming, the data is a plain NumPy array (column labels are lost).
After_imputation

array([[85. , 72. , 74.5, 81. ],
       [91. , 75. , 68. , 81. ],
       [88. , 74. , 82. , 72. ],
       [91. , 73. , 81. , 76.5]])

##**Convert the array into dataframe**

In [7]:
# Show the column labels of the original frame; the imputed array no longer carries them.
Before_imputation.columns

Index(['Maths', 'Chemistry', 'Physics', 'Biology'], dtype='object')

In [8]:
# Wrap the imputed array back into a DataFrame. The column labels and index are
# taken from the original frame rather than a hard-coded list, so the two
# cannot drift apart if the input columns change.
After_imputation = pd.DataFrame(
    After_imputation,
    columns=Before_imputation.columns,
    index=Before_imputation.index,
)
After_imputation

Unnamed: 0,Maths,Chemistry,Physics,Biology
0,85.0,72.0,74.5,81.0
1,91.0,75.0,68.0,81.0
2,88.0,74.0,82.0,72.0
3,91.0,73.0,81.0,76.5


#**Task**
1.   **Import necessary libraries for kNN imputation**
2.   **Create dataframe X using following data with column names as 'Class A','Class B', 'Class C', 'Class D'**

---
                X = [[1, 3, np.nan, 4], [6, np.nan, 8, np.nan], [5, 4, 2, 3], [9, np.nan, 6, 8]]

---


3. **Find the number of missing values in each column**
4. **Find the total number of missing values in the dataframe**
5. **Find percent missing values in dataframe**
6. **Drop the rows having missing values**
7. **Keep the rows with at least 3 non-null values**
8. **Drop the columns having missing values**
9. **Keep the columns with at least 3 non-null values**
10. **Fill the missing values with mean values**
11. **Use KNN imputer to impute the missing values**
12. **Save the imputed data in dataframe as variable Y**  





#**1. Import necessary libraries**

In [9]:
import numpy as np
import pandas as pd
from sklearn.impute import KNNImputer

##**2. Create dataframe X using following data with column names as 'Class A','Class B', 'Class C', 'Class D'**

In [10]:
# Raw data as a list of rows (4 rows x 4 values); np.nan marks a missing entry.
X = [
    [1, 3, np.nan, 4],
    [6, np.nan, 8, np.nan],
    [5, 4, 2, 3],
    [9, np.nan, 6, 8],
]
X

[[1, 3, nan, 4], [6, nan, 8, nan], [5, 4, 2, 3], [9, nan, 6, 8]]

In [11]:
# Turn the list of rows into a labelled DataFrame (X is rebound from list to DataFrame).
X = pd.DataFrame(
    data=X,
    columns=['Class A', 'Class B', 'Class C', 'Class D'],
)
X

Unnamed: 0,Class A,Class B,Class C,Class D
0,1,3.0,,4.0
1,6,,8.0,
2,5,4.0,2.0,3.0
3,9,,6.0,8.0


##**3. Find number of missing values in each column**

In [12]:
# isna() flags every missing cell; sum() then totals the flags per column.
X.isna().sum()

Class A    0
Class B    2
Class C    1
Class D    1
dtype: int64

##**4. Find total number of missing values in dataframe**

In [13]:
# Summing the per-column counts gives the number of missing cells in the whole frame.
X.isna().sum().sum()

4

In [14]:
# count() tallies the non-null cells per column; the sum is the total number of non-null cells.
X.count().sum()

12

##**5. Find percent missing values in dataframe**

In [15]:
# Percentage of cells in X that are missing: NaN cells divided by all cells.
# X.size (rows * columns) equals the missing count plus the non-null count.
total_missing_values = X.isna().sum().sum()
percent_missing_values = total_missing_values / X.size * 100
percent_missing_values

25.0

##**6. Drop the rows having missing values**

In [16]:
# Keep only the rows that have no missing value in any column.
# The result is a new frame; X itself is not modified.
X.dropna()

Unnamed: 0,Class A,Class B,Class C,Class D
2,5,4.0,2.0,3.0


##**7. Keep the rows with at least 3 non-null values**

In [17]:
# thresh=3: a row is kept only if it holds at least 3 non-null values.
X.dropna(thresh=3)

Unnamed: 0,Class A,Class B,Class C,Class D
0,1,3.0,,4.0
2,5,4.0,2.0,3.0
3,9,,6.0,8.0


##**8. Drop the columns having missing values**

In [18]:
# axis=1 applies the drop to columns: every column containing a missing value is removed.
X.dropna(axis=1)

Unnamed: 0,Class A
0,1
1,6
2,5
3,9


##**9. Keep the columns with at least 3 non-null values**

In [19]:
# Column-wise threshold: a column is kept only if it holds at least 3 non-null values.
X.dropna(thresh=3,axis=1)

Unnamed: 0,Class A,Class C,Class D
0,1,,4.0
1,6,8.0,
2,5,2.0,3.0
3,9,6.0,8.0


##**10. Fill the missing values with mean values**

In [20]:
# Per-column mean; missing values are skipped (e.g. Class B = (3 + 4) / 2 = 3.5).
X.mean()

Class A    5.250000
Class B    3.500000
Class C    5.333333
Class D    5.000000
dtype: float64

In [21]:
# Replace each missing value with the mean of its own column
# (the Series of means is matched to the frame by column label).
X.fillna(X.mean())

Unnamed: 0,Class A,Class B,Class C,Class D
0,1,3.0,5.333333,4.0
1,6,3.5,8.0,5.0
2,5,4.0,2.0,3.0
3,9,3.5,6.0,8.0


##**11. Use KNN imputer to impute the missing values**

In [22]:
# KNN imputation with 2 neighbours; the transformed result is a NumPy array.
imputer = KNNImputer(n_neighbors=2)
Y = imputer.fit(X).transform(X)
Y

array([[1. , 3. , 5. , 4. ],
       [6. , 3.5, 8. , 5.5],
       [5. , 4. , 2. , 3. ],
       [9. , 3.5, 6. , 8. ]])

##**12. Save the imputed data in new dataframe Y**

In [23]:
# Wrap the imputed array in a DataFrame, reusing X's column labels and index
# instead of repeating the column names by hand.
Y = pd.DataFrame(Y, columns=X.columns, index=X.index)
Y

Unnamed: 0,Class A,Class B,Class C,Class D
0,1.0,3.0,5.0,4.0
1,6.0,3.5,8.0,5.5
2,5.0,4.0,2.0,3.0
3,9.0,3.5,6.0,8.0


#**KNN Imputer for Qualitative or Categorical data**

##**Import or create required dataset**

In [24]:
# Small personnel table with two categorical columns; np.nan marks a missing category.
dz = {
    'Name': ['Root', 'Stark', 'John', 'Marsh', 'Sam'],
    'Age': [31, 34, 33, 35, 32],
    'Department': ['Navy', 'Army', np.nan, 'Navy', np.nan],
    'Color_Code': ['White', 'Green', 'White', np.nan, 'White'],
}

In [25]:
# Build the DataFrame from the dictionary (dz is rebound from dict to DataFrame).
dz = pd.DataFrame(data=dz)
dz

Unnamed: 0,Name,Age,Department,Color_Code
0,Root,31,Navy,White
1,Stark,34,Army,Green
2,John,33,,White
3,Marsh,35,Navy,
4,Sam,32,,White


#**apply KNN imputer**

In [26]:
# KNNImputer only accepts numeric data, so fitting it on the string columns of
# dz raises ValueError. The error is the point of this cell, so it is caught
# and displayed instead of aborting a "Run All" of the notebook.
imputer = KNNImputer(n_neighbors=2)
try:
    imputer.fit_transform(dz)
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: ignored

##**Convert string into floats**

In [27]:
# Show the column labels of dz.
dz.columns

Index(['Name', 'Age', 'Department', 'Color_Code'], dtype='object')

##**Map with labels**

In [28]:
# Encode each category as a number in a new column so a numeric imputer can
# use it. Values absent from the mapping (here the NaN entries) stay NaN.
dz['Department_map'] = dz['Department'].map({'Navy': 0, 'Army': 1})
dz['Color_Code_map'] = dz['Color_Code'].map({'White': 0, 'Green': 1})

In [29]:
# Display dz, which now includes the numeric Department_map and Color_Code_map columns.
dz

Unnamed: 0,Name,Age,Department,Color_Code,Department_map,Color_Code_map
0,Root,31,Navy,White,0.0,0.0
1,Stark,34,Army,Green,1.0,1.0
2,John,33,,White,,0.0
3,Marsh,35,Navy,,0.0,
4,Sam,32,,White,,0.0


In [30]:
# Numeric subset passed to the imputer: Age plus the encoded department.
dz1 = dz.loc[:, ['Age', 'Department_map']]
dz1

Unnamed: 0,Age,Department_map
0,31,0.0
1,34,1.0
2,33,
3,35,0.0
4,32,


####**Apply KNN imputer**

In [31]:
# A single neighbour is used, so each missing code is copied from the nearest
# row and stays a valid label (0 or 1) rather than an average.
imputer = KNNImputer(n_neighbors=1)
dz2 = imputer.fit(dz1).transform(dz1)
dz2

array([[31.,  0.],
       [34.,  1.],
       [33.,  1.],
       [35.,  0.],
       [32.,  0.]])

##**Imputation using other imputers (Categorical_Imputer)**

In [32]:
# CategoricalImputer was removed from the sklearn-pandas package in 2.0, so the
# original import fails on current installs. scikit-learn's SimpleImputer with
# strategy="most_frequent" performs the same mode imputation.
from sklearn.impute import SimpleImputer

# handling NaN values: each NaN is replaced by the most frequent value of its column
imputer = SimpleImputer(strategy="most_frequent")


# SimpleImputer requires 2-D input, so each column is reshaped to (n, 1) for
# the call and the result is flattened back to 1-D.
data1 = np.array(dz['Department'], dtype=object)
data_1 = imputer.fit_transform(data1.reshape(-1, 1)).ravel()


data2 = np.array(dz['Color_Code'], dtype=object)
data_2 = imputer.fit_transform(data2.reshape(-1, 1)).ravel()


print(data_1)
print(data_2)

['Navy' 'Army' 'Navy' 'Navy' 'Navy']
['White' 'Green' 'White' 'White' 'White']


##**euclidean_distances**

In [33]:
from sklearn.impute import KNNImputer
from sklearn.metrics.pairwise import nan_euclidean_distances
import numpy as np
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
# NOTE: X and Y are rebound here from the earlier DataFrames to plain lists.
X = [[3, np.nan, 6]]
Y = [[2, 0, 0]]
# Only coordinates present in both rows are compared, and the squared sum is
# scaled by (total coordinates / present coordinates):
# sqrt(3/2 * ((3 - 2)**2 + (6 - 0)**2)) = sqrt(55.5) ≈ 7.4498
nan_euclidean_distances(X, Y)




array([[7.44983221]])