# Missing Values


### Import Libraries

In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

### Load Data

In [2]:
# Read the raw dataset from a CSV file located in the working directory.
DATA_PATH = "clv_data.csv"
df = pd.read_csv(DATA_PATH)

### Checking Null Values

In [5]:
# Count missing entries per column (`isna` is the same check as `isnull`).
df.isna().sum()

Unnamed: 0             0
id                     0
age                 2455
gender                 0
income                 0
days_on_platform       0
city                   0
purchases              0
dtype: int64

### Dropping Null Values

In [6]:
# Show the frame without any row that has at least one missing value.
# The result is only displayed; `df` itself is left unchanged.
df.dropna(axis=0, how="any")

Unnamed: 0.1,Unnamed: 0,id,age,gender,income,days_on_platform,city,purchases
4,4,4,25.0,Male,246676,1,Miami,1
5,5,5,29.0,Male,10178,0,London,0
6,6,6,47.0,Female,233154,1,London,1
7,7,7,27.0,Female,20732,0,London,2
8,8,8,39.0,Female,37934,0,Miami,0
...,...,...,...,...,...,...,...,...
4992,4992,4992,14.0,Female,65371,3,London,1
4993,4993,4993,40.0,Female,89222,1,London,4
4994,4994,4994,33.0,Female,29859,2,San Francisco,2
4996,4996,4996,46.0,Male,147961,0,Miami,2


### Mean/Median/Mode Imputation

In [9]:
## Mean
# Replace every missing age with the average of the observed ages.
# NOTE: this overwrites `age` in place, so any later cell that reads
# df['age'] will no longer find missing values to impute.
mean_age = df['age'].mean()
df.loc[:, 'age'] = df['age'].fillna(mean_age)

In [10]:
## Median
# Replace every missing age with the median of the observed ages.
# `np.median` returns NaN as soon as its input contains a NaN, which would
# turn the fillna below into a silent no-op; `Series.median` skips missing
# values by default, so it yields a usable fill value.
median_age = df['age'].median()
df.loc[:, 'age'] = df['age'].fillna(median_age)

In [17]:
## Mode
# Replace every missing age with the most frequent observed age.
# `Series.mode` ignores NaN by default and returns the modes sorted in
# ascending order, so `.iloc[0]` picks the smallest one when there are ties.
# This avoids `stats.mode(...)[0][0]`, which fails on SciPy versions where
# `mode` returns scalars rather than arrays.
age_modes = df['age'].mode()
if not age_modes.empty:  # an all-NaN column has no mode; leave it untouched
    df.loc[:, 'age'] = df['age'].fillna(age_modes.iloc[0])

### Multiple Imputation Using Regression

`estimator`: The regression model used to predict each feature's missing values from the other features. Common choices:

- `BayesianRidge`: Regularized Linear Regression

- `RandomForestRegressor`: Random Forest Model. Mimics missForest.

- `KNeighborsRegressor`: K-Nearest Neighbor Regressor

`missing_values`: Placeholder for missing values.

`max_iter`: The maximum number of imputation rounds to perform.

In [48]:
# `enable_iterative_imputer` has to be imported before `IterativeImputer`:
# the estimator is experimental and this import is what makes it available.
from sklearn.experimental import enable_iterative_imputer  # noqa: F401
from sklearn.impute import IterativeImputer, KNNImputer

## Target - Purchases in the first six months

# Numeric features that the imputer models against each other.
X = df[['age','days_on_platform','income']]

# Disjoint positional split: the first 4000 rows fit the imputer and the
# remaining rows are held out. (Slicing the test set from row 1000 would
# overlap the training rows and duplicate them in the final concat.)
X_train = X[:4000]
X_test = X[4000:]

# Fit on the training rows only, then apply the learned imputation to both
# partitions so that the combined frame contains no missing values.
Imp = IterativeImputer(max_iter=10, random_state = 0)
Imp.fit(X_train)

# `transform` returns a bare ndarray; restore column names and the original
# row labels so the concatenated frame lines up with `X`.
X_train_imp = pd.DataFrame(
    Imp.transform(X_train), columns=X_train.columns, index=X_train.index
)
X_test_imp = pd.DataFrame(
    Imp.transform(X_test), columns=X_test.columns, index=X_test.index
)

imputed_df = pd.concat([X_train_imp, X_test_imp], axis = 0)

# Sanity checks: every row appears exactly once and nothing is left missing.
assert len(imputed_df) == len(X)
assert not imputed_df.isna().any().any()

### Nearest Neighbor Imputation

**Important Parameters**

`missing_values`: The placeholder for the missing values. 

`n_neighbors`: The number of neighbors to use for imputation.

`weights`: Pick how you want to weight all points in each neighborhood

- `uniform` : uniform weights. All points in each neighborhood are weighted equally.

- `distance` : weight points by the inverse of their distance. In this case, closer neighbors of a query point have a greater influence than neighbors that are further away.

- `callable` : a user-defined function which accepts an array of distances, and returns an array of the same shape containing the weights.

`metric`: The distance metric used to search for neighbors.

`add_indicator`: If enabled, adds indicator features that mark which values were imputed. This allows a downstream estimator to account for missingness as another feature, on top of the imputation.

In [49]:
# Impute each missing entry from its two nearest neighbours, weighting both
# neighbours equally.
imputer = KNNImputer(
    n_neighbors=2,
    weights="uniform",
)
# Fit on X and return the imputed values for the same rows.
imputer.fit(X).transform(X)

array([[2.99343811e+01, 0.00000000e+00, 8.74470000e+04],
       [2.99343811e+01, 2.00000000e+00, 3.42360000e+04],
       [2.99343811e+01, 1.00000000e+00, 1.28247000e+05],
       ...,
       [2.99343811e+01, 2.00000000e+00, 6.85820000e+04],
       [3.60000000e+01, 2.00000000e+00, 1.72824000e+05],
       [2.99343811e+01, 0.00000000e+00, 1.30800000e+04]])