## 0. Import Data

In [10]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn import preprocessing

In [11]:
df = pd.read_csv('data/Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_values.csv')
df2 = pd.read_csv('data/Pump_it_Up_Data_Mining_the_Water_Table_-_Training_set_labels.csv')

## 1.  Train Test Split

In [12]:
X_train, X_test, y_train, y_test = train_test_split(df, df2, test_size = 0.25, random_state = 47)

## 2. Missing Values

__Missing Values Decisions__
- drop scheme_name column
- drop funder column, based on research it is unreliable and highly varied
- drop NAN rows from scheme_management, funder, installer, subvillage, permit 

__Non-Relevant Data__
- num_private has no description and is all zeros, removing column. 
- drop payment_type, quantity group, waterpoint_type_group because these are redundant with payment, quantity.

__Categorical Features__
- All except latitude, longitude, gps height, population, construction_year

__Rows__
- we decided not to drop any rows 

In [None]:
#dropping columns
X_train = X_train.drop(columns=['scheme_name', 'payment_type', 'quantity_group', 'waterpoint_type_group', 'num_private'])

### Imputing

In [19]:
X_train.isna().sum()

id                          0
amount_tsh                  0
date_recorded               0
funder                   2720
gps_height                  0
installer                2740
longitude                   0
latitude                    0
wpt_name                    0
basin                       0
subvillage                286
region                      0
region_code                 0
district_code               0
lga                         0
ward                        0
population                  0
public_meeting           2508
recorded_by                 0
scheme_management        2938
permit                   2263
construction_year           0
extraction_type             0
extraction_type_group       0
extraction_type_class       0
management                  0
management_group            0
payment                     0
water_quality               0
quality_group               0
quantity                    0
source                      0
source_type                 0
source_cla

In [20]:
cols = list(X_train.columns)

In [21]:
SI = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train_imputed = SI.fit_transform(X_train)
X_train_imputed

array([[29040, 0.0, '2013-02-05', ..., 'spring', 'groundwater',
        'improved spring'],
       [69788, 50.0, '2011-03-12', ..., 'river/lake', 'surface',
        'communal standpipe'],
       [51208, 300.0, '2011-02-26', ..., 'borehole', 'groundwater',
        'communal standpipe'],
       ...,
       [53875, 0.0, '2013-03-15', ..., 'borehole', 'groundwater',
        'communal standpipe'],
       [8608, 20000.0, '2011-02-28', ..., 'spring', 'groundwater',
        'communal standpipe'],
       [60477, 0.0, '2011-03-14', ..., 'borehole', 'groundwater',
        'communal standpipe']], dtype=object)

In [22]:
X_train_imputed = pd.DataFrame(data=X_train_imputed, columns=cols)
X_train_imputed.head(3)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,...,management,management_group,payment,water_quality,quality_group,quantity,source,source_type,source_class,waterpoint_type
0,29040,0,2013-02-05,Tcrs,1295,Community,30.6172,-3.57407,Ndagije,Lake Tanganyika,...,vwc,user-group,unknown,soft,good,enough,spring,spring,groundwater,improved spring
1,69788,50,2011-03-12,Private Individual,181,WU,38.354,-6.64235,Digali,Wami / Ruvu,...,private operator,commercial,pay per bucket,soft,good,enough,river,river/lake,surface,communal standpipe
2,51208,300,2011-02-26,Ki,490,Ki,37.0489,-6.75865,Shuleni,Wami / Ruvu,...,vwc,user-group,pay when scheme fails,soft,good,insufficient,machine dbh,borehole,groundwater,communal standpipe


In [23]:
X_train_imputed.isna().sum()

id                       0
amount_tsh               0
date_recorded            0
funder                   0
gps_height               0
installer                0
longitude                0
latitude                 0
wpt_name                 0
basin                    0
subvillage               0
region                   0
region_code              0
district_code            0
lga                      0
ward                     0
population               0
public_meeting           0
recorded_by              0
scheme_management        0
permit                   0
construction_year        0
extraction_type          0
extraction_type_group    0
extraction_type_class    0
management               0
management_group         0
payment                  0
water_quality            0
quality_group            0
quantity                 0
source                   0
source_type              0
source_class             0
waterpoint_type          0
dtype: int64

### Change Datatypes

In [24]:
#change date to datetime format
X_train.date_recorded = pd.to_datetime(X_train.date_recorded)

In [25]:
X_train.dtypes

id                                int64
amount_tsh                      float64
date_recorded            datetime64[ns]
funder                           object
gps_height                        int64
installer                        object
longitude                       float64
latitude                        float64
wpt_name                         object
basin                            object
subvillage                       object
region                           object
region_code                       int64
district_code                     int64
lga                              object
ward                             object
population                        int64
public_meeting                   object
recorded_by                      object
scheme_management                object
permit                           object
construction_year                 int64
extraction_type                  object
extraction_type_group            object
extraction_type_class            object


In [32]:
# creating a list of non-numeric columns
obj_cols = ['installer', 'wpt_name', 'basin', 'subvillage', 'region', 'lga', 'ward', 
              'public_meeting', 'recorded_by', 'scheme_management', 'permit', 
               'extraction_type', 'extraction_type_group', 'extraction_type_class', 'management', 
               'management_group', 'payment', 'water_quality', 'quality_group', 
               'quantity', 'source', 'source_type', 'source_class', 
               'waterpoint_type']

In [33]:
#creating a list of numeric columns
num_cols = ['amount_tsh', 'gps_height', 'longitude', 'latitude', 'num_private', 'region_code', 'district_code', 
            'population', 'construction_year', 'id_code']

In [34]:
#changing non-numeric columns to strings
for col in obj_cols:
            X_train[col] = X_train[col].astype(str)

In [35]:
# return the name of the feature if the value of the name in the dtype list is object
[value for value in X_train.dtypes if type(value) == object]
# the empty list is saying that there are no object types, they have been changed to strings.

[]

## Numerical Encoding

In [36]:
# instantiate encoding
le = preprocessing.LabelEncoder()

#encode X_train
for col_name in obj_cols:
    col = col_name + '_code'
    #print(col_name)
    le.fit(X_train[col_name])    
    X_train[col] = le.fit_transform(X_train[col_name])

In [37]:
X_train.head(3)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,...,management_code,management_group_code,payment_code,water_quality_code,quality_group_code,quantity_code,source_code,source_type_code,source_class_code,waterpoint_type_code
8956,29040,0.0,2013-02-05,Tcrs,1295,Community,30.617206,-3.574069,Ndagije,Lake Tanganyika,...,7,4,6,6,2,1,8,6,0,5
16599,69788,50.0,2011-03-12,Private Individual,181,WU,38.354049,-6.642347,Digali,Wami / Ruvu,...,4,0,4,6,2,1,6,4,1,1
53788,51208,300.0,2011-02-26,Ki,490,Ki,37.048901,-6.758648,Shuleni,Wami / Ruvu,...,7,4,5,6,2,2,3,0,0,1


In [38]:
X_train_encoded = X_train.drop(columns=obj_cols)

X_train_encoded.head(3)

Unnamed: 0,id,amount_tsh,date_recorded,funder,gps_height,longitude,latitude,region_code,district_code,population,...,management_code,management_group_code,payment_code,water_quality_code,quality_group_code,quantity_code,source_code,source_type_code,source_class_code,waterpoint_type_code
8956,29040,0.0,2013-02-05,Tcrs,1295,30.617206,-3.574069,5,1,120,...,7,4,6,6,2,1,8,6,0,5
16599,69788,50.0,2011-03-12,Private Individual,181,38.354049,-6.642347,14,1,50,...,4,0,4,6,2,1,6,4,1,1
53788,51208,300.0,2011-02-26,Ki,490,37.048901,-6.758648,11,1,500,...,7,4,5,6,2,2,3,0,0,1


In [47]:
#encode y_train
y_train['status_group'] = le.fit_transform(y_train['status_group'])
y_train_encoded = y_train
y_train_encoded.head(3)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,status_group
8956,29040,0
16599,69788,0
53788,51208,2


## Export Pickle File

In [48]:
import pickle
X_train_encoded.to_pickle('X_train_encoded.pickle')
y_train_encoded.to_pickle('y_train_encoded.pickle')