# DataCleaning by Daniel 
## In this ipynb, we will clean the data and create csv based on Su's document.
## Based on Su's document, we will clean the dataset focusing on the features "funder" and "installer". 

In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style = 'darkgrid')

### 1.0 Load data

In [29]:
#load traning data
train_values = pd.read_csv('data/training_set_values.csv')
train_labels = pd.read_csv('data/training_set_labels.csv')
# Merge these two datasets
df_train = train_values.merge(train_labels, on='id', how='inner')
# df_train.shape
#Load test data
df_test = pd.read_csv('data/test_set_values.csv')
df_test.shape

(14850, 40)

### 1.1 Check null value

In [6]:
# List number of null values
df_train.apply(lambda x: sum(x.isnull()))

id                           0
amount_tsh                   0
date_recorded                0
funder                    3635
gps_height                   0
installer                 3655
longitude                    0
latitude                     0
wpt_name                     0
num_private                  0
basin                        0
subvillage                 371
region                       0
region_code                  0
district_code                0
lga                          0
ward                         0
population                   0
public_meeting            3334
recorded_by                  0
scheme_management         3877
scheme_name              28166
permit                    3056
construction_year            0
extraction_type              0
extraction_type_group        0
extraction_type_class        0
management                   0
management_group             0
payment                      0
payment_type                 0
water_quality                0
quality_

In [8]:
df_train.shape

(59400, 41)

### 1.2 Drop 10+6+2 redundant columns

In [69]:
df_train = df_train.drop(columns = ['extraction_type','extraction_type_group','management','payment','quality_group','quantity_group','source_type','source','waterpoint_type'])
df_test = df_test.drop(columns = ['extraction_type','extraction_type_group','management','payment','quality_group','quantity_group','source_type','source','waterpoint_type'])
# There are too many null values in 'scheme name', so drop it
df_train = df_train.drop(columns = ['scheme_name'])
df_test = df_test.drop(columns = ['scheme_name'])
# df_train.shape

# drop wpt_name,num_private,subvillage,region_code,district_code,recorded_by

df_train = df_train.drop(columns = ['wpt_name'])
df_test = df_test.drop(columns = ['wpt_name'])
df_train = df_train.drop(columns = ['num_private'])
df_test = df_test.drop(columns = ['num_private'])
df_train = df_train.drop(columns=['subvillage'])
df_test = df_test.drop(columns=['subvillage'])
df_train = df_train.drop(columns = ['region_code', 'district_code'])
df_test = df_test.drop(columns = ['region_code', 'district_code'])
df_train = df_train.drop(columns = ['recorded_by'])
df_test = df_test.drop(columns = ['recorded_by'])

# Drop other geographical columns: 'lga', 'ward' 
df_train = df_train.drop(columns=['lga','ward'])
df_test = df_test.drop(columns=['lga','ward'])

KeyError: "['extraction_type' 'extraction_type_group' 'management' 'payment'\n 'quality_group' 'quantity_group' 'source_type' 'source' 'waterpoint_type'] not found in axis"

### 1.3 fill nan
fill nan with 'unknown'

In [67]:
# Attention: there are 3334 null values in this column, not too much, replace NA by unknow
df_train['public_meeting'] = df_train['public_meeting'].fillna('unknown')
df_test['public_meeting'] = df_test['public_meeting'].fillna('unknown')

# Attention: 3056 NA, replace by unknown
df_train['permit'] = df_train['permit'].fillna('unknown')
df_test['permit'] = df_test['permit'].fillna('unknown')

### 1.4 Categories 'construction_year'

In [71]:
# df_train['permit'].isnull().sum()
def con_year_cleaning(df):
    year = df['construction_year'] 
    if year >= 1960 and year < 1970:
        return '60s'
    elif year >= 1970 and year < 1980:
        return '70s'
    elif year >= 1980 and year < 1990:
        return '80s'
    elif year >= 1990 and year < 2000:
        return '90s'
    elif year >= 2000 and year < 2010:
        return '00s'
    elif year >= 2010:
        return '10s'
    else:
        return 'unknown'
df_train['construction_year'] = df_train.apply(lambda row: con_year_cleaning(row), axis=1)
df_test['construction_year'] = df_test.apply(lambda row: con_year_cleaning(row), axis=1)
df_train['construction_year'].unique()

TypeError: ("'>=' not supported between instances of 'str' and 'int'", 'occurred at index 0')

### 1.5 Key Part: Data cleaning for features: funder,installer,scheme_management.
#### we still maintain top5 values using the lowercase of values and then set the other values as "other".

In [35]:
# 'funder'
df_train['funder'].value_counts()

Government Of Tanzania    9084
Danida                    3114
Hesawa                    2202
Rwssp                     1374
World Bank                1349
Kkkt                      1287
World Vision              1246
Unicef                    1057
Tasaf                      877
District Council           843
Dhv                        829
Private Individual         826
Dwsp                       811
0                          777
Norad                      765
Germany Republi            610
Tcrs                       602
Ministry Of Water          590
Water                      583
Dwe                        484
Netherlands                470
Hifab                      450
Adb                        448
Lga                        442
Amref                      425
Fini Water                 393
Oxfam                      359
Wateraid                   333
Rc Church                  321
Isf                        316
                          ... 
Said Salum Ally              1
Bao     

In [44]:
# Top 5 will be remained as it is and after that they will be categorised into other
def funder_cleaning(df):
    data = str(df['funder']) # turn into string first
    
    if data.lower() == 'Government Of Tanzania'.lower():
        return 'government'
    elif data.lower() == 'Danida'.lower():
        return 'danida'
    elif data.lower() == 'Hesawa'.lower():
        return 'hesawa'
    elif data.lower() == 'Rwssp'.lower():
        return 'rwssp'
    elif data.lower() == 'World Bank'.lower():
        return 'world_bank'
#     else:
#         return data

    else:
        return 'other'

In [46]:
# d_fun = df_train.apply(lambda row: funder_cleaning(row), axis=1)
df_train['funder'] = df_train.apply(lambda row: funder_cleaning(row), axis=1)
df_train['funder'].value_counts()

other         42277
government     9084
danida         3114
hesawa         2202
rwssp          1374
world_bank     1349
Name: funder, dtype: int64

In [36]:
# 'installer'
df_train['installer'].value_counts()

DWE                           17402
Government                     1825
RWE                            1206
Commu                          1060
DANIDA                         1050
KKKT                            898
Hesawa                          840
0                               777
TCRS                            707
Central government              622
CES                             610
Community                       553
DANID                           552
District Council                551
HESAWA                          539
World vision                    408
LGA                             408
WEDECO                          397
TASAF                           396
District council                392
Gover                           383
AMREF                           329
TWESA                           316
WU                              301
Dmdd                            287
ACRA                            278
World Vision                    270
SEMA                        

In [49]:
# The same, remain top 5
def installer_cleaning(df):
    data = str(df['installer']).lower()
    if data == 'DWE'.lower():
        return 'dwe'
    elif data == 'Government'.lower():
        return 'government'
    elif data == 'RWE'.lower():
        return 'rwe'
    elif data == 'Commu'.lower():
        return 'commu'
    elif data == 'DANIDA'.lower():
        return 'danida'
#     else:
#         return data
    else:
        return 'other'

In [50]:
d_inst = df_train.apply(lambda row: installer_cleaning(row), axis=1)
d_inst.value_counts()

other         36783
dwe           17405
government     1891
rwe            1206
commu          1065
danida         1050
dtype: int64

In [55]:
# Keep top 5
def scheme_cleaning(df):
    data = str(df['scheme_management']).lower()
    if data == 'VWC'.lower():
        return 'vwc'
    elif data == 'WUG'.lower():
        return 'wug'
    elif data == 'Water authority'.lower():
        return 'water_authority'
    elif data == 'WUA'.lower():
        return 'wua'
    elif data == 'Water Board'.lower():
        return 'water_board'
    else:
        return 'other'

In [56]:
df_train['scheme_management'] = df_train.apply(lambda row: scheme_cleaning(row), axis=1)
df_train['scheme_management'].value_counts()

vwc                36793
other               8617
wug                 5206
water_authority     3153
wua                 2883
water_board         2748
Name: scheme_management, dtype: int64

# Note: (Uncomplete)
### 1.6 transform the time(mainly based on the date_recorded)
### This part from Zii

In [77]:
# just a try
# Transform the string date into days since recorded
# df_train['date_recorded'] = pd.to_datetime(df_train['date_recorded']) - pd.to_datetime(df_train['construction_year'])
# df_train.rename(columns = {'date_recorded':'days_since_recorded'}, inplace=True)
# df_test['date_recorded'] = pd.to_datetime(df_test['date_recorded']) - pd.to_datetime(df_test['construction_year'])
# df_test.rename(columns = {'date_recorded':'days_since_recorded'}, inplace=True)
# df_train['days_since_recorded'] = df_train['days_since_recorded'].astype('timedelta64[D]').astype(int)
# df_test['days_since_recorded'] = df_test['days_since_recorded'].astype('timedelta64[D]').astype(int)

# Differ from Su's, I save the 'longitude', 'latitude' 

# After the record time and population , we can tocsv and get a csv file.

In [78]:
# to csv