# Initial Data Cleaning

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [5]:
# Read the feature table and the label table; both use the 'id' column as index.
raw_values = pd.read_csv("raw_data/tanzania_values.csv", index_col="id")
raw_labels = pd.read_csv("raw_data/tanzania_labels.csv", index_col="id")

# Index-aligned left join: append the label table's columns to the feature rows.
raw_df = raw_values.join(raw_labels, how="left")

# Hold out a quarter of the rows as test data; the fixed random_state makes
# the split repeatable.
raw_train_df, test_df = train_test_split(raw_df, random_state=47, test_size=0.25)

# Persist the held-out rows before any cleaning is applied to the training part.
test_df.to_pickle("Test_Data.pkl")


In [6]:
# Build the initial training frame without the columns judged redundant or
# completely empty. `drop` returns a new frame, so raw_train_df is left intact.
init_train_df = raw_train_df.drop(
    labels=[
        "quantity_group",
        "scheme_name",
        "num_private",
        "payment_type",
        "waterpoint_type_group",
    ],
    axis="columns",
)



In [7]:
# Replace every NaN with the most frequent value of its column.
SI = SimpleImputer(strategy='most_frequent', missing_values=np.nan)
df_imputed = SI.fit_transform(init_train_df)

# Wrap the imputed values back into a DataFrame with the original column names.
# NOTE(review): no index is passed here, so train_df gets a default 0..n-1 index
# instead of the 'id' index carried by init_train_df — confirm this is intended.
train_df = pd.DataFrame(df_imputed, columns=init_train_df.columns)



In [13]:
# Turn the recorded-date column into proper datetime values.
train_df['date_recorded'] = pd.to_datetime(train_df['date_recorded'])

# Give the measurement, category and flag columns explicit dtypes
# (grouped here by target type; all other columns are left unchanged).
train_df = train_df.astype({
    **dict.fromkeys(['amount_tsh', 'longitude', 'latitude', 'gps_height'], 'float'),
    **dict.fromkeys(['basin', 'region'], 'category'),
    **dict.fromkeys(['population', 'construction_year'], 'int'),
    **dict.fromkeys(['permit', 'public_meeting'], 'bool'),
})

# Names of the ten columns whose dtypes are set above.
# NOTE(review): `lst` is not used by any cell visible here — confirm it is still needed.
lst = [
    'public_meeting', 'gps_height', 'construction_year', 'permit', 'population',
    'region', 'basin', 'latitude', 'longitude', 'amount_tsh',
]

# Uncomment to confirm that the dtypes were changed:
# train_df.info()

In [10]:
# Number of training rows per region, most frequent first.
train_df['region'].value_counts()

Iringa           3918
Shinyanga        3762
Mbeya            3457
Kilimanjaro      3298
Morogoro         3090
Arusha           2510
Kagera           2500
Mwanza           2305
Kigoma           2083
Ruvuma           1972
Pwani            1955
Tanga            1922
Dodoma           1666
Singida          1566
Tabora           1499
Mara             1461
Rukwa            1341
Mtwara           1277
Manyara          1222
Lindi            1147
Dar es Salaam     599
Name: region, dtype: int64

In [35]:
# Total count of missing cells across the whole frame (0 means nothing is null).
train_df.isnull().sum().sum()

0

# Label Encoding with SKLearn

In [44]:
from sklearn.preprocessing import LabelEncoder

# Every column except the date and the numeric measurements gets an integer
# "<name>_code" companion column.
# Only columns that also exist in init_train_df (the frame train_df was built
# from) are selected, so re-running this cell does not pick up the "_code"
# columns it created earlier and encode them a second time ("<name>_code_code").
cols_to_encode = train_df.drop(columns = ['date_recorded', 'amount_tsh', 'gps_height',
                        'longitude', 'latitude', 'population', 
                        'construction_year'])
cols_to_encode = [col_name for col_name in cols_to_encode.columns
                  if col_name in init_train_df.columns]

# A single encoder is re-fitted for each column, so only the mapping of the
# last column remains in `lb` after the loop.
lb = LabelEncoder()

for col_name in cols_to_encode:
    col = col_name + '_code'
    train_df[col] = lb.fit_transform(train_df[col_name])

In [60]:
# Repeats the column selection of the encoding cell above. By the time this
# cell runs, train_df already holds the generated "<name>_code" columns, so the
# selection is limited to columns that exist in init_train_df (the frame
# train_df was built from). Without that filter the next encoding loop would
# encode the "_code" columns again and add "<name>_code_code" columns.
cols_to_encode = train_df.drop(columns = ['date_recorded', 'amount_tsh', 'gps_height',
                        'longitude', 'latitude', 'population', 
                        'construction_year'])
cols_to_encode = [col_name for col_name in cols_to_encode.columns
                  if col_name in init_train_df.columns]

In [81]:
# Re-fit the encoder column by column and (re)write each "<name>_code" column.
lb = LabelEncoder()

for col_name in cols_to_encode:
    # cols_to_encode may have been rebuilt after "_code" columns were added to
    # train_df; skip anything that is not a column of init_train_df (the frame
    # train_df was built from) so generated columns are not encoded again into
    # "<name>_code_code".
    if col_name not in init_train_df.columns:
        continue
    col = col_name + '_code'
    train_df[col] = lb.fit_transform(train_df[col_name])


In [78]:
# Persist the cleaned and encoded training frame next to the notebook.
train_df.to_pickle(path="Train_Data.pkl")

In [79]:
# Display the final training frame (the rendered table is truncated for large frames).
train_df

Unnamed: 0,amount_tsh,date_recorded,funder,gps_height,installer,longitude,latitude,wpt_name,basin,subvillage,...,management_group_code,payment_code,water_quality_code,quality_group_code,quantity_code,source_code,source_type_code,source_class_code,waterpoint_type_code,status_group_code
0,0.0,2013-02-05,Tcrs,1295.0,Community,30.617206,-3.574069,Ndagije,Lake Tanganyika,Nakibondo,...,4,6,6,2,1,8,6,0,5,0
1,50.0,2011-03-12,Private Individual,181.0,WU,38.354049,-6.642347,Digali,Wami / Ruvu,Mbala,...,0,4,6,2,1,6,4,1,1,0
2,300.0,2011-02-26,Ki,490.0,Ki,37.048901,-6.758648,Shuleni,Wami / Ruvu,Gongoni,...,4,5,6,2,2,3,0,0,1,2
3,1000.0,2011-02-27,Twe,2047.0,TWE,34.441943,-9.581917,none,Lake Nyasa,Magongolo,...,4,3,6,2,2,8,6,0,1,0
4,500.0,2013-03-27,Rc Church,644.0,RC CHURCH,37.876094,-4.454896,Kwa Elifuraha,Pangani,Saweni,...,4,3,6,2,2,8,6,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44545,1000.0,2011-03-16,Mkinga Distric Cou,0.0,DWE,38.799525,-4.860363,Mwangee,Pangani,Mwangee,...,0,3,6,2,2,8,6,0,1,0
44546,0.0,2013-02-19,Government Of Tanzania,1618.0,DWE,34.974044,-5.087319,Risu,Internal,Mughuka,...,4,6,7,5,0,7,5,0,4,2
44547,0.0,2013-03-15,0,-28.0,0,39.385389,-6.876464,Kwa Mzee Makata,Wami / Ruvu,Kizani,...,1,0,4,4,0,3,0,0,1,2
44548,20000.0,2011-02-28,Acra,1970.0,ACRA,34.713414,-9.330408,none,Rufiji,Wikichi,...,4,2,6,2,1,8,6,0,1,0
