In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [21]:
# Locations of the two raw input tables.
values_path = 'raw_data/tanzania_values.csv'
labels_path = 'raw_data/tanzania_labels.csv'

# Read both tables with `id` as the row index, then attach the labels to the
# values with an index-aligned join.
raw_values = pd.read_csv(values_path, index_col='id')
raw_labels = pd.read_csv(labels_path, index_col='id')
raw_df = raw_values.join(raw_labels)

# Hold out 25% of the rows as test data; the fixed random_state keeps the
# split the same on every run.
raw_train_df, test_df = train_test_split(
    raw_df,
    test_size=.25,
    random_state=47,
)

# Write the held-out rows to disk exactly as split (no cleaning is applied
# to them in this cell).
test_df.to_csv("Test_Data.csv")


In [22]:
# Columns removed before imputation. Per the original author's note these are
# redundant or completely empty -- NOTE(review): not verifiable from this cell.
columns_to_drop = [
    "quantity_group",
    "scheme_name",
    "num_private",
    "payment_type",
    "waterpoint_type_group",
]

# Initial training frame: the training split without those columns.
init_train_df = raw_train_df.drop(columns=columns_to_drop)



In [None]:
# Replace every NaN with the most frequent value of its column.
SI = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df_imputed = SI.fit_transform(init_train_df)

# Creating the actual final training dataframe.
# fit_transform hands back a bare array, so both the column labels and the
# `id` row index must be re-attached explicitly. Without `index=` the frame
# would get a fresh 0..n-1 RangeIndex and the saved training data would lose
# the ids that the saved test data still carries.
train_df = pd.DataFrame(
    data=df_imputed,
    columns=init_train_df.columns,
    index=init_train_df.index,
)



In [None]:
# Parse the recorded date into a proper datetime column.
train_df["date_recorded"] = pd.to_datetime(train_df["date_recorded"])

# Target dtype for each remaining column that needs converting, grouped by kind.
float_columns = ['amount_tsh', 'longitude', 'latitude', 'gps_height']
int_columns = ['population', 'construction_year']
bool_columns = ['permit', 'public_meeting']

column_dtypes = {
    **dict.fromkeys(float_columns, 'float'),
    **dict.fromkeys(int_columns, 'int'),
    **dict.fromkeys(bool_columns, 'bool'),
}
train_df = train_df.astype(column_dtypes)

# Print the per-column dtype summary to confirm the conversions took effect.
train_df.info()

In [None]:
# Count missing cells across the whole frame (per-column counts, then summed);
# the original author notes that no null values are expected here.
total_nulls = train_df.isna().sum().sum()
total_nulls

In [None]:
# Persist the cleaned training frame to disk as CSV.
train_df.to_csv(path_or_buf="Train_Data.csv")