In [48]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

In [49]:
# Load the feature table and the label table, both indexed by `id`, then join
# the labels onto the features (index-aligned) to get one combined frame.
raw_values, raw_labels = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ('raw_data/taarifa_values.csv', 'raw_data/taarifa_labels.csv')
)
raw_df = raw_values.join(raw_labels)

# Hold out 25% of the rows as the test set; the fixed seed keeps the split reproducible.
raw_train_df, test_df = train_test_split(raw_df, test_size=0.25, random_state=47)

# Write the held-out rows to disk.
test_df.to_csv("Test_Data.csv")


In [50]:
# raw_df[['amount_tsh', 'source_type', 'source_class', 'water_quality', 'quality_group', 'quantity', 'quantity_group']].head(20)
# raw_df.head().transpose()

# raw_df.dtypes

# (raw_df.quantity == raw_df.quantity_group).value_counts() 
#drop scheme_name - missing in almost half of the dataset
#drop quantity_group -- exactly the same as quantity || payment_type is the same as payment || also drop waterpoint_type_group
#drop rows that are missing values in scheme_management, funder, installer, subvillage
# raw_df.head()

In [55]:
# Build the initial training frame without the columns judged redundant or empty.
# `drop` returns a new frame, so `raw_train_df` itself is left untouched.
init_train_df = raw_train_df.drop(
    [
        "quantity_group",
        "scheme_name",
        "num_private",
        "payment_type",
        "waterpoint_type_group",
    ],
    axis="columns",
)



In [60]:
# Fill every missing value with the most frequent value of its column.
# The imputer is fitted on the training split only.
SI = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
df_imputed = SI.fit_transform(init_train_df)

# `fit_transform` returns a plain array, so rebuild a DataFrame using the column
# labels of the frame that was just imputed. Taking the labels from
# `init_train_df` (rather than from `train_df`, which does not exist until this
# assignment has run) lets the cell execute on a fresh kernel.
# NOTE(review): the `id` index of `init_train_df` is not carried over here, so
# the rebuilt frame gets a default RangeIndex. Pass `index=init_train_df.index`
# if the ids are needed downstream — confirm before changing, since it alters
# the saved CSV layout.
train_df = pd.DataFrame(data=df_imputed, columns=init_train_df.columns)



In [82]:
# Parse the recording date and cast the numeric / boolean columns to their
# intended dtypes in one chained expression.
train_df = (
    train_df
    .assign(date_recorded=lambda frame: pd.to_datetime(frame['date_recorded']))
    .astype({
        'amount_tsh': 'float',
        'longitude': 'float',
        'latitude': 'float',
        'population': 'int',
        'permit': 'bool',
        'construction_year': 'int',
        'gps_height': 'float',
        'public_meeting': 'bool',
    })
)

# Print the column summary to confirm the dtypes were changed.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 44550 entries, 0 to 44549
Data columns (total 35 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   amount_tsh             44550 non-null  float64       
 1   date_recorded          44550 non-null  datetime64[ns]
 2   funder                 44550 non-null  object        
 3   gps_height             44550 non-null  float64       
 4   installer              44550 non-null  object        
 5   longitude              44550 non-null  float64       
 6   latitude               44550 non-null  float64       
 7   wpt_name               44550 non-null  object        
 8   basin                  44550 non-null  object        
 9   subvillage             44550 non-null  object        
 10  region                 44550 non-null  object        
 11  region_code            44550 non-null  object        
 12  district_code          44550 non-null  object        
 13  l

In [83]:
# Sanity check: total number of missing cells in the whole frame
# (expected to be 0 now that the frame has been imputed).
train_df.isna().to_numpy().sum()

0

In [84]:
# Persist the cleaned training frame using pandas' default CSV settings
# (the frame's index is written as well).
train_df.to_csv(path_or_buf="Train_Data.csv")