In [None]:
# Import Statements
import os

import pandas as pd
import numpy as np

# Load Dataset files
We will be working with the following files:
* Training set values
* Training set labels
* Test set values


In [None]:
# Read the three raw CSV files (train features, train labels, test features) in that order.
train_values, train_labels, test_values = (
    pd.read_csv(csv_path)
    for csv_path in (
        "./datasets/train_values.csv",
        "./datasets/train_labels.csv",
        "./datasets/test_values.csv",
    )
)

## Show Train Values

In [None]:
# Print the training feature table, then a rule of 89 asterisks followed by an empty line.
print(train_values)
print("*" * 89, end="\n\n")

## Show Train Labels

In [None]:
# Print the training label table with a rule of 89 asterisks underneath.
print(train_labels, "*" * 89, sep="\n")

## Show Test Values

In [None]:
# Print the test feature table with a rule of 89 asterisks underneath.
divider = "*" * 89
print(test_values)
print(divider)

In [None]:

# Show each training column's dtype and non-null count (missing values show up as
# non-null counts below the row count).
# info() prints the report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
train_values.info()

In [None]:
# Show each test column's dtype and non-null count (missing values show up as
# non-null counts below the row count).
# info() prints the report itself and returns None, so wrapping it in print()
# only appended a stray "None" line to the output.
test_values.info()

In [None]:
# Start from every training column name, then drop the ones that are not label-encoded
# by the encoding loop further down. list.remove() raises ValueError if a name is absent,
# so a missing column is reported immediately.
column_labels = list(train_values.columns.values)
for excluded_name in (
    "id",
    "amount_tsh",
    "date_recorded",
    "gps_height",
    "longitude",
    "latitude",
    "num_private",
    "region_code",
    "district_code",
    "population",
    "construction_year",
):
    column_labels.remove(excluded_name)

In [None]:
# Fill gaps in the numeric test columns with each column's median.
# numeric_only=True is required on pandas 2.0+, where median() no longer skips
# non-numeric columns silently and raises TypeError on them instead.
test_values = test_values.fillna(test_values.median(numeric_only=True))

In [None]:
def _encode_categories(column, code_of):
    """Return `column` with every value found in `code_of` replaced by its integer code.

    Values that have no code (missing entries) are passed through unchanged.
    The result is built as an object Series so the codes stay Python ints
    alongside any remaining NaN.
    """
    encoded = [code_of.get(value, value) for value in column]
    return pd.Series(encoded, index=column.index, dtype=object)


# Label-encode every column listed in `column_labels`, using one shared code table
# per column for both the training and the test frame.
for col in column_labels:
    # Categories are taken in order of first appearance (train rows, then test rows),
    # which makes the codes reproducible; iterating a set() of strings is not.
    combined = pd.concat([train_values[col], test_values[col]], ignore_index=True)
    categories = combined.dropna().unique()
    code_of = {category: code for code, category in enumerate(categories)}
    size = len(categories)
    print(size)  # number of distinct non-missing categories in this column
    # Encode from the original values in a single pass, so an already-assigned code
    # can never be mistaken for a raw value (e.g. 0 == False) and re-encoded.
    train_values[col] = _encode_categories(train_values[col], code_of)
    test_values[col] = _encode_categories(test_values[col], code_of)

# Median-fill the numeric columns. numeric_only=True is required on pandas 2.0+,
# where median() raises TypeError on non-numeric columns instead of skipping them.
train_values = train_values.fillna(train_values.median(numeric_only=True))
test_values = test_values.fillna(test_values.median(numeric_only=True))

In [None]:
# Write the encoded train/test frames to CSV, with a header row and without the index column.
for encoded_frame, csv_target in (
    (train_values, "./datasets/train.csv"),
    (test_values, "./datasets/test.csv"),
):
    encoded_frame.to_csv(csv_target, index=False, header=True)

In [None]:
# Collect the training column names and print the list followed by how many there are.
col_names = [*train_values.columns]
print(col_names, len(col_names), sep="\n")

In [None]:
def _interpolate_then_mark(frame):
    """Forward linear interpolation, then mark whatever is still missing as "NODATA"."""
    interpolated = frame.interpolate(method='linear', limit_direction='forward')
    return interpolated.fillna("NODATA")


filled_train_val = _interpolate_then_mark(train_values)
filled_test_val = _interpolate_then_mark(test_values)

In [None]:
# Report dtypes and non-null counts of the gap-filled training frame.
# info() prints the report itself and returns None; print() around it only added a stray "None".
filled_train_val.info()

In [None]:
# Report dtypes and non-null counts of the gap-filled test frame.
# info() prints the report itself and returns None; print() around it only added a stray "None".
filled_test_val.info()

In [None]:
# Positions (within col_names) of the columns carried forward; position 0 is looked up
# and later used as the "id" index column.
selected_positions = [
    0, 1, 4, 6, 7, 9, 13, 14, 17, 18, 20, 22, 24, 25,
    26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
]
selected_cols = [col_names[position] for position in selected_positions]
print(selected_cols, len(selected_cols), sep="\n")

In [None]:
# Keep only the selected columns of each filled frame and use "id" as the row index.
selected_train_val = filled_train_val.loc[:, selected_cols].set_index("id")
selected_test_val = filled_test_val.loc[:, selected_cols].set_index("id")

In [None]:
# Display describe() summary statistics for the selected training features.
selected_train_val.describe()

In [None]:
# Display describe() summary statistics for the selected test features.
selected_test_val.describe()

In [None]:
# Min-max scale every non-object column of the selected training features to [0, 1].
# A copy is taken so that `selected_train_val` keeps its unscaled values and this cell
# can be re-run safely.
normalize_train = selected_train_val.copy()
# .items() replaces .iteritems() (removed in pandas 2.0); the builtin `object`
# replaces the np.object alias (removed in NumPy 1.24).
for name, data in normalize_train.items():
    if data.dtype == object:
        continue  # object columns are handled by the category mapping, not scaled
    low = data.min()
    span = data.max() - low
    # A constant column has zero span; scale it to 0.0 rather than dividing by zero,
    # which would turn the whole column into NaN.
    normalize_train[name] = (data - low) / span if span != 0 else 0.0

In [None]:
# Display describe() summary statistics for the training features after min-max scaling.
normalize_train.describe()

In [None]:
# Min-max scale every non-object column of the selected test features to [0, 1].
# A copy is taken so that `selected_test_val` keeps its unscaled values and this cell
# can be re-run safely.
# NOTE(review): each test column is scaled with its own min/max, not the training
# column's, so train and test end up on slightly different scales. Confirm this is intended.
normalize_test = selected_test_val.copy()
# .items() replaces .iteritems() (removed in pandas 2.0); the builtin `object`
# replaces the np.object alias (removed in NumPy 1.24).
for name, data in normalize_test.items():
    if data.dtype == object:
        continue  # object columns are handled by the category mapping, not scaled
    low = data.min()
    span = data.max() - low
    # A constant column has zero span; scale it to 0.0 rather than dividing by zero,
    # which would turn the whole column into NaN.
    normalize_test[name] = (data - low) / span if span != 0 else 0.0

In [None]:
# Display describe() summary statistics for the test features after min-max scaling.
normalize_test.describe()

In [None]:
# Build, for every object-dtype training column, a {value: integer code} table in which
# "NODATA" always gets code 0.
train_unique = {}
# .items() replaces .iteritems() (removed in pandas 2.0); the builtin `object`
# replaces the np.object alias (removed in NumPy 1.24).
for name, data in normalize_train.items():
    if data.dtype != object:
        continue
    # Remaining values are numbered from 1 in order of first appearance. The previous
    # list(set(...)) round-trip numbered them in set-iteration order, which is not
    # reproducible across runs when strings are present.
    categories = [value for value in data.unique() if value != "NODATA"]
    codes = {"NODATA": 0}
    codes.update((value, code) for code, value in enumerate(categories, start=1))
    train_unique[name] = codes

In [None]:
# Replace the values of the object-dtype columns with the integer codes in `train_unique`.
# The same training-derived table is applied to both the train and the test frame.
object_normalize_train = normalize_train.select_dtypes(include=['object']).replace(train_unique)
object_normalize_test = normalize_test.select_dtypes(include=['object']).replace(train_unique)

In [None]:
# Print the info() report for the normalized training frame; the 0 is passed
# positionally as info()'s first parameter, `verbose`.
normalize_train.info(0)

In [None]:
# Join the float64 (scaled) training columns with the re-coded object columns on the row index.
scaled_train_part = normalize_train.select_dtypes(include=['float64'])
train = scaled_train_part.merge(
    object_normalize_train,
    left_index=True,
    right_index=True,
)


In [None]:
# Join the float64 (scaled) test columns with the re-coded object columns on the row index.
scaled_test_part = normalize_test.select_dtypes(include=['float64'])
test = scaled_test_part.merge(
    object_normalize_test,
    left_index=True,
    right_index=True,
)


In [None]:
# Write the final train/test frames to CSV, keeping the index as a column and a header row.
# These paths are the same ones the earlier encoded frames were written to, so those files are replaced.
for final_frame, csv_target in (
    (train, "./datasets/train.csv"),
    (test, "./datasets/test.csv"),
):
    final_frame.to_csv(csv_target, index=True, header=True)

# Check that the `id` column of the formatted training dataset matches the `id` column of the training labels dataset.

In [None]:
# Re-read the saved training CSV and compare its "id" column with the labels' "id" column;
# the boolean result is shown as the cell output.
saved_train_ids = pd.read_csv("./datasets/train.csv")["id"]
saved_train_ids.equals(train_labels["id"])

## Save dataset

In [None]:
# Encode the three status_group classes as integers.
label_unique = {"functional": 0, "functional needs repair" : 1, "non functional" : 2 }
status = train_labels["status_group"]
# Vectorised lookup instead of a per-row loop over .iteritems(), which was removed in pandas 2.0.
labels = status.map(label_unique)
# map() yields NaN for values missing from the table; raise KeyError for them,
# as the dictionary lookup in the per-row loop did.
unmapped = status[labels.isna()].unique()
if len(unmapped) > 0:
    raise KeyError(f"status_group values without an integer code: {list(unmapped)}")
# Every value is mapped at this point, so the codes can be stored as integers.
labels = labels.astype("int64")

In [None]:
# Re-read the saved training CSV and attach the original status_group label column,
# aligned on the row index.
merged_train = pd.read_csv("./datasets/train.csv").assign(
    status_group=train_labels["status_group"]
)

In [None]:
# Display describe() summary statistics for the encoded label series.
labels.describe()


In [None]:
# Write the encoded labels to ./datasets/int_labels.csv with a header row and without the index column.
labels.to_csv("./datasets/int_labels.csv", index=False, header=True)

In [None]:
# Rebuild merged_train: re-read the saved training CSV and attach the original
# status_group label column, aligned on the row index.
merged_train = pd.read_csv("./datasets/train.csv").assign(
    status_group=train_labels["status_group"]
)