## Import Statements

In [31]:
import pandas as pd
import numpy as np

from sklearn.preprocessing import MinMaxScaler

import matplotlib.pyplot as plt
import seaborn as sbn

## Load Datasets

In [32]:
# Read the label, feature, test-feature and submission-format CSV files,
# in that order, into one DataFrame each.
(
    train_labels_df,
    train_values_df,
    test_values_df,
    submission_format_df,
) = (
    pd.read_csv(csv_path)
    for csv_path in (
        "../datasets/train_labels.csv",
        "../datasets/train_values.csv",
        "../datasets/test_values.csv",
        "../datasets/submission_format.csv",
    )
)

## Merge train data

In [33]:
# Join the training features with their labels on the shared building_id key.
train_df = pd.merge(train_values_df, train_labels_df, on="building_id")

## Function to factorize columns with dtype == np.object

In [34]:
# Column name -> unique values returned by pd.factorize for that column;
# filled in by convert_object_to_numeric.
uniques_values = {}

In [35]:
def convert_object_to_numeric(data_frame : pd.DataFrame):
    """Replace every object/string column of ``data_frame`` with integer codes.

    The frame is modified in place. Codes follow ``pd.factorize`` semantics:
    values are numbered in order of first appearance and missing values are
    encoded as ``-1``. The categories behind the codes are stored in the
    module-level ``uniques_values`` dict under the column name.

    If a column name is already present in ``uniques_values`` (for example
    because the training frame was converted first), the stored categories
    are reused so that the same value maps to the same code in every frame.
    Values that were not seen before are appended to the stored categories
    and receive the next free codes.

    Args:
        data_frame: Frame whose object/string columns are converted in place.

    Returns:
        None.
    """
    for col_name in data_frame:
        column = data_frame[col_name]
        # ``np.object`` was removed in NumPy 1.24, so rely on the pandas dtype
        # helpers; they also recognise the dedicated pandas string dtype.
        if not (
            pd.api.types.is_object_dtype(column)
            or pd.api.types.is_string_dtype(column)
        ):
            continue

        known = uniques_values.get(col_name)
        if known is None:
            codes, uniques = pd.factorize(column)
        else:
            known = pd.Index(known)
            # Keep first-appearance order for values the mapping lacks.
            unseen = pd.Index(column.dropna().unique()).difference(
                known, sort=False
            )
            uniques = known.append(unseen)
            # Missing values are absent from ``uniques`` and so map to -1.
            codes = uniques.get_indexer(pd.Index(column))

        uniques_values[col_name] = uniques
        # Assign the raw array (positional) rather than a fresh Series, which
        # would be aligned on index labels and break for non-default indexes.
        data_frame[col_name] = codes

## Convert train_df and test_values_df

In [36]:
# Replace the object-dtype columns of the merged training frame with
# integer codes (the frame is modified in place).
convert_object_to_numeric(train_df)

In [37]:
# Apply the same in-place conversion to the test features.
convert_object_to_numeric(test_values_df)

## Check for null values

In [38]:
# Total number of missing cells across the whole training frame.
train_df.isna().sum().sum()

0

In [39]:
# Total number of missing cells across the whole test frame.
test_values_df.isna().sum().sum()

0

## Save dataframes

In [40]:
# Persist the factorized frames (header row kept, index column dropped).
train_df.to_csv(
    path_or_buf="../datasets/preprocessed/train_no_object.csv",
    header=True,
    index=False,
)
test_values_df.to_csv(
    path_or_buf="../datasets/preprocessed/test_no_object.csv",
    header=True,
    index=False,
)

## Normalize large-valued columns

In [41]:
def get_max_value(data_frame : pd.DataFrame):
    """Return one ``(maximum, column_name)`` pair per column, in column order."""
    maxima = []
    for column in data_frame.columns:
        largest = data_frame[column].max()
        maxima.append((largest, column))
    return maxima

In [42]:
# Show the maximum of every training column to spot large-valued ones.
get_max_value(train_df)

[(1052934, 'building_id'),
 (30, 'geo_level_1_id'),
 (1427, 'geo_level_2_id'),
 (12567, 'geo_level_3_id'),
 (9, 'count_floors_pre_eq'),
 (995, 'age'),
 (100, 'area_percentage'),
 (32, 'height_percentage'),
 (2, 'land_surface_condition'),
 (4, 'foundation_type'),
 (2, 'roof_type'),
 (4, 'ground_floor_type'),
 (3, 'other_floor_type'),
 (3, 'position'),
 (9, 'plan_configuration'),
 (1, 'has_superstructure_adobe_mud'),
 (1, 'has_superstructure_mud_mortar_stone'),
 (1, 'has_superstructure_stone_flag'),
 (1, 'has_superstructure_cement_mortar_stone'),
 (1, 'has_superstructure_mud_mortar_brick'),
 (1, 'has_superstructure_cement_mortar_brick'),
 (1, 'has_superstructure_timber'),
 (1, 'has_superstructure_bamboo'),
 (1, 'has_superstructure_rc_non_engineered'),
 (1, 'has_superstructure_rc_engineered'),
 (1, 'has_superstructure_other'),
 (3, 'legal_ownership_status'),
 (9, 'count_families'),
 (1, 'has_secondary_use'),
 (1, 'has_secondary_use_agriculture'),
 (1, 'has_secondary_use_hotel'),
 (1, 'has

In [43]:
# Show the maximum of every test column for comparison.
get_max_value(test_values_df)

[(1052923, 'building_id'),
 (30, 'geo_level_1_id'),
 (1427, 'geo_level_2_id'),
 (12567, 'geo_level_3_id'),
 (8, 'count_floors_pre_eq'),
 (995, 'age'),
 (92, 'area_percentage'),
 (32, 'height_percentage'),
 (2, 'land_surface_condition'),
 (4, 'foundation_type'),
 (2, 'roof_type'),
 (4, 'ground_floor_type'),
 (3, 'other_floor_type'),
 (3, 'position'),
 (9, 'plan_configuration'),
 (1, 'has_superstructure_adobe_mud'),
 (1, 'has_superstructure_mud_mortar_stone'),
 (1, 'has_superstructure_stone_flag'),
 (1, 'has_superstructure_cement_mortar_stone'),
 (1, 'has_superstructure_mud_mortar_brick'),
 (1, 'has_superstructure_cement_mortar_brick'),
 (1, 'has_superstructure_timber'),
 (1, 'has_superstructure_bamboo'),
 (1, 'has_superstructure_rc_non_engineered'),
 (1, 'has_superstructure_rc_engineered'),
 (1, 'has_superstructure_other'),
 (3, 'legal_ownership_status'),
 (8, 'count_families'),
 (1, 'has_secondary_use'),
 (1, 'has_secondary_use_agriculture'),
 (1, 'has_secondary_use_hotel'),
 (1, 'has_

In [44]:
# Columns that normalize_columns rescales.
col_names = ['geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id', 'age']

In [48]:
def normalize_columns(data_frame : pd.DataFrame):
    """Rescale every column listed in ``col_names`` to the range [0, 10].

    The frame is modified in place. A fresh ``MinMaxScaler`` is fitted per
    column on the values of the frame that was passed in.

    NOTE: because the scaler is fitted on ``data_frame`` itself, two frames
    passed in separate calls are each scaled with their own minimum and
    maximum.

    Args:
        data_frame: Frame containing all columns named in ``col_names``.

    Returns:
        None.
    """
    for col_name in col_names:
        # MinMaxScaler expects a 2-D array: one column, one row per sample.
        values = data_frame[col_name].values.reshape(-1, 1)
        scaler = MinMaxScaler(feature_range=(0, 10))
        scaled_values = scaler.fit_transform(values)
        # Assign the flat array (positional) rather than a fresh Series, which
        # would be aligned on index labels and break for non-default indexes.
        data_frame[col_name] = scaled_values.ravel()

In [49]:
# Rescale the columns listed in col_names of the training frame, in place.
normalize_columns(train_df)

In [50]:
# Rescale the columns listed in col_names of the test frame, in place.
normalize_columns(test_values_df)

## Save normalized columns

In [51]:
# Persist the normalized frames (header row kept, index column dropped).
train_df.to_csv(
    path_or_buf="../datasets/preprocessed/train_normalize.csv",
    header=True,
    index=False,
)
test_values_df.to_csv(
    path_or_buf="../datasets/preprocessed/test_normalize.csv",
    header=True,
    index=False,
)