In [1]:
# Import dependencies
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures

In [2]:
# Read the train / test GeoJSON files into GeoDataFrames.
# `index_col` is a pandas.read_csv option, not a geopandas.read_file parameter,
# so it is not passed here.

train_df = gpd.read_file('train.geojson')
test_df = gpd.read_file('test.geojson')

In [3]:
# Preprocessing of the data
# Shared encoder; get_x() refits it for every categorical column it encodes.
le = LabelEncoder()

# Integer id assigned to each target class name.
change_type_map = {
    'Demolition': 0,
    'Road': 1,
    'Residential': 2,
    'Commercial': 3,
    'Industrial': 4,
    'Mega Projects': 5,
}
# Target as a single-column DataFrame; an unknown class name raises KeyError.
y_train = train_df['change_type'].apply(change_type_map.__getitem__).to_frame()

def get_x(df):
    """Build the feature table used by the model from a GeoDataFrame.

    Parameters
    ----------
    df : GeoDataFrame
        Must provide the columns ``change_status_date1`` .. ``change_status_date5``,
        ``urban_types`` and ``geometry``.

    Returns
    -------
    pandas.DataFrame
        One row per input row (default ``RangeIndex``) with 13 columns: the six
        label-encoded categorical columns followed by ``area``, ``perimeter``,
        ``x_centroid``, ``y_centroid``, ``delta_x``, ``delta_y`` and
        ``angle_diago``.

    Notes
    -----
    The module-level encoder ``le`` is refit on every column of every call, so
    the integer codes produced for two different frames (e.g. train and test)
    only agree if both frames contain the same set of labels in that column.
    """
    features = {}

    # status dates
    for i in range(1, 6):
        column = 'change_status_date{}'.format(i)
        features[column] = le.fit_transform(df[column])

    # urban types
    features['urban_types'] = le.fit_transform(df['urban_types'])

    # geometry features
    # Every geometry-derived Series is converted to a plain array so it is
    # stored positionally (row i of df -> row i of the result) instead of being
    # aligned on df's index, which would yield NaN for a non-default index.
    geometry = df['geometry']
    centroid = geometry.centroid   # computed once, used for both coordinates
    bounds = geometry.bounds       # computed once, used for both extents

    # area
    features['area'] = geometry.area.to_numpy()
    # perimeter
    features['perimeter'] = geometry.length.to_numpy()
    # centroid of the polygon
    features['x_centroid'] = centroid.x.to_numpy()
    features['y_centroid'] = centroid.y.to_numpy()
    # length on the x and y axis (extent of the bounding box)
    delta_x = (bounds['maxx'] - bounds['minx']).to_numpy()
    delta_y = (bounds['maxy'] - bounds['miny']).to_numpy()
    features['delta_x'] = delta_x
    features['delta_y'] = delta_y
    # angle between the diagonal and the base of the bounding box.
    # arctan2(dy, dx) equals arccos(dx / hypot(dx, dy)) for dx, dy >= 0, but is
    # also defined (0.0) for a zero-size box where the arccos form gives 0/0 = NaN.
    features['angle_diago'] = np.arctan2(delta_y, delta_x)

    return pd.DataFrame(features)

# Build the feature tables for both splits with the same recipe.
x_train = get_x(df=train_df)
x_test = get_x(df=test_df)
# Show (rows, columns) for each split, one per line.
print(x_train.shape, x_test.shape, sep='\n')

(310006, 13)
(121704, 13)


In [4]:
# Add pairwise interaction terms (degree 2, products of distinct features only)
# so the model can use combinations of the raw features.
# NOTE: x_train / x_test are overwritten, so re-running this cell without
# re-running the previous one expands the features a second time.

poly = PolynomialFeatures(2, interaction_only=True)
x_train = poly.fit_transform(x_train)
# Apply the transformer fitted on the training data; never refit on the test set.
x_test = poly.transform(x_test)

In [5]:
# Fit random forest classifier

rf_clf = RandomForestClassifier(max_depth=15, random_state=0)
# y_train is a single-column DataFrame; scikit-learn expects a 1-D target and
# emits a DataConversionWarning for a column vector, so flatten it first.
rf_clf.fit(x_train, np.ravel(y_train))

# Predict and assess
# NOTE: this score is computed on the data the model was fitted on, so it is a
# training accuracy and not an estimate of performance on unseen data.

y_pred_train_rf = rf_clf.predict(x_train)
print(accuracy_score(y_train, y_pred_train_rf))

  rf_clf.fit(x_train, y_train)


0.8155616342909492


In [6]:
# Save results to submission file

def save_results(clf, name_file, x_kaggle=None):
    """Predict with ``clf`` and write the predictions to a submission CSV.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict(X)``.
    name_file : str
        Prefix of the output file; the file written is
        ``name_file + "_sample_submission.csv"``.
    x_kaggle : array-like, optional
        Feature matrix to predict on. When omitted, the module-level ``x_test``
        is looked up at call time, so the current test features are always
        used rather than whatever ``x_test`` was when this function was defined.

    The CSV has two columns: ``Id`` (the row position) and ``change_type``.
    """
    if x_kaggle is None:
        x_kaggle = x_test
    y_pred_kaggle = pd.DataFrame(clf.predict(x_kaggle), columns=['change_type'])
    y_pred_kaggle.to_csv(name_file+"_sample_submission.csv", index=True, index_label='Id')

# Write the random-forest predictions to "random_forest_sample_submission.csv".
save_results(clf=rf_clf, name_file="random_forest")