In [1]:
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from datetime import date
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

In [2]:
# Read the GeoJSON datasets
# `index_col` is a pandas.read_csv option, not a geopandas.read_file
# parameter, so it is not passed here; read_file already returns a default
# integer index.
train_df = gpd.read_file('train.geojson')
test_df = gpd.read_file('test.geojson')


In [30]:
# Filtering columns on sets
# Module-level label encoder instance
le = LabelEncoder()

# Integer class id assigned to each change-type label
change_type_map = {
    'Demolition': 0,
    'Road': 1,
    'Residential': 2,
    'Commercial': 3,
    'Industrial': 4,
    'Mega Projects': 5,
}
# Single-column frame of class ids; a label missing from the map raises KeyError
y_train = train_df['change_type'].apply(change_type_map.__getitem__).to_frame()

def _encode_labels(values, column, encoders):
    """Integer-encode `values`, reusing the encoder stored for `column` if any.

    The first time `column` is seen, a new LabelEncoder is fitted on `values`
    and stored in `encoders`. On later calls the stored classes are reused, so
    a given label always maps to the same integer; labels that are absent from
    the fitted classes are encoded as -1.
    """
    if column not in encoders:
        encoders[column] = LabelEncoder()
        return encoders[column].fit_transform(values)
    # get_indexer returns the position of each value in the fitted classes
    # (the same integers LabelEncoder.transform would give) and -1 when absent.
    return pd.Index(encoders[column].classes_).get_indexer(values)


def get_x(df, encoders=None):
    """Build the model feature table from a GeoDataFrame.

    Parameters
    ----------
    df : GeoDataFrame
        Must contain `change_status_date1` .. `change_status_date5`,
        `urban_types` and `geometry`.
    encoders : dict, optional
        Registry mapping column name -> fitted LabelEncoder. It is filled in
        place on the first call and reused on later calls, so pass the same
        dict for the training and the test set to get a consistent encoding.
        When omitted, every call fits fresh encoders on `df` alone.

    Returns
    -------
    pandas.DataFrame
        13 columns indexed like `df`: the 5 encoded status columns, the
        encoded urban type, and 7 geometry features (area, perimeter,
        centroid x/y, bounding-box width/height, bounding-box diagonal angle).
    """
    # Feature ideas that were left disabled in the original cell: year/month
    # of date1..date5, day intervals between consecutive dates, total project
    # time, one-hot geography types, and a polygon vertex-angle feature.
    if encoders is None:
        encoders = {}
    x = pd.DataFrame(index=df.index)

    # status dates
    for i in range(1, 6):
        column = 'change_status_date{}'.format(i)
        x[column] = _encode_labels(df[column], column, encoders)

    # urban types
    x['urban_types'] = _encode_labels(df['urban_types'], 'urban_types', encoders)

    # geometry features (bounds and centroid are computed once and reused)
    geometry = df['geometry']
    bounds = geometry.bounds
    centroid = geometry.centroid
    x['area'] = geometry.area
    x['perimeter'] = geometry.length
    x['x_centroid'] = centroid.x
    x['y_centroid'] = centroid.y
    # extent of the bounding box along each axis
    x['delta_x'] = bounds.maxx - bounds.minx
    x['delta_y'] = bounds.maxy - bounds.miny
    # Angle of the bounding-box diagonal. Both deltas are non-negative, so
    # arctan2(dy, dx) equals arccos(dx / hypot(dx, dy)) but yields 0 instead of
    # NaN when the bounding box is degenerate (dx == dy == 0).
    x['angle_diago'] = np.arctan2(x['delta_y'], x['delta_x'])

    return x

# One registry shared by both calls so the test set is encoded with the
# classes learned on the training set.
feature_encoders = {}
x_train = get_x(train_df, feature_encoders)
x_test = get_x(test_df, feature_encoders)
print(x_train.shape)
print(x_test.shape)

(310006, 13)
(121704, 13)


In [31]:
# Resampling to tackle unbalanced data
# Both switches are off, so neither branch below runs by default.
over_sampling = False
under_sampling = False

if over_sampling:
    # imblearn is imported inside the branch, so it is only needed when the flag is on
    from imblearn.over_sampling import SMOTE
    from imblearn.combine import SMOTEENN, SMOTETomek

    # Candidate resamplers; index 2 (SMOTETomek) is the one applied
    samplers = [SMOTE(random_state=0), SMOTEENN(random_state=0), SMOTETomek(random_state=0)]
    sampler = samplers[2]
    # NOTE(review): x_res / y_res are not referenced by any other cell visible
    # here, so enabling this flag does not change what the classifiers train on.
    x_res, y_res = sampler.fit_resample(x_train, y_train)
    print(x_res.shape)

if under_sampling:
    from imblearn.pipeline import make_pipeline
    from imblearn.metrics import classification_report_imbalanced
    from imblearn.under_sampling import NearMiss
    from sklearn.preprocessing import StandardScaler
    from sklearn.linear_model import LogisticRegression

    # Create a pipeline
    # Steps, in order: NearMiss(version=2), StandardScaler, LogisticRegression
    pipeline = make_pipeline(NearMiss(version=2), StandardScaler(), LogisticRegression(random_state=42))
    # NOTE(review): the fitted pipeline is not used by any other cell visible here.
    pipeline.fit(x_train, y_train)

    # Classify and report the results
    # print(classification_report_imbalanced(y_test, pipeline.predict(x_test)))

In [32]:
# Optional expansion with pairwise interaction features.
# NOTE: this cell overwrites x_train / x_test, so re-run the feature cell
# before running it a second time.
polynomial_features = True
if polynomial_features:
    from sklearn.preprocessing import PolynomialFeatures

    # Degree 2 with interaction_only=True: bias column, the original features
    # and their pairwise products (no squared terms).
    poly = PolynomialFeatures(2, interaction_only=True)
    x_train = poly.fit_transform(x_train)
    # Reuse the transformer fitted on the training features instead of
    # re-fitting it on the test set.
    x_test = poly.transform(x_test)
    print(x_train.shape)

(310006, 92)


In [33]:
# Selecting the important features with PCA

# Optional sweep over the number of components (disabled by default)
gridsearch = False
indices = range(1, x_train.shape[1], 10)
if gridsearch:
    s = []
    for n in indices:
        pca = PCA(n_components=n)
        pca.fit(x_train)
        # PCA.score: average log-likelihood of the samples under the fitted model
        s.append(pca.score(x_train))

    plt.plot(list(indices), s)
    plt.xlabel('n_components')
    plt.ylabel('average log-likelihood')

# Optional dimensionality reduction (disabled by default)
do_PCA = False
if do_PCA:
    pca = PCA(n_components=20)
    x_train_reduced = pca.fit_transform(x_train)
    # Project the test set onto the components learned from the training set;
    # re-fitting on the test set would produce a different, incompatible basis.
    x_test_reduced = pca.transform(x_test)
    print(x_test_reduced.shape)

In [34]:
# Hold out 30% of the labelled rows for validation
random_state = 42
(
    x_train_split,
    x_test_split,
    y_train_split,
    y_test_split,
) = train_test_split(
    x_train,
    y_train,
    test_size=0.3,
    random_state=random_state,
)
# Displayed as the cell output
x_train_split.shape

(217004, 92)

In [35]:
# Choosing the classifier
# Switches read by the model cells below; only the random forest is enabled.
tree, random_forest, adaboost = False, True, False

In [36]:
# Simple decision tree

if tree:
    # Depth-limited tree fitted on the training part of the split
    clf = DecisionTreeClassifier(max_depth=8).fit(x_train_split, y_train_split)

    # predict on both parts of the split
    y_pred_train, y_pred_test = (
        clf.predict(part) for part in (x_train_split, x_test_split)
    )

    # accuracy on the training part first, then on the held-out part
    for y_true, y_hat in ((y_train_split, y_pred_train), (y_test_split, y_pred_test)):
        print(accuracy_score(y_true, y_hat))


In [37]:
# Random forest

if random_forest:
    rf_clf = RandomForestClassifier(max_depth=15, random_state=0)
    # y_train is a single-column frame; flatten it so the estimator receives
    # the 1-D target it expects (avoids sklearn's DataConversionWarning).
    rf_clf.fit(x_train, np.ravel(y_train))

    # predict & assess
    # NOTE: the model is fitted on all of x_train, so the number printed below
    # is training accuracy, not a held-out estimate.
    y_pred_train_rf = rf_clf.predict(x_train)
    # y_pred_test_rf = rf_clf.predict(x_test_split)

    print(accuracy_score(y_train, y_pred_train_rf))
    # print(accuracy_score(y_test_split, y_pred_test_rf))

In [40]:
# Adaboost
from sklearn.ensemble import AdaBoostClassifier

if adaboost:
    ab_clf = AdaBoostClassifier(n_estimators=50, random_state=0)
    # y_train is a single-column frame; flatten it so the estimator receives
    # the 1-D target it expects (avoids sklearn's DataConversionWarning).
    ab_clf.fit(x_train, np.ravel(y_train))

    # predict & assess
    # NOTE: the model is fitted on all of x_train, so the number printed below
    # is training accuracy, not a held-out estimate.
    y_pred_train_ab = ab_clf.predict(x_train)
    # y_pred_test_ab = ab_clf.predict(x_test_split)

    print(accuracy_score(y_train, y_pred_train_ab))
    # print(accuracy_score(y_test_split, y_pred_test_ab))

  return f(**kwargs)


0.6508486932510984


In [42]:
# Save results to submission file

def save_results(clf, name_file, x_kaggle=None):
    """Write the predictions of `clf` to `<name_file>_sample_submission.csv`.

    Parameters
    ----------
    clf : fitted estimator exposing `predict`.
    name_file : str
        Prefix of the output path; `_sample_submission.csv` is appended.
    x_kaggle : array-like, optional
        Features to predict on. Defaults to the notebook-level `x_test`,
        looked up when the function is called (not when it is defined), so
        rebuilding `x_test` after this cell has run is picked up.

    The CSV has an `Id` column (the row position) and a `change_type` column.
    """
    if x_kaggle is None:
        # Resolved at call time to avoid a stale definition-time snapshot
        x_kaggle = x_test
    y_pred_kaggle = pd.DataFrame(clf.predict(x_kaggle), columns=['change_type'])
    y_pred_kaggle.to_csv(name_file+"_sample_submission.csv", index=True, index_label='Id')

# Export the predictions of whichever classifier is enabled. Each model
# variable only exists when its switch is on, so calling save_results on
# ab_clf unconditionally raises NameError when adaboost is False.
if adaboost:
    save_results(ab_clf, "adaboost")
elif random_forest:
    save_results(rf_clf, "random_forest")
elif tree:
    save_results(clf, "tree")