In [1]:
import geopandas as gpd
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Read csvs

train_df = gpd.read_file('train.geojson', index_col=0)
test_df = gpd.read_file('test.geojson', index_col=0)


KeyboardInterrupt: 

In [3]:
# Filtering columns on training set
le = LabelEncoder()

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4, 'Mega Projects': 5}
y_train = pd.DataFrame(train_df['change_type'].apply(lambda x: change_type_map[x]))

def get_x(df):
    x = pd.DataFrame()

    # status dates
    for i in range(1,6):
        x['change_status_date{}'.format(i)] = le.fit_transform(df['change_status_date{}'.format(i)])

    # dates
    for i in range(1,6):
        x['year{}'.format(i)] = le.fit_transform(df['date{}'.format(i)].transform(lambda x: x[-4:-1]+x[-1]))
    for i in range(1,6):
        x['month{}'.format(i)] = le.fit_transform(df['date{}'.format(i)].transform(lambda x: x[-7:-5]))

    # urban types
    x['urban_types'] = le.fit_transform(df['urban_types'])

    # geography types
    geography_types_map = {'River': 0,'Sparse Forest': 1,'Grass Land': 2,'Farms': 3,'Lakes': 4,'Barren Land': 5,'Coastal': 6,'Dense Forest': 7,'None': 8,'Hills': 9,'Desert': 10,'Snow': 11}
    for geography_type in geography_types_map:
        x[geography_type] = 0

    for i, geography_features in enumerate(df.loc[:, ('geography_types')]):
        features = geography_features.split(',')
        for feature in features:
            x[feature].loc[i] = 1

    # geometry features
    # area
    x['area'] = df[['geometry']].area
    # perimeter
    x['perimeter'] = df[['geometry']].length
    # centroid of the polygon
    x['x_centroid'] = df[['geometry']].centroid.x
    x['y_centroid'] = df[['geometry']].centroid.y
    # length on the x and y axis
    x['delta_x'] = df[['geometry']].bounds.maxx.subtract(df[['geometry']].bounds.minx)
    x['delta_y'] = df[['geometry']].bounds.maxy.subtract(df[['geometry']].bounds.miny)
    # angle of the diagonal of the rectangle made by delta_x and delta_y
    #x['angle_diago'] = (x['delta_x'].div((x['delta_x'].apply(lambda x: x**2) + x['delta_y'].apply(lambda x: x**2)).apply(np.sqrt))).apply(np.arccos)

    return x

x_train = get_x(train_df)
x_test = get_x(test_df)
print(x_train.shape)
print(x_test.shape)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x[feature][i] = 1


(310006, 35)
(121704, 35)


In [4]:
# Selecting the important features with PCA
def PCA(x_train):
    C = x_train-x_train.mean(axis=0)
    W = C.T.dot(C)

    eigval, eigvec = np.linalg.eig(W)
    # eigen_matrix = np.diag(eigval)

    k = int() # numbers of columns we keep
    s = sum(eigval)
    for i in range(len(eigval)):
        if sum(eigval[0:i])/s > 0.9: # we keep 90% of the variance explained
            k = i
            break
    print('k =',k)


    x_train_reduced = C.dot(eigvec[:,0:k])
    return x_train_reduced

x_train_reduced = PCA(x_train)
x_test_reduced = PCA(x_test)
x_test_reduced

k = 2
k = 2


Unnamed: 0,0,1
0,19.332083,8.682976
1,19.332825,8.682661
2,19.333024,8.682759
3,19.293205,8.602195
4,19.279324,8.575139
...,...,...
121699,27.501463,0.639274
121700,27.457798,0.553987
121701,27.515220,0.663227
121702,27.504204,0.644691


In [5]:
# splitting the data
random_state = 42
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train_reduced, y_train, test_size=0.3, random_state=random_state)
x_train_split

Unnamed: 0,0,1
275077,-46.607180,18.791978
85094,-8.194710,-0.123491
132322,9.252414,-1.403904
209323,8.722770,-21.205001
80605,39.802297,-3.543395
...,...,...
119879,-200.323661,-48.402624
259178,38.618355,-6.142844
131932,9.369841,-1.734225
146867,33.815799,-8.033551
