In [1]:
import geopandas as gpd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import datasets
from sklearn.metrics import classification_report

In [2]:
# Read csvs

train_df = gpd.read_file('train.geojson', index_col=0)
test_df = gpd.read_file('test.geojson', index_col=0)

In [3]:
# Filtering columns on training set
le = LabelEncoder()

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4, 'Mega Projects': 5}
y_train = pd.DataFrame(train_df['change_type'].apply(lambda x: change_type_map[x]))

def get_x(df):
    x = pd.DataFrame()

    # status dates
    for i in range(1,6):
        x['change_status_date{}'.format(i)] = le.fit_transform(df['change_status_date{}'.format(i)])

    # dates
    for i in range(1,6):
        x['year{}'.format(i)] = le.fit_transform(df['date{}'.format(i)].transform(lambda x: x[-4:-1]+x[-1]))
    for i in range(1,6):
        x['month{}'.format(i)] = le.fit_transform(df['date{}'.format(i)].transform(lambda x: x[-7:-5]))

    # urban types
    x['urban_types'] = le.fit_transform(df['urban_types'])

    # geography types
    geography_types_map = {'River': 0,'Sparse Forest': 1,'Grass Land': 2,'Farms': 3,'Lakes': 4,'Barren Land': 5,'Coastal': 6,'Dense Forest': 7,'None': 8,'Hills': 9,'Desert': 10,'Snow': 11}
    dic_geo = {}
    for geography_type in geography_types_map:
        dic_geo[geography_type] = np.zeros(df.shape[0])

    geo = np.array(df['geography_types'])
    for k in range(len(geo)):
        geo[k] = geo[k].split(',')
        for geo_type in geo[k]:
            dic_geo[geo_type][k] = 1

    for geography_type in geography_types_map:
        x[geography_type] = dic_geo[geography_type]

    # geometry features
    # area
    x['area'] = df[['geometry']].area
    # perimeter
    x['perimeter'] = df[['geometry']].length
    # centroid of the polygon
    x['x_centroid'] = df[['geometry']].centroid.x
    x['y_centroid'] = df[['geometry']].centroid.y
    # length on the x and y axis
    x['delta_x'] = df[['geometry']].bounds.maxx.subtract(df[['geometry']].bounds.minx)
    x['delta_y'] = df[['geometry']].bounds.maxy.subtract(df[['geometry']].bounds.miny)
    # angle of the diagonal of the rectangle made by delta_x and delta_y
    #x['angle_diago'] = (x['delta_x'].div((x['delta_x'].apply(lambda x: x**2) + x['delta_y'].apply(lambda x: x**2)).apply(np.sqrt))).apply(np.arccos)

    return x

x_train = get_x(train_df)
x_test = get_x(test_df)
print(x_train.shape)
print(x_test.shape)

(310006, 34)
(121704, 34)


In [8]:
# splitting the data
random_state = 42
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train, test_size=0.3, random_state=random_state)
x_train_split, x_test_split, y_train_split, y_test_split=x_train_split.values[:10000,:], x_test_split.values[:10000,:], y_train_split.values[:10000,:], y_test_split.values[:10000,:]

In [9]:
class CustomSVC():
    """
    Custom class to build a classifier using a sklearn.svm.SVC object 
    fitted with a precomputed kernel
    
    Attributes
    ----------
    C: float
        Regularization parameter for SVC
    func_kernel:
        The kernel function used to compute the kernel matrix
    args_kernel:
        Positional arguments for func_kernel
    kwargs_kernel:
        Keyword arguments for func_kernel
        
    Methods
    -------
    fit(self, X, y):
        Fit the internal SVC object
    predict(self, X):
        Make predictions on the raw data X using the kernel_func and the SVC
    """
    def __init__(self, C, func_kernel, *args_kernel, **kwargs_kernel):
        self.C = C
        self.func_kernel = func_kernel
        self.args_kernel = args_kernel
        self.kwargs_kernel = kwargs_kernel
        
    def _get_K(self, X1, X2):
        return self.func_kernel(X1, X2, *self.args_kernel, **self.kwargs_kernel)
        
    def fit(self, X, y):
        self._svc = SVC(C=self.C, kernel="precomputed")
        self._Xtrain = X
        self._svc = self._svc.fit(self._get_K(X,X), y)
        return self
        
    def predict(self, X):
        return self._svc.predict(self._get_K(X, self._Xtrain))

In [10]:
def linear_kernel(X1, X2):
    """ 
    Computes the linear kernel matrix between every row of X1 and every row of X2. 
    
    Parameters
    ----------
    X1: array-like, (m, p)
    X2: array-like, (n, p)
    
    Returns
    -------
    K: array-like, (m, n)
    """
    # YOUR CODE HERE
    K = X1.dot(X2.T)
    return K

In [11]:

linear_svc = CustomSVC(1, linear_kernel)
linear_svc = linear_svc.fit(x_train_split,y_train_split)

  return f(**kwargs)


In [12]:
y_pred_linear = linear_svc.predict(x_test_split)
print('Classification report on training Linear SVC for Data 1 (trainset)')
print(classification_report(y_test_split, y_pred_linear, labels=np.unique(y_test_split)))

Classification report on training Linear SVC for Data 1 (trainset)
              precision    recall  f1-score   support

           0       0.50      0.20      0.29      1010
           1       0.00      0.00      0.00       469
           2       0.59      0.86      0.70      5311
           3       0.52      0.30      0.38      3156
           4       0.00      0.00      0.00        47
           5       0.00      0.00      0.00         7

    accuracy                           0.57     10000
   macro avg       0.27      0.23      0.23     10000
weighted avg       0.53      0.57      0.52     10000



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
from scipy.spatial.distance import cdist

In [14]:
def gaussian_kernel(X1, X2, gamma=1):
    """ 
    Computes the linear kernel matrix between every row of X1 and every row of X2. 
    
    Parameters
    ----------
    X1: array-like, (m, p)
    X2: array-like, (n, p)
    
    Returns
    -------
    K: array-like, (m, n)
    """
    # YOUR CODE HERE
    distance = cdist(X1, X2, 'euclidean')
    K = np.exp(- gamma * distance)
    return K

In [15]:
C = 1e6
gamma = 1
gaussian_svc = CustomSVC(C, gaussian_kernel, gamma=1)
gaussian_svc = gaussian_svc.fit(x_train_split,y_train_split)

  return f(**kwargs)


In [16]:
y_pred_gaussian = gaussian_svc.predict(x_test_split)
print('Classification report on training Linear SVC for Data 1 (trainset)')
print(classification_report(y_test_split, y_pred_gaussian, labels=np.unique(y_train_split)))

Classification report on training Linear SVC for Data 1 (trainset)
              precision    recall  f1-score   support

           0       0.79      0.22      0.34      1010
           1       0.21      0.01      0.02       469
           2       0.58      0.89      0.70      5311
           3       0.59      0.30      0.40      3156
           4       0.33      0.06      0.11        47
           5       0.00      0.00      0.00         7

    accuracy                           0.59     10000
   macro avg       0.42      0.25      0.26     10000
weighted avg       0.59      0.59      0.54     10000



  _warn_prf(average, modifier, msg_start, len(result))


Grid Search Exploration for C and  𝛾

In [17]:
from sklearn.model_selection import train_test_split, GridSearchCV

In [18]:
param_grid = {"C": np.logspace(-3, 3, 30),
              "gamma": np.logspace(2, 4, 30)}

grid = GridSearchCV(estimator=SVC(kernel="rbf"),
                    param_grid=param_grid,
                    scoring="accuracy",
                    return_train_score=True,
                    refit=True,
                    cv=5, verbose=1, n_jobs=4)

grid = grid.fit(x_train_split, y_train_split)
print("best params %s" % grid.best_params_)
print("best score %.3f" % grid.best_score_)

Fitting 5 folds for each of 900 candidates, totalling 4500 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:  4.5min
[Parallel(n_jobs=4)]: Done 192 tasks      | elapsed: 18.4min
[Parallel(n_jobs=4)]: Done 442 tasks      | elapsed: 54.5min


KeyboardInterrupt: 

In [None]:
# performance
y_pred_train_grid = grid.best_estimator_.predict(x_train_split)
y_pred_test_grid = grid.best_estimator_.predict(x_test_split)

print("Classification report train ...")
print(classification_report(y_train_split, y_pred_train_grid, labels=np.unique(y_train_split)))

print("Classification report test ...")
print(classification_report(y_test_split, y_pred_test_grid, labels=np.unique(y_test_split)))

Classification report train ...
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        20
           1       1.00      1.00      1.00         7
           2       0.99      1.00      1.00       117
           3       1.00      0.98      0.99        55
           4       1.00      1.00      1.00         1

    accuracy                           0.99       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      0.99      0.99       200

Classification report test ...
              precision    recall  f1-score   support

           0       0.00      0.00      0.00        15
           1       0.00      0.00      0.00         5
           2       0.61      1.00      0.76       121
           3       1.00      0.02      0.03        59

    accuracy                           0.61       200
   macro avg       0.40      0.25      0.20       200
weighted avg       0.66      0.61      0.47       200



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
