In [1]:
import geopandas as gpd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import datasets
from sklearn.metrics import classification_report
import scipy.optimize as op

In [2]:
# Read csvs

train_df = gpd.read_file('train.geojson', index_col=0)
test_df = gpd.read_file('test.geojson', index_col=0)

In [4]:
# Filtering columns on training set
le = LabelEncoder()

change_type_map = {'Demolition': 0, 'Road': 1, 'Residential': 2, 'Commercial': 3, 'Industrial': 4, 'Mega Projects': 5}
y_train = pd.DataFrame(train_df['change_type'].apply(lambda x: change_type_map[x]))

def get_x(df):
    x = pd.DataFrame()

    # status dates
    for i in range(1,6):
        x['change_status_date{}'.format(i)] = le.fit_transform(df['change_status_date{}'.format(i)])

    # dates
    for i in range(1,6):
        x['year{}'.format(i)] = le.fit_transform(df['date{}'.format(i)].transform(lambda x: x[-4:-1]+x[-1]))
    for i in range(1,6):
        x['month{}'.format(i)] = le.fit_transform(df['date{}'.format(i)].transform(lambda x: x[-7:-5]))

    # urban types
    x['urban_types'] = le.fit_transform(df['urban_types'])

    # geography types
    geography_types_map = {'River': 0,'Sparse Forest': 1,'Grass Land': 2,'Farms': 3,'Lakes': 4,'Barren Land': 5,'Coastal': 6,'Dense Forest': 7,'None': 8,'Hills': 9,'Desert': 10,'Snow': 11}
    for geography_type in geography_types_map:
        x[geography_type] = 0

    for i, geography_features in enumerate(df.loc[:, ('geography_types')]):
        features = geography_features.split(',')
        for feature in features:
            x[feature].loc[i] = 1

    # geometry features
    # area
    x['area'] = df[['geometry']].area
    # perimeter
    x['perimeter'] = df[['geometry']].length
    # centroid of the polygon
    x['x_centroid'] = df[['geometry']].centroid.x
    x['y_centroid'] = df[['geometry']].centroid.y
    # length on the x and y axis
    x['delta_x'] = df[['geometry']].bounds.maxx.subtract(df[['geometry']].bounds.minx)
    x['delta_y'] = df[['geometry']].bounds.maxy.subtract(df[['geometry']].bounds.miny)
    # angle of the diagonal of the rectangle made by delta_x and delta_y
    #x['angle_diago'] = (x['delta_x'].div((x['delta_x'].apply(lambda x: x**2) + x['delta_y'].apply(lambda x: x**2)).apply(np.sqrt))).apply(np.arccos)

    return x

x_train = get_x(train_df)
x_test = get_x(test_df)
print(x_train.shape)
print(x_test.shape)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)

  x['area'] = df[['geometry']].area

  x['perimeter'] = df[['geometry']].length

  x['x_centroid'] = df[['geometry']].centroid.x

  x['y_centroid'] = df[['geometry']].centroid.y


(310006, 34)
(121704, 34)


In [66]:
random_state = 42
x_train_split, x_test_split, y_train_split, y_test_split = train_test_split(x_train, y_train, test_size=0.3, random_state=random_state)
x_train_split, x_test_split, y_train_split, y_test_split=x_train_split.values[:200,:], x_test_split.values[:200,:], y_train_split.values[:200,:], y_test_split.values[:200,:]
X,y=x_train_split,y_train_split
theta = [0]*x_train_split.shape[1]

In [67]:
def sigmoid(x):
    # Activation function used to map any real value between 0 and 1
    return 1 / (1 + np.exp(-x))

def net_input(theta, x):
    # Computes the weighted sum of inputs
    return np.dot(x, theta)

def probability(theta, x):
    # Returns the probability after passing through sigmoid
    return sigmoid(net_input(theta, x))

In [68]:
def cost_function(self, theta, x, y):
    # Computes the cost function for all the training samples
    m = x.shape[0]
    total_cost = -(1 / m) * np.sum(
        y * np.log(probability(theta, x)) + (1 - y) * np.log(
            1 - probability(theta, x)))
    return total_cost

def gradient(self, theta, x, y):
    # Computes the gradient of the cost function at the point theta
    m = x.shape[0]
    return (1 / m) * np.dot(x.T, sigmoid(net_input(theta,   x)) - y)

In [69]:
from scipy.optimize import fmin_tnc

In [70]:

def fit(self, x, y, theta):
    opt_weights = fmin_tnc(func=cost_function, x0=theta,
                  fprime=gradient,args=(x, y.flatten()))
    return opt_weights[0]

parameters = fit(X, y,theta)

TypeError: fit() missing 1 required positional argument: 'theta'

In [41]:
# Sigmoid function
def sigmoid(z):
    return 1/(1+np.exp(-z))

# Logit function
def logit(z):
    return np.log(z/(1-z))

In [42]:
# Dependence on X,y is implicit
# X, y are defined globally
def computeCost(theta):
    J1 = np.dot(y_train_split.T, np.log(sigmoid(x_train_split.dot(theta))))
    J2 = np.dot((1-y_train_split).T,np.log(1-sigmoid(x_train_split.dot(theta))))
    return (-1/x_train.shape[0]) * (J1 + J2)

In [43]:
# Dependence on X,y is implicit
# X, y are defined globally
def computeGrad(theta):
    # Computes the gradient of the cost with respect to
    # the parameters.
    
    return (-1/x_train_split.shape[0]) * np.dot(x_train_split.T, (y_train_split - sigmoid(x_train_split.dot(theta))))

In [52]:
# Initial value
theta = [0]*x_train_split.shape[1]

# Run minimize() to obtain the optimal theta
#Result = op.minimize(fun=computeCost, x0=theta, method = 'TNC', jac=computeGrad);
parameters=fmin_tnc(func=computeCost, x0=theta, fprime=computeGrad, approx_grad=True, args=(x_train_split, y_train_split.flatten()))[0]
#theta = Result.x
parameters

NameError: name 'fmin_tnc' is not defined

In [None]:
def predict(theta, X):
    # Predict whether the label is 0 or 1 using learned logistic 
    # regression parameters theta. The threshold is set at 0.5
    
    prob = sigmoid(X.dot(theta))
    pred = np.where(prob >= 0.5, 1, 0)
    return pred

In [None]:
# Compute accuracy on the training set
p = predict(Result.x, x_test_split)
counter = 0
for i in range(y_test_split.size):
    if p[i] == y_test_split[i]:
        counter += 1
print('Train Accuracy: {:.2f}'.format(counter / float(y_test_split.size) * 100.0))