In [5]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split, cross_val_score, KFold, LeaveOneOut
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from scipy.spatial import distance_matrix
from tqdm import tqdm
from multiprocessing import Pool
import time

# Fit linear regression to the data

In [2]:
# Create a synthetic GRID_SIZE x GRID_SIZE grid with three numeric covariates,
# one categorical covariate and a response that depends only on `x`.
GRID_SIZE = 50
SEED = 42
# Seed the global NumPy RNG so the synthetic data below is identical on every
# Restart & Run All.
np.random.seed(SEED)

# Every (latitude, longitude) pair of the grid, one row per cell
latitude = np.repeat(np.arange(GRID_SIZE), GRID_SIZE)
longitude = np.tile(np.arange(GRID_SIZE), GRID_SIZE)
size = len(longitude)

# Numeric covariates, drawn uniformly on [0, 10)
x = np.random.uniform(0, 10, size)
x1 = np.random.uniform(0, 10, size)
x2 = np.random.uniform(0, 10, size)
# Categorical covariate with unequal class probabilities
data = np.random.choice(a=['a', 'b', 'c'], size=size, p=[0.5, 0.3, 0.2])
# Response: quadratic in `x` plus unit-variance Gaussian noise
response = x**2 + np.random.normal(0, 1, size)

df = pd.DataFrame({'longitude': longitude, 'latitude': latitude,
                   'x': x, 'x1': x1, 'x2': x2, 'y': response, 'class': data})

numeric_features = ['x', 'x1', 'x2']
categorical_features = ['class']
location = ['longitude', 'latitude']
# X keeps covariates + coordinates + class; y stays a one-column DataFrame
X, y = df[numeric_features + location + categorical_features], df[['y']]

# IDW using scikit-learn's KNeighborsRegressor

In [3]:
def baseline(dictionary, X_train, y_train, X_test, location):
    """Add the two baseline predictions for the first row of ``X_test``.

    Parameters
    ----------
    dictionary : dict
        Result dictionary; updated in place and also returned.
    X_train, X_test : pandas.DataFrame
        Must contain the columns listed in ``location``.
    y_train : pandas.DataFrame
        Training response; only its first column is used.
    location : list of str
        Names of the coordinate columns used as KNN inputs.

    Returns
    -------
    dict
        ``dictionary`` with two extra keys: ``'IDW'`` (distance-weighted
        nearest-neighbour prediction on the coordinates) and ``'Mean_imput'``
        (mean of the training response).
    """
    target = y_train.iloc[:, 0]
    # weights='distance' weights neighbours by inverse distance (IDW).
    # 5 is the estimator's default neighbour count; clamp it so a small
    # training set does not make predict() fail. An empty training set
    # still raises inside scikit-learn.
    idw = KNeighborsRegressor(n_neighbors=min(5, len(X_train)), weights='distance')
    # Fit on a 1-D target so predict() returns a flat array
    idw.fit(X_train[location], target)
    dictionary['IDW'] = idw.predict(X_test[location])[0]
    dictionary['Mean_imput'] = np.mean(target)
    return dictionary
    
def regression(reg_dict, dictionary, X_train, X_test, y_train,  location, string):
    """Fit every model in ``reg_dict`` and record its prediction for the first test row.

    Each model is trained on the ``location`` columns of ``X_train`` against the
    flattened ``y_train`` values. The first element of its prediction on
    ``X_test[location]`` is stored in ``dictionary`` under the model's name
    with ``string`` appended. ``dictionary`` is modified in place and returned.
    """
    for label, model in reg_dict.items():
        train_inputs = X_train[location]
        flat_target = y_train.values.ravel()
        model.fit(train_inputs, flat_target)
        test_inputs = X_test[location]
        first_prediction = model.predict(test_inputs)[0]
        dictionary[label + string] = first_prediction
    return dictionary

def IDW_per_feature(X_train, X_test, numeric_features, location):
    """Replace each numeric feature of ``X_test`` with an IDW estimate.

    For every column in ``numeric_features`` a distance-weighted
    nearest-neighbour regressor is fitted on the training coordinates and
    used to predict that column at the test coordinates.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the ``location`` columns; ``X_train`` must also contain
        every column in ``numeric_features``.
    numeric_features : list of str
        Columns to interpolate.
    location : list of str
        Coordinate columns used as KNN inputs.

    Returns
    -------
    pandas.DataFrame
        A copy of ``X_test`` whose ``numeric_features`` columns hold the
        interpolated values. ``X_test`` itself is not modified. With an empty
        ``numeric_features`` the plain copy is returned.
    """
    X_predicted = X_test.copy()
    # 5 is the estimator's default neighbour count; never ask for more
    # neighbours than there are training points.
    n_neighbors = min(5, len(X_train))
    for col in numeric_features:
        idw = KNeighborsRegressor(n_neighbors=n_neighbors, weights='distance')
        idw.fit(X_train[location], X_train[col])
        X_predicted[col] = idw.predict(X_test[location])
    # Return only after ALL features are interpolated. Returning inside the
    # loop left the remaining columns at their true test values.
    return X_predicted

def create_regression_models():
    """Return a dict of freshly constructed, unfitted regressors.

    Keys are the short labels used for the result columns: ``'rff'`` maps to a
    RandomForestRegressor and ``'xgb'`` to scikit-learn's
    GradientBoostingRegressor, both with default settings.
    """
    models = {}
    models['rff'] = RandomForestRegressor()
    models['xgb'] = GradientBoostingRegressor()
    return models

def return_index_point_outside_radius(X_train, X_test, distance, location):
    """Return the index labels of training rows farther than ``distance`` from the test point.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Must contain the ``location`` columns. Only the first row of
        ``X_test`` is used as the reference point.
    distance : float
        Radius of the exclusion buffer; rows at exactly this distance are
        treated as inside (strict ``>`` comparison).
    location : list of str
        Coordinate columns used for the Euclidean distance.

    Returns
    -------
    numpy.ndarray
        Labels from ``X_train.index`` of the rows outside the radius, in
        their original order.
    """
    # Column 0 holds the distance of every training row to the first test row
    distances = distance_matrix(X_train[location], X_test[location])[:, 0]
    # Take labels straight from X_train.index. Rebuilding them from positions
    # only works for a contiguous 0..N-1 index with one row removed.
    return X_train.index[distances > distance].to_numpy()


def return_same_class(X_train, y_train, X_test):
    """Keep only the training rows whose ``'class'`` equals that of the first ``X_test`` row.

    Returns the filtered ``X_train`` together with the rows of ``y_train``
    selected by the same index labels.
    """
    target_class = X_test['class'].iloc[0]
    same_class_mask = X_train['class'] == target_class
    X_same = X_train[same_class_mask]
    kept_labels = X_same.index.to_numpy()
    y_same = y_train.loc[kept_labels]
    return X_same, y_same
    

In [4]:
# Serial buffered leave-one-out evaluation: every grid cell is held out once
# and predicted from same-class cells lying outside the buffer radius.
loo = LeaveOneOut()
prediction_dict = {}
distance = 10  # buffer radius around the held-out cell, in coordinate units
start = time.perf_counter()
# LeaveOneOut.split yields *positional* indices, so rows are selected with
# .iloc. Passing `total` lets tqdm show real progress.
for train_index, test_index in tqdm(loo.split(X), total=loo.get_n_splits(X)):
    # Separate the data
    X_train_og, X_test_og = X.iloc[train_index], X.iloc[test_index]
    y_train_og, y_test_og = y.iloc[train_index], y.iloc[test_index]
    # Cut out the buffer here
    index_cut = return_index_point_outside_radius(X_train_og, X_test_og, distance, location)
    X_train_cut, y_train_cut = X_train_og.loc[index_cut], y_train_og.loc[index_cut]
    # Choose only the points that are in the same category as the test point
    X_train, y_train = return_same_class(X_train_cut, y_train_cut, X_test_og)
    y_test, X_test = y_test_og, X_test_og
    base_dict = {}
    # Observed value at the held-out cell
    base_dict['CSO'] = y_test.values[0][0]
    # Baseline calculation
    base_dict = baseline(base_dict, X_train, y_train, X_test, location)

    # Regression only location, no additional variables
    reg_dict = create_regression_models()
    base_dict = regression(reg_dict, base_dict, X_train, X_test, y_train, location, '')

    # IDW on the test point for the features
    X_predicted = IDW_per_feature(X_train, X_test, numeric_features, location)

    # Regression using additional variables: the feature list must include the
    # numeric covariates. Passing `location` alone silently ignored them.
    reg_dict_additional = create_regression_models()
    base_dict = regression(reg_dict_additional, base_dict, X_train, X_predicted, y_train,
                           numeric_features + location, 'additional')

    # Update dictionary at test location
    prediction_dict[test_index[0]] = base_dict
finish = time.perf_counter()
print(f'{finish-start:.3f}')

NameError: name 'time' is not defined

In [None]:
def process_data(args):
    """Evaluate all models for one leave-one-out split (worker for ``Pool.imap*``).

    Parameters
    ----------
    args : tuple
        ``(i, (train_index, test_index), X, y, numeric_features, location,
        distance)``. ``i`` is accepted for compatibility but not used;
        ``train_index`` / ``test_index`` are positional row indices.

    Returns
    -------
    tuple
        ``(test_index[0], base_dict)`` where ``base_dict`` maps model labels
        to the prediction at the held-out row, plus ``'CSO'`` for the
        observed value.
    """
    _, (train_index, test_index), X, y, numeric_features, location, distance = args
    # Split indices are positional, so use .iloc rather than label-based .loc
    X_train_og, X_test_og = X.iloc[train_index], X.iloc[test_index]
    y_train_og, y_test_og = y.iloc[train_index], y.iloc[test_index]
    # Cut out the buffer here
    index_cut = return_index_point_outside_radius(X_train_og, X_test_og, distance, location)
    X_train_cut, y_train_cut = X_train_og.loc[index_cut], y_train_og.loc[index_cut]
    # Choose only the points that are in the same category as the test point
    X_train, y_train = return_same_class(X_train_cut, y_train_cut, X_test_og)
    y_test, X_test = y_test_og, X_test_og
    base_dict = {}
    # Observed value at the held-out row
    base_dict['CSO'] = y_test.values[0][0]
    # Baseline calculation
    base_dict = baseline(base_dict, X_train, y_train, X_test, location)

    # Regression only location, no additional variables
    reg_dict = create_regression_models()
    base_dict = regression(reg_dict, base_dict, X_train, X_test, y_train, location, '')

    # IDW on the test point for the features
    X_predicted = IDW_per_feature(X_train, X_test, numeric_features, location)

    # Regression using additional variables: include the numeric covariates in
    # the feature list. Passing `location` alone silently ignored them.
    reg_dict_additional = create_regression_models()
    base_dict = regression(reg_dict_additional, base_dict, X_train, X_predicted, y_train,
                           numeric_features + location, 'additional')

    # Hand the result back to the parent process, keyed by the test position
    return (test_index[0], base_dict)

In [None]:
# Parallel version of the leave-one-out evaluation: one task per split.
start = time.perf_counter()
prediction_dict = {}
if __name__ == '__main__':
    args = [(i, (train_index, test_index), X, y, numeric_features, location, distance)
            for i, (train_index, test_index) in enumerate(loo.split(X))]
    # Pool() with no argument sizes itself to the machine's CPU count, which
    # avoids the never-imported `cpu_count` name. The context manager makes
    # sure workers are cleaned up even if a task raises.
    # NOTE(review): under the 'spawn' start method, functions defined in a
    # notebook cell may not be importable by workers; confirm on the target
    # platform.
    with Pool() as pool:
        # Results arrive in arbitrary order; each is a (key, dict) pair
        prediction_dict = dict(tqdm(pool.imap_unordered(process_data, args), total=len(args)))
        pool.close()
        pool.join()
finish = time.perf_counter()
print(f'{finish-start:.3f}')

In [None]:
# One row per held-out grid cell, one column per model / reference value
pd.DataFrame(prediction_dict).transpose()