In [1]:
import csv
import random
from scipy import spatial
from scipy.stats import multivariate_normal
import matplotlib.pyplot as plt
from numpy import linspace
import simplekml
from pylab import *
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# Importing the notebook

In [2]:
import io, os, sys, types
from IPython import get_ipython
from nbformat import current
from IPython.core.interactiveshell import InteractiveShell


def find_notebook(fullname, path=None):
    """Locate the ``.ipynb`` file backing a module name.

    Only the last dotted component of ``fullname`` is used as the file name.
    Each directory in ``path`` (default: the current directory) is searched
    for ``<name>.ipynb`` first and then for the same file name with
    underscores replaced by spaces, so that ``import Notebook_Name`` can find
    ``Notebook Name.ipynb``.

    Returns the path of the first match, or None when no notebook is found.
    """
    name = fullname.rsplit('.', 1)[-1]
    if not path:
        path = ['']
    # The underscore -> space fallback applies to the file name only. Doing
    # the replacement on the joined path would also rewrite underscores in
    # the directory part and make notebooks in such directories unreachable.
    candidates = (name + ".ipynb", name.replace("_", " ") + ".ipynb")
    for d in path:
        for filename in candidates:
            nb_path = os.path.join(d, filename)
            if os.path.isfile(nb_path):
                return nb_path
    return None

class NotebookLoader(object):
    """Module loader that imports a Jupyter notebook by executing its code cells.

    The loader keeps a reference to the running InteractiveShell so that
    IPython-specific syntax in the cells can be transformed to plain Python
    before execution.
    """
    def __init__(self, path=None):
        self.shell = InteractiveShell.instance()
        self.path = path

    def load_module(self, fullname):
        """Execute the notebook backing ``fullname`` and return it as a module.

        Raises ImportError when no matching notebook file exists. Any
        exception raised by a cell propagates to the caller; in that case
        ``sys.modules`` is restored to its previous state so a half-initialized
        module is never left importable.
        """
        path = find_notebook(fullname, self.path)
        if not path:
            raise ImportError("No notebook found for module %r" % fullname)

        print ("importing Jupyter notebook from %s" % path)

        with io.open(path, 'r', encoding='utf-8') as f:
            nb = current.read(f, 'json')

        mod = types.ModuleType(fullname)
        mod.__file__ = path
        mod.__loader__ = self
        mod.__dict__['get_ipython'] = get_ipython

        # Remember what (if anything) was registered under this name so a
        # failed load can be rolled back.
        not_loaded = object()
        previous = sys.modules.get(fullname, not_loaded)
        sys.modules[fullname] = mod

        # Point the shell's user namespace at the module while its cells run,
        # so magics executed inside the notebook act on the module's globals.
        save_user_ns = self.shell.user_ns
        self.shell.user_ns = mod.__dict__

        try:
            # Only the first worksheet is executed, and only its python code
            # cells (this relies on the worksheet-based notebook layout:
            # nb.worksheets[0], cell.language, cell.input).
            for cell in nb.worksheets[0].cells:
                if cell.cell_type == 'code' and cell.language == 'python':
                    code = self.shell.input_transformer_manager.transform_cell(cell.input)
                    exec(code, mod.__dict__)
        except BaseException:
            if previous is not_loaded:
                sys.modules.pop(fullname, None)
            else:
                sys.modules[fullname] = previous
            raise
        finally:
            self.shell.user_ns = save_user_ns
        return mod


class NotebookFinder(object):
    """Finder that hands out a NotebookLoader for modules backed by notebooks.

    Loaders are cached per search path, so each distinct path gets one loader.
    """
    def __init__(self):
        # Maps a search-path key (the falsy path itself, or its entries joined
        # with the OS path separator) to the loader created for that path.
        self.loaders = {}

    def find_module(self, fullname, path=None):
        """Return a loader for ``fullname``, or None when no notebook matches."""
        if not find_notebook(fullname, path):
            return None

        # A non-empty path is flattened to a single string to serve as dict key.
        cache_key = os.path.sep.join(path) if path else path
        if cache_key not in self.loaders:
            self.loaders[cache_key] = NotebookLoader(path)
        return self.loaders[cache_key]

# Register the finder with the import system so that later `import`
# statements can resolve notebooks. Note that every re-run of this cell
# appends one more NotebookFinder instance to sys.meta_path.
sys.meta_path.append(NotebookFinder())

import PierreMegret_Predictor

importing Jupyter notebook from PierreMegret_Predictor.ipynb


In [3]:
from PierreMegret_Predictor import PierrePredictor

# Importing the data set

In [4]:
# Training set: the header row is skipped; every column but the last goes
# into X and the last column is kept as a 2-D column in Y.
data = np.genfromtxt('trn_data.csv', skip_header=1, delimiter=',')
X = data[:, :-1]
Y = data[:, -1:]

In [5]:
# Build the predictor from the full training array (X and Y columns together).
predictor=PierrePredictor(data)

# Cross - validation

#### Creation of the k - fold.

In [6]:
# Inspect the predictor's n_dim attribute (414 here); presumably the number
# of training rows -- TODO confirm against PierrePredictor.
predictor.n_dim

414

In [8]:
# Size of each of the 9 folds used below. `/` is integer division here
# (Python 2), hence the displayed 46.
414 / 9

46

In [9]:
# Shuffle the rows once, then cut them into 9 folds.
# A permuted copy is used instead of np.random.shuffle(data): X and Y are
# slices (views) of `data`, so an in-place shuffle would silently reorder
# them as well and give a different result on every re-run of this cell.
# The fixed seed makes the folds reproducible.
rng = np.random.RandomState(0)
data_shuffled = data[rng.permutation(data.shape[0])]
X_random, Y_random = data_shuffled[:, :-1], data_shuffled[:, -1:]
X_folds = np.array_split(X_random, 9)
Y_folds = np.array_split(Y_random, 9)

#### Definition of the RMSE function.

In [10]:
def rmse(predictions, targets):
    """Return the root-mean-square error between predictions and targets.

    Both arguments may be arrays or plain sequences of numbers. When they
    hold the same number of elements but differ in shape (for example an
    (n, 1) column against an (n,) vector), the predictions are reshaped to
    the targets' shape so the values are compared pairwise instead of being
    broadcast into an n x n matrix.
    """
    predictions = np.asarray(predictions, dtype=float)
    targets = np.asarray(targets, dtype=float)
    if predictions.shape != targets.shape and predictions.size == targets.size:
        predictions = predictions.reshape(targets.shape)
    return np.sqrt(((predictions - targets) ** 2).mean())

## Cross - Validation on both h & sigma, using fsim

In [11]:
# k-fold grid search over (h, sigma).
# For every fold, each (h, sigma) pair is scored by the RMSE of
# predictor.mean on the held-out fold; the best pair of each fold is kept
# and the per-fold winners are averaged at the end.
h_grid = np.arange(0.05,3,0.05)
sigma_grid = np.arange(0.05,1,0.05)
n_folds = len(X_folds)

minimum_h=[]
minimum_sigma=[]

for i in range(n_folds):
    # Fold i is held out; the remaining folds are stacked into the training set.
    X_test_cv  = X_folds[i]
    X_train_cv = np.concatenate([fold for j, fold in enumerate(X_folds) if j != i])

    Y_test_cv  = Y_folds[i]
    Y_train_cv = np.concatenate([fold for j, fold in enumerate(Y_folds) if j != i])

    # Scores are collected per fold, so nothing leaks from one fold to the next.
    liste_h_sigma=[]
    liste_rmse=[]

    for h in h_grid:
        for sigma in sigma_grid:
            mean_cv = predictor.mean(h,sigma,X_train_cv,X_test_cv,Y_train_cv)

            liste_h_sigma.append([h,sigma])
            liste_rmse.append(rmse(mean_cv,Y_test_cv))

    # First pair reaching the lowest RMSE on this fold.
    every_minimum=liste_h_sigma[int(np.argmin(liste_rmse))]

    minimum_h.append(every_minimum[0])
    minimum_sigma.append(every_minimum[1])

best_h=sum(minimum_h)/float(len(minimum_h))
best_sigma = sum(minimum_sigma)/float(len(minimum_sigma))

# Formatted through %s so the line prints the same under Python 2 and 3.
print("%s %s" % (best_h, best_sigma))

0.733333333333 0.233333333333


#### h and sigma are hard-coded below because computing them by cross-validation takes a very long time.

In [12]:
# Values printed by the cross-validation cell above, copied here so the
# grid search does not have to be re-run.
h_chosen_fixed = 0.733333333333
sigma_chosen_fixed = 0.233333333333

## Importing the test file

In [13]:
# Load the test locations (header row skipped), then print the array's shape
# followed by the array itself.
test = np.genfromtxt('tst_locations.csv', delimiter=',', skip_header=1)
print test.shape, test

(413L, 2L) [[  41.988 -123.72 ]
 [  41.883 -124.13 ]
 [  41.833 -123.83 ]
 [  41.999 -121.7  ]
 [  41.928 -122.44 ]
 [  41.858 -123.35 ]
 [  41.687 -122.6  ]
 [  41.333 -123.5  ]
 [  41.479 -122.45 ]
 [  41.6   -122.93 ]
 [  41.272 -122.72 ]
 [  41.302 -122.53 ]
 [  41.112 -122.7  ]
 [  41.089 -122.71 ]
 [  41.083 -122.72 ]
 [  41.05  -123.67 ]
 [  40.825 -122.66 ]
 [  40.733 -123.2  ]
 [  40.727 -122.79 ]
 [  40.69  -122.83 ]
 [  40.679 -122.83 ]
 [  40.625 -122.91 ]
 [  40.5   -123.33 ]
 [  40.906 -123.82 ]
 [  40.463 -123.52 ]
 [  40.319 -123.37 ]
 [  40.218 -123.63 ]
 [  40.182 -123.78 ]
 [  40.138 -123.82 ]
 [  39.833 -123.08 ]
 [  39.725 -122.85 ]
 [  39.667 -123.32 ]
 [  39.408 -122.96 ]
 [  40.258 -124.25 ]
 [  39.367 -123.13 ]
 [  39.351 -123.32 ]
 [  39.197 -123.19 ]
 [  38.905 -123.23 ]
 [  38.727 -122.84 ]
 [  38.716 -123.   ]
 [  38.479 -122.71 ]
 [  38.25  -122.96 ]
 [  38.083 -122.95 ]
 [  38.317 -122.63 ]
 [  38.233 -122.64 ]
 [  38.492 -122.53 ]
 [  38.035 -122.27 ]
 [

# Prediction with m(f) & export as a csv file

In [14]:
# Predictive mean at the test locations, trained on the full data set #

mean = predictor.mean(h_chosen_fixed,sigma_chosen_fixed,X,test,Y)

# Export as a csv file for the kaggle competition #

# The `with` block closes the file even if writing a row fails.
with open('Predicitve_mean_m(f).csv', 'w') as t:
    open_file_object = csv.writer(t)

    open_file_object.writerow(['id','mm'])

    # ids are 1-based; int() truncates each prediction toward zero.
    # mean holds one single-value row per test location, hence mean[i, 0].
    for i in range(test.shape[0]):
        open_file_object.writerow([i+1,int(mean[i, 0])])

In [15]:
# Display the predictive mean computed above (one row per test location).
mean

array([[  57.75322463],
       [  27.27602063],
       [  47.93745074],
       [  67.14426592],
       [  86.59624044],
       [  78.95999866],
       [  93.11361458],
       [  67.50444382],
       [  90.6914733 ],
       [  91.98140242],
       [  87.85163659],
       [  87.04851913],
       [  82.93257202],
       [  82.37232658],
       [  82.35079259],
       [  59.78270407],
       [  72.96675428],
       [  83.56216589],
       [  75.16117203],
       [  76.07292926],
       [  75.913318  ],
       [  78.36430647],
       [  83.21371865],
       [  53.34282393],
       [  76.81760882],
       [  81.62647052],
       [  67.87851728],
       [  57.24762681],
       [  53.19573472],
       [  76.9616605 ],
       [  71.68425478],
       [  66.04639102],
       [  69.60076002],
       [  31.03622187],
       [  64.19365288],
       [  53.65417187],
       [  55.32698962],
       [  41.21141002],
       [  53.62142195],
       [  46.12674657],
       [  47.82478646],
       [  28.503

# Simulation

In [16]:
# Load the simulation grid (read without skipping a header row), then print
# the array's shape followed by the array itself.
grid = np.genfromtxt('grid.csv', delimiter=',')
print grid.shape, grid

(2500L, 2L) [[  38.5   -120.8  ]
 [  38.516 -120.8  ]
 [  38.533 -120.8  ]
 ..., 
 [  39.267 -119.8  ]
 [  39.284 -119.8  ]
 [  39.3   -119.8  ]]


In [17]:
# Evaluate predictor.f_sim on the grid points with the fixed h and sigma,
# using the full training set; the trailing expression displays the result.
fsim = predictor.f_sim(h_chosen_fixed,sigma_chosen_fixed,X,grid,Y)
fsim

array([[ 56.22864278],
       [ 56.76344923],
       [ 57.2534191 ],
       ..., 
       [ 96.80418184],
       [ 96.28927477],
       [ 95.81428471]])

#### fsim is saved to disk so that it can be reused later without recomputing everything.

In [18]:
# Write fsim to fsim_output.csv, one row per grid point.
d=fsim.tolist()

# The `with` block closes the file even if writing a row fails.
with open('fsim_output.csv', 'w') as fsim_csv:
    open_file_object = csv.writer(fsim_csv)

    for i in range(grid.shape[0]):
        open_file_object.writerow(d[i])

# Visualization

In [6]:
# Reload fsim from the csv file written by the simulation section #

fsim_2500 = np.genfromtxt('fsim_output.csv')

In [7]:
# Sanity check: the reloaded values come back as a flat array (shape (2500,)).
print fsim_2500.shape, fsim_2500

(2500L,) [ 56.22864278  56.76344923  57.2534191  ...,  96.80418184  96.28927477
  95.81428471]


In [8]:
# Plot the reloaded simulation. NOTE(review): 50 is presumably the grid side
# length (50 x 50 = 2500 values) and the list the bounds of the grid
# coordinates -- confirm against PierrePredictor.visualization.
predictor.visualization(fsim_2500,50,[38.5,39.3,-119.8,-120.8])