<h1>DATA LOADING

In [1]:
import pandas as pd
import numpy as np

In [2]:
import os
os.listdir('../../data')
assert 'out_breed.csv' in os.listdir('../../data') # this assert breaks if the data is configured uncorrectly

In [3]:
breeds = pd.read_csv('../../data/out_breed.csv')
colors = pd.read_csv('../../data/out_color.csv')
states = pd.read_csv('../../data/out_state.csv')
train  = pd.read_csv('../../data/out_train.csv')
test   = pd.read_csv('../../data/out_test.csv')
sub    = pd.read_csv('../../data/out_submission.csv')

In [4]:
# cats dogs
dogs = train[train['Type'] == 1].drop('Type',axis=1)
cats = train[train['Type'] == 2].drop('Type',axis=1)

In [5]:
import sys
sys.path.append('../')

<h1>NB MODEL</h1>

In [6]:
from NAIVE_BAYES.ensembleNaiveBayes import PredictiveModel as naiveBayesEnsemblePredictiveModel

In [9]:
string_cols = ["Unnamed: 0", "dataset_type", "Name", "RescuerID", "Description", "PetID"]
categorical_col = ["Gender","Vaccinated","Dewormed","Sterilized","Breed1","Breed2","Color1","Color2","Color3","State"]
numerical_col = [col for col in cats.columns if col not in string_cols and col not in categorical_col and col != "AdoptionSpeed"]
mapping_sizes = [2, 3, 3, 3, 307, 307, 7, 7, 7, 15]
    
def nb_prepare_data(population):
    """
    this prepare data to be fed inside a NB
    Args: subset of (rows) of train
    Return: X,Y
    """
    Y = population["AdoptionSpeed"]
    X = population.drop(string_cols, axis=1)
    if 'Type' in X.columns: X = X.drop('Type', axis=1)
    X = pd.concat([X[numerical_col], X[categorical_col]], axis=1)
    assert len(Y) == len(X)
    X = X.reset_index().drop('index',axis=1)
    Y = Y.reset_index().drop('index',axis=1)['AdoptionSpeed']
    return X,Y

def nb_run(population):
    """
    this validate NB on given population
    Args: subset of (rows) of train
    Return: score
    """
    X, Y = nb_prepare_data(population)
    model = naiveBayesEnsemblePredictiveModel("nb_ensemble")
    return model.validation(X, Y, mapping_sizes,method=2,verbose=False)

In [10]:
nb_cats_score, nb_dogs_score, nb_score = nb_run(cats), nb_run(dogs), nb_run(train)

Tue Mar 12 17:40:53 2019 [population ensemble.__init__] initialized succesfully
Tue Mar 12 17:40:53 2019 [base-gaussianNB.__init__] initialized succesfully
Tue Mar 12 17:40:53 2019 [base-multinomialNB-Gender.__init__] initialized succesfully
Tue Mar 12 17:40:53 2019 [base-multinomialNB-Vaccinated.__init__] initialized succesfully
Tue Mar 12 17:40:53 2019 [base-multinomialNB-Dewormed.__init__] initialized succesfully
Tue Mar 12 17:40:53 2019 [base-multinomialNB-Sterilized.__init__] initialized succesfully
Tue Mar 12 17:40:53 2019 [base-multinomialNB-Breed1.__init__] initialized succesfully
Tue Mar 12 17:40:54 2019 [base-multinomialNB-Breed2.__init__] initialized succesfully
Tue Mar 12 17:40:54 2019 [base-multinomialNB-Color1.__init__] initialized succesfully
Tue Mar 12 17:40:54 2019 [base-multinomialNB-Color2.__init__] initialized succesfully
Tue Mar 12 17:40:54 2019 [base-multinomialNB-Color3.__init__] initialized succesfully
Tue Mar 12 17:40:54 2019 [base-multinomialNB-State.__init__]

Tue Mar 12 17:40:55 2019 [base-multinomialNB-Breed2.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Color1.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Color2.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Color3.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-State.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-gaussianNB.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Gender.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Vaccinated.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Dewormed.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Sterilized.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Breed1.__init__] initialized succesfully
Tue Mar 12 17:40:55 2019 [base-multinomialNB-Breed2.__i

In [11]:
nb_cats_score, nb_dogs_score, nb_score

(0.10249266591170247, 0.07302337571953729, 0.09725712493547019)

<h1>KNN MODEL

In [12]:
from KNN.knnModel import PredictiveModel as knnPredictiveModel

In [13]:
def knn_prepare_data(population):
    """
    this prepare data to be fed inside a KNN
    Args: subset of (rows) of train
    Return: X,Y
    """
    to_drop = ["Unnamed: 0", "dataset_type", "Name", "RescuerID", "Description","PetID"]

    X = population.drop(["AdoptionSpeed"] + to_drop , axis=1)
    if 'Type' in X.columns: X = X.drop('Type', axis=1)
    Y = population['AdoptionSpeed']
    X = X.reset_index().drop('index',axis=1)
    Y = Y.reset_index().drop('index',axis=1)['AdoptionSpeed']
    return X,Y

def knn_run(population):
    """
    this validate KNN on given population
    Args: subset of (rows) of train
    Return: score
    """
    X,Y = knn_prepare_data(population)
    model = knnPredictiveModel("knn", 100)
    return model.validation(X, Y, method=2, verbose=False)

In [14]:
knn_cats_score, knn_dogs_score, knn_score = knn_run(cats),  knn_run(dogs), knn_run(train)

Tue Mar 12 17:41:10 2019 [population.__init__] initialized succesfully
Tue Mar 12 17:41:11 2019 [population.__init__] initialized succesfully
Tue Mar 12 17:41:12 2019 [population.__init__] initialized succesfully


In [15]:
knn_cats_score, knn_dogs_score, knn_score

(0.09096981002714544, 0.1613138501198026, 0.13284790158193424)

<h1>Prediction analysis

In [19]:
split = int(len(train)*0.8)
train_X, train_Y = train[:split],   train[:split]['AdoptionSpeed']
test_X,  test_Y  = train[split+1:], train[split+1:]['AdoptionSpeed']

In [41]:
#knn
X, Y = knn_prepare_data(train_X)
model = knnPredictiveModel("knn_proba", 100)
model.train(X, Y)
test, labels = knn_prepare_data(test_X)
knn_proba = model.predict(test, probability=True)
knn_labels = model.predict(test, probability=False)

Tue Mar 12 17:52:44 2019 [knn_proba.__init__] initialized succesfully


In [42]:
#NB
X, Y = nb_prepare_data(train_X)
model = naiveBayesEnsemblePredictiveModel("nb_proba")
model.train(X, Y,mapping_sizes,verbose=False)
test, labels = nb_prepare_data(test_X)
nb_proba = model.predict(test, probability=True)
nb_labels = model.predict(test, probability=False)

Tue Mar 12 17:52:45 2019 [nb_proba.__init__] initialized succesfully
Tue Mar 12 17:52:45 2019 [base-gaussianNB.__init__] initialized succesfully
Tue Mar 12 17:52:45 2019 [base-multinomialNB-Gender.__init__] initialized succesfully
Tue Mar 12 17:52:45 2019 [base-multinomialNB-Vaccinated.__init__] initialized succesfully
Tue Mar 12 17:52:45 2019 [base-multinomialNB-Dewormed.__init__] initialized succesfully
Tue Mar 12 17:52:45 2019 [base-multinomialNB-Sterilized.__init__] initialized succesfully
Tue Mar 12 17:52:45 2019 [base-multinomialNB-Breed1.__init__] initialized succesfully
Tue Mar 12 17:52:46 2019 [base-multinomialNB-Breed2.__init__] initialized succesfully
Tue Mar 12 17:52:46 2019 [base-multinomialNB-Color1.__init__] initialized succesfully
Tue Mar 12 17:52:46 2019 [base-multinomialNB-Color2.__init__] initialized succesfully
Tue Mar 12 17:52:46 2019 [base-multinomialNB-Color3.__init__] initialized succesfully
Tue Mar 12 17:52:46 2019 [base-multinomialNB-State.__init__] initialize

In [43]:
knn_labels

array([4, 1, 2, ..., 2, 2, 3])

In [44]:
nb_labels

array([2, 2, 2, ..., 4, 4, 2])

In [54]:
np.bincount(nb_labels == knn_labels)

array([1682, 1316])