# EY Challenge 

My plan is to convert all of the categorical variables to numeric variables and see whether a logistic regression or
another classification model can predict the frog species.

In [None]:
# load in data
# Other
import pandas as pd
import matplotlib.pyplot as plt
import os
import gc
import zipfile
from sklearn.model_selection import train_test_split
import sklearn.linear_model
from sklearn.metrics import confusion_matrix
import numpy as np


# Folder to store extracted files
# NOTE(review): not referenced by any of the cells visible below — confirm it is still needed.
storage_path = './files/output/'

# Path to data folder with provided material (GBIF_training_data.zip);
# the archive is extracted into a training_data/ sub-folder of this path.
data_path = './'

In [None]:
# Extract the GBIF training archive once.
# The guard checks for the extracted occurrence file rather than for the folder, so an
# interrupted or failed extraction (which leaves an empty folder behind) is retried on
# the next run instead of being skipped forever.
training_dir = os.path.join(data_path, 'training_data')
if not os.path.exists(os.path.join(training_dir, 'occurrence.txt')):
    # makedirs tolerates a folder left over from a previous, incomplete run
    os.makedirs(training_dir, exist_ok=True)
    with zipfile.ZipFile(os.path.join(data_path, 'GBIF_training_data.zip'), 'r') as zip_ref:
        zip_ref.extractall(training_dir)
        
def filter_bbox(frogs, bbox):
    """Keep only the occurrences located inside a bounding box.

    Args:
        frogs: DataFrame with ``decimalLongitude`` and ``decimalLatitude`` columns.
        bbox: Sequence read as (min longitude, min latitude, max longitude,
            max latitude); all four bounds are inclusive.

    Returns:
        The rows of ``frogs`` whose coordinates fall within ``bbox``.
    """
    min_lon, min_lat, max_lon, max_lat = bbox[0], bbox[1], bbox[2], bbox[3]
    lon_ok = (frogs.decimalLongitude >= min_lon) & (frogs.decimalLongitude <= max_lon)
    lat_ok = (frogs.decimalLatitude >= min_lat) & (frogs.decimalLatitude <= max_lat)
    return frogs[lon_ok & lat_ok]

def get_frogs(file, year_range=None, bbox=None):
    """Load frog occurrences from a GBIF occurrence file.

    Args:
        file: Path to the tab-separated GBIF occurrence file to read.
        year_range: Optional ``(first_year, last_year)`` pair; when given, only
            occurrences whose ``eventDate`` year lies in that inclusive range are kept.
        bbox: Optional bounding box handed to ``filter_bbox``; when given, only
            occurrences inside the box are kept.

    Returns:
        DataFrame with the columns ``gbifID``, ``eventDate``, ``country``,
        ``continent``, ``stateProvince``, ``decimalLatitude``,
        ``decimalLongitude`` and ``species`` (species names are title-cased).
    """
    columns = [
        'gbifID','eventDate','country','continent','stateProvince',
        'decimalLatitude','decimalLongitude','species'
    ]
    # Lookups keyed by the two-letter countryCode of each record; a missing code
    # (NaN) is labelled explicitly, any other unlisted code maps to NaN.
    country_names = {
        'AU':'Australia', 'CR':'Costa Rica', 'ZA':'South Africa','MX':'Mexico','HN':'Honduras',
        'MZ':'Mozambique','BW':'Botswana','MW':'Malawi','CO':'Colombia','PA':'Panama','NI':'Nicaragua',
        'BZ':'Belize','ZW':'Zimbabwe','SZ':'Eswatini','ZM':'Zambia','GT':'Guatemala','LS':'Lesotho',
        'SV':'El Salvador', 'AO':'Angola', np.nan:'unknown or invalid'
    }
    continent_names = {
        'AU':'Australia', 'CR':'Central America', 'ZA':'Africa','MX':'Central America','HN':'Central America',
        'MZ':'Africa','BW':'Africa','MW':'Africa','CO':'Central America','PA':'Central America',
        'NI':'Central America','BZ':'Central America','ZW':'Africa','SZ':'Africa','ZM':'Africa',
        'GT':'Central America','LS':'Africa','SV':'Central America','AO':'Africa', np.nan:'unknown or invalid' 
    }
    frogs = (
        # Read the file the caller asked for (previously the argument was ignored
        # and a path built from the global data_path was always read instead).
        pd.read_csv(file, sep='\t', parse_dates=['eventDate'])
        .assign(
            country =  lambda x: x.countryCode.map(country_names),
            continent =  lambda x: x.countryCode.map(continent_names),
            species = lambda x: x.species.str.title()
        )
        [columns]
    )
    if year_range is not None:
        event_year = frogs.eventDate.dt.year
        frogs = frogs[(event_year >= year_range[0]) & (event_year <= year_range[1])]
    if bbox is not None:
        frogs = filter_bbox(frogs, bbox)
    return frogs

In [None]:
# Load every frog occurrence from the extracted GBIF file into one dataframe.
all_frog_data = get_frogs(data_path+'/training_data/occurrence.txt')
# NOTE(review): only the last expression of a cell is rendered, so this random
# sample is computed but never displayed.
all_frog_data.sample(10, random_state=420)
all_frog_data.head()

In [None]:
# Inspect the dtype of every column to see which ones are text (object) and
# therefore need converting before modelling.
all_frog_data.dtypes

In [None]:
# Number of missing values in each column of the full dataset.
print(all_frog_data.isna().sum())

In [None]:
# Store the calendar year of each occurrence's eventDate in its own column.
# NOTE(review): 'year' is not among the numeric features selected for the model
# further down — confirm whether it should be.
all_frog_data['year'] = pd.DatetimeIndex(all_frog_data['eventDate']).year

In [None]:
# Keep only the three countries of interest.
# `isin` matches the country names exactly and treats a missing country (a
# countryCode that is absent from the lookup in get_frogs maps to NaN) as
# "not selected"; a `str.contains` mask holds NaN for those rows, and a mask
# containing NaN cannot be used to select rows.
COUNTRIES_OF_INTEREST = ['Australia', 'Costa Rica', 'South Africa']
frog_data = all_frog_data[all_frog_data['country'].isin(COUNTRIES_OF_INTEREST)]

# The record id and the raw event date are not used as model features.
frog_data = frog_data.drop(columns=['gbifID', 'eventDate'])

In [None]:
# Preview the first rows of the filtered dataset.
frog_data.head(n = 5)

In [None]:
# List the distinct country names left after filtering
# (sanity check that only the selected countries remain).
print(frog_data['country'].unique())

After loading all of the packages and the data, we check how many values are missing from the dataset.

In [None]:
# Number of missing values in each column of the filtered dataset,
# followed by one blank line.
print(frog_data.isna().sum(), end='\n\n')

We are going to change the categorical data into numeric data for the analysis.

In [None]:
# Encode the country as an integer feature.
# One vectorized lookup replaces the row-by-row `iterrows` loop: the codes are the
# same, but the column is processed in a single pass instead of several `.loc`
# calls per row.
COUNTRY_CODES = {'Australia': 1, 'Costa Rica': 2, 'South Africa': 3}

# Countries outside the lookup (including missing values) get the code 0.
frog_data['country_numeric'] = (
    frog_data['country'].map(COUNTRY_CODES).fillna(0).astype('int64')
)

print(frog_data[['country','country_numeric']].head())

In [None]:
# # printing each continent
# print(frog_data['continent'].unique())

In [None]:
# # create numeric features for variables
# # continent
# frog_data['continent_numeric'] = 0

# for index, value in frog_data.iterrows() :
    
#     if frog_data.loc[index, 'continent'] == 'Australia':
#         frog_data.loc[index, 'continent_numeric'] = 1
    
#     elif frog_data.loc[index, 'continent'] == 'Central America':
#         frog_data.loc[index, 'continent_numeric'] = 2
    
#     elif frog_data.loc[index, 'continent'] == 'Africa':
#         frog_data.loc[index, 'continent_numeric'] = 3
        
#     else :
#         frog_data.loc[index, 'continent_numeric'] = 0

# print(frog_data[['continent','continent_numeric']].head())

The continent encoding above is commented out to avoid errors later on.

In [None]:
# List the distinct state/province spellings present in the data; these are the
# values the numeric encoding has to cover.
print(frog_data['stateProvince'].unique())

In [None]:
# List the distinct species names present in the data; these are the classes
# the models will predict.
print(frog_data['species'].unique())

In [None]:
# Encode the state/province as an integer feature.
# One vectorized lookup replaces the row-by-row `iterrows` loop and its long
# if/elif chain; the codes are unchanged so they stay consistent with the cell
# that converts them back to text.
# NOTE(review): different spellings of what looks like the same province
# (e.g. 'Limón', 'Limon', 'Limon Province', 'Limon Prov.') carry different codes,
# and code 15 is not assigned — confirm whether the spellings should be merged.
STATE_PROVINCE_CODES = {
    'New South Wales': 1,
    'Puntarenas': 2,
    'Heredia': 3,
    'Queensland': 4,
    'Limón': 5,
    'South Australia': 6,
    'Gauteng': 7,
    'Western Cape': 8,
    'Tasmania': 9,
    'Alajuela': 10,
    'KwaZulu-Natal': 11,
    'Northern Territory': 12,
    'Australian Capital Territory': 13,
    'Western Australia': 14,
    'Free State': 16,
    'Limon Province': 17,
    'Cape Province': 18,
    'Free State Province': 19,
    'KwaZulu-Natal Province': 20,
    'Limon': 21,
    'Limpopo Province': 22,
    'Limpopo': 23,
    'Eastern Cape': 24,
    'Mpumalanga': 25,
    'Cartago': 26,
    'San José': 27,
    'Guanacaste': 28,
    'North West': 29,
    'Northern Cape': 30,
    'San Jose': 31,
    'Capitol Territory': 32,
    'Cartago Province': 33,
    'Puntarenas Province': 34,
    'Eastern Cape Province': 35,
    'Transvaal': 36,
    'Cape': 37,
    'Jervis Bay Territory': 38,
    'Gauteng Province': 39,
    'Alajuela Province': 40,
    'Heredia Province': 41,
    'Transvaal Province': 42,
    'New south wales': 43,
    'Western australia': 44,
    'Provincia de Puntarenas': 45,
    'Cape Prov.': 46,
    'Natal Prov.': 47,
    'Limon Prov.': 48,
    'Transvaal Prov.': 49,
}

# Values outside the lookup (including missing values) get the code 0.
frog_data['stateProvince_numeric'] = (
    frog_data['stateProvince'].map(STATE_PROVINCE_CODES).fillna(0).astype('int64')
)

print(frog_data[['stateProvince','stateProvince_numeric']].head())


In [None]:
# Encode the species (the prediction target) as an integer.
# One vectorized lookup replaces the row-by-row `iterrows` loop; the codes are
# unchanged so they stay consistent with the cell that converts them back to text.
SPECIES_CODES = {
    'Litoria Fallax': 1,
    'Agalychnis Callidryas': 2,
    'Dendrobates Auratus': 3,
    'Crinia Signifera': 4,
    'Xenopus Laevis': 5,
    'Chiromantis Xerampelina': 6,
    'Ranoidea Australis': 7,
    'Austrochaperina Pluvialis': 8,
    'Crinia Glauerti': 9,
}

# Species outside the lookup (including missing values) get the code 0.
frog_data['species_numeric'] = (
    frog_data['species'].map(SPECIES_CODES).fillna(0).astype('int64')
)

print(frog_data[['species','species_numeric']].head())

In [None]:
# counting how many year values are missing
# all_frog_data['year'].isna().sum()

In [None]:
# filling the missing values from year variable
# all_frog_data['year'].fillna(all_frog_data['year'].mode()[0], inplace=True)

# all_frog_data['year'].isna().sum()

In [None]:
# Preview the numeric columns ('continent_numeric' was removed on purpose).
numeric_preview_columns = [
    'decimalLatitude',
    'decimalLongitude',
    'country_numeric',
    'stateProvince_numeric',
    'species_numeric',
]
frog_data[numeric_preview_columns].head()

After changing the categorical variables into numeric variables, we build a separate data frame with just the numeric values.

In [None]:
# Separate the numeric columns into their own dataframe.
# `.copy()` makes this an independent frame, so the columns that later cells add
# to it do not trigger pandas' SettingWithCopyWarning.
frog_data_numeric = frog_data[['decimalLatitude', 'decimalLongitude','country_numeric', # 'continent_numeric' *removed*
                               'stateProvince_numeric', 'species_numeric']].copy()
frog_data_numeric.head()

In [None]:
# Confirm that every column kept for modelling has a numeric dtype.
frog_data_numeric.dtypes

In [None]:
# #changing year column to integer for analysis purposes
# frog_data_numeric['year'] = frog_data_numeric['year'].values.astype(np.int64)

# # see the types of data
# frog_data_numeric.dtypes

Next, we are getting the data set up as X and Y variables and creating training and testing sets.

In [None]:
# Predictor and response variables.
# The feature columns are listed explicitly instead of "everything except the
# target": later cells add text columns to frog_data_numeric, and re-running this
# cell afterwards would otherwise pull those columns into x.
FEATURE_COLUMNS = ['decimalLatitude', 'decimalLongitude', 'country_numeric', 'stateProvince_numeric']
x = frog_data_numeric[FEATURE_COLUMNS]

y = frog_data_numeric['species_numeric']

In [None]:
# Split the data into training (75%) and testing (25%) sets.
# A fixed random_state makes the split, and every score computed from it,
# reproducible when the notebook is re-run.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.25, random_state=420)


In [None]:
# running a logistic regression
model_LR = 'Logistic Regression'

# start model
logreg = sklearn.linear_model.LogisticRegression()

# fitting the model to the training data
logreg_fit = logreg.fit(x_train, y_train)

# predicting the outputs
logreg_predict = logreg.predict(x_test)

# Score the results. For a classifier, `score` is the mean accuracy (share of
# correctly predicted species), not R-squared.
# The built-in round() is used because it works whether `score` returns a NumPy
# float or a plain Python float (which has no `.round` method).
logreg_train_score = round(logreg.score(x_train, y_train), 4)
logreg_test_score  = round(logreg.score(x_test, y_test), 4)
logreg_gap         = round(abs(logreg_train_score - logreg_test_score), 4)

# displaying results
print('Model Name     :', model_LR)
print('Training Score :', logreg_train_score)
print('Testing Score  :', logreg_test_score)
print('Train-Test Gap :', logreg_gap)

In [None]:
# Confusion matrix of the logistic regression predictions on the test set:
# rows are the actual species codes, columns the predicted ones.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=logreg_predict)

print(conf_matrix)

In [None]:
# create a heatmap for better visuals
import seaborn as sns

# Draw the confusion matrix as an annotated heatmap on a larger figure.
fig, ax = plt.subplots(figsize=(15,10))

# fmt='d' prints the cell counts as plain integers (the default format switches
# large counts to scientific notation); ax=ax draws on the axes created above
# instead of relying on whichever axes happen to be current.
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            linecolor='black', linewidths=0.5, ax=ax)

ax.set_title('Frogs Confusion Matrix\n')
ax.set_xlabel('\nPredicted Values')
ax.set_ylabel('Actual Values ')

# Display the visualization of the confusion matrix.
plt.show()

After trying to predict the species with logistic regression, I am going to check whether scaling the features helps and then try a KNN classifier model to see if it is more accurate.

In [None]:
# scaling the data
# import packages
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Standardize the features and fit a separate logistic regression on them.
# A fresh estimator is used because a pipeline fits the estimator object it is
# given in place; reusing `logreg` would silently overwrite the unscaled model
# fitted above, so re-running the earlier cells would no longer match.
scaler = StandardScaler()
pipeline = make_pipeline(scaler, sklearn.linear_model.LogisticRegression())

# fit and score (mean accuracy) the scaled data
pipeline.fit(x_train, y_train)
score = pipeline.score(x_test, y_test)
# built-in round() works for both NumPy and plain Python floats
round(score, 4)

In [None]:
# load packages for KNN classifier 
from sklearn.neighbors import KNeighborsClassifier

# create the KNN classifier
knn = KNeighborsClassifier(n_neighbors = 20)

# fit the classifier to the training data
knn.fit(x_train, y_train)

# print the test accuracy; built-in round() works whether `score` returns a
# NumPy float or a plain Python float (which has no `.round` method)
print(round(knn.score(x_test, y_test), 4))

In [None]:
# Sweep the number of neighbours from 1 to 24 and record, for each value,
# the accuracy on the training set and on the test set.
neighbors = np.arange(1, 25)
train_accuracy = np.empty(neighbors.size)
test_accuracy = np.empty(neighbors.size)

for position, n_neighbors in enumerate(neighbors):
    # a new classifier is fitted for every candidate number of neighbours
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)
    knn.fit(x_train, y_train)
    ypredictions = knn.predict(x_test)
    train_accuracy[position] = knn.score(x_train, y_train)
    test_accuracy[position] = knn.score(x_test, y_test)

In [None]:
# Average test accuracy over all the neighbour counts tried above.
mean_test_accuracy = sum(test_accuracy) / test_accuracy.size
print(mean_test_accuracy.round(decimals = 4))

The KNN classifier seems to give the highest test accuracy.

In [None]:
from sklearn.ensemble import RandomForestClassifier
%matplotlib inline

# Fit a random forest on the full feature set to rank the features.
# A fixed random_state makes the importances reproducible between runs.
model = RandomForestClassifier(random_state=420)
model.fit(x, y)

# Horizontal bar chart of the (up to) four most important features.
(pd.Series(model.feature_importances_, index=x.columns)
   .nlargest(4)
   .plot(kind='barh', title='Random forest feature importance'))

After finding the model that we want to use, we are converting the numeric values back to categorical values for mapping purposes.

In [None]:
# Empty placeholder frame; frog_data_categorical is reassigned further down once
# the numeric codes have been converted back to text.
frog_data_categorical = pd.DataFrame()

In [None]:
# <-- the data frame is created to store the 
# categorical variables once converted

In [None]:
# Convert the numeric country codes back to country names.
# The column is built in one pass from a lookup instead of being initialised as an
# integer column and then overwritten with strings row by row (slow, and pandas
# deprecates that kind of silent dtype change).
COUNTRY_NAMES = {1: 'Australia', 2: 'Costa Rica', 3: 'South Africa'}

# Codes outside the lookup keep the placeholder value 0, as before.
frog_data_numeric['country_categorical'] = [
    COUNTRY_NAMES.get(code, 0) for code in frog_data_numeric['country_numeric']
]

print(frog_data_numeric[['country_categorical','country_numeric']].head())

In [None]:
# create categorical features for variables
# continent
# frog_data_numeric['continent_categorical'] = 0

# for index, value in frog_data_numeric.iterrows() :
    
#     if frog_data_numeric.loc[index, 'continent_numeric'] == 1:
#         frog_data_numeric.loc[index, 'continent_categorical'] = 'Australia'
    
#     elif frog_data_numeric.loc[index, 'continent_numeric'] == 2:
#         frog_data_numeric.loc[index, 'continent_categorical'] = 'Central America'
    
#     elif frog_data_numeric.loc[index, 'continent_numeric'] == 3:
#         frog_data_numeric.loc[index, 'continent_categorical'] = 'Africa'
        
#     else :
#         frog_data_numeric.loc[index, 'continent_categorical'] = 0

# print(frog_data_numeric[['continent_numeric','continent_categorical']].head())

The conversion of continent back to text above is commented out to avoid future errors with the testing dataset.

In [None]:
# Convert the numeric state/province codes back to their names.
# The column is built in one pass from a lookup instead of being initialised as an
# integer column and then overwritten with strings row by row (slow, and pandas
# deprecates that kind of silent dtype change). The lookup is the exact inverse
# of the encoding used earlier; code 15 is not assigned.
STATE_PROVINCE_NAMES = {
    1: 'New South Wales',
    2: 'Puntarenas',
    3: 'Heredia',
    4: 'Queensland',
    5: 'Limón',
    6: 'South Australia',
    7: 'Gauteng',
    8: 'Western Cape',
    9: 'Tasmania',
    10: 'Alajuela',
    11: 'KwaZulu-Natal',
    12: 'Northern Territory',
    13: 'Australian Capital Territory',
    14: 'Western Australia',
    16: 'Free State',
    17: 'Limon Province',
    18: 'Cape Province',
    19: 'Free State Province',
    20: 'KwaZulu-Natal Province',
    21: 'Limon',
    22: 'Limpopo Province',
    23: 'Limpopo',
    24: 'Eastern Cape',
    25: 'Mpumalanga',
    26: 'Cartago',
    27: 'San José',
    28: 'Guanacaste',
    29: 'North West',
    30: 'Northern Cape',
    31: 'San Jose',
    32: 'Capitol Territory',
    33: 'Cartago Province',
    34: 'Puntarenas Province',
    35: 'Eastern Cape Province',
    36: 'Transvaal',
    37: 'Cape',
    38: 'Jervis Bay Territory',
    39: 'Gauteng Province',
    40: 'Alajuela Province',
    41: 'Heredia Province',
    42: 'Transvaal Province',
    43: 'New south wales',
    44: 'Western australia',
    45: 'Provincia de Puntarenas',
    46: 'Cape Prov.',
    47: 'Natal Prov.',
    48: 'Limon Prov.',
    49: 'Transvaal Prov.',
}

# Codes outside the lookup keep the placeholder value 0, as before.
frog_data_numeric['stateProvince_categorical'] = [
    STATE_PROVINCE_NAMES.get(code, 0) for code in frog_data_numeric['stateProvince_numeric']
]

print(frog_data_numeric[['stateProvince_categorical','stateProvince_numeric']].head())

In [None]:
# Convert the numeric species codes back to species names.
# The column is built in one pass from a lookup instead of being initialised as an
# integer column and then overwritten with strings row by row (slow, and pandas
# deprecates that kind of silent dtype change). The lookup is the exact inverse
# of the encoding used earlier.
SPECIES_NAMES = {
    1: 'Litoria Fallax',
    2: 'Agalychnis Callidryas',
    3: 'Dendrobates Auratus',
    4: 'Crinia Signifera',
    5: 'Xenopus Laevis',
    6: 'Chiromantis Xerampelina',
    7: 'Ranoidea Australis',
    8: 'Austrochaperina Pluvialis',
    9: 'Crinia Glauerti',
}

# Codes outside the lookup keep the placeholder value 0, as before.
frog_data_numeric['species_categorical'] = [
    SPECIES_NAMES.get(code, 0) for code in frog_data_numeric['species_numeric']
]

print(frog_data_numeric[['species_categorical','species_numeric']].head())

In [None]:
# Keep the coordinates plus the text versions of the encoded columns
# ('continent_categorical' was removed on purpose).
categorical_columns = [
    'decimalLatitude',
    'decimalLongitude',
    'country_categorical',
    'stateProvince_categorical',
    'species_categorical',
]
frog_data_categorical = frog_data_numeric[categorical_columns]
frog_data_categorical.head()

# Building Maps

In [None]:
# Export the categorical data to an Excel workbook in the working directory.
frog_data_categorical.to_excel(
    excel_writer="Categorical_Data_Challenge.xlsx",
    sheet_name='categorical data',
)

In [None]:
# Rows for Australia only.
# An exact comparison replaces `.str.startswith`: it also copes with the 0
# placeholder that country_categorical holds for unknown codes, for which the
# string accessor produces a NaN mask that cannot be used to select rows.
# (The empty placeholder frame created first was dropped; it was overwritten immediately.)
Australia = frog_data_categorical.loc[frog_data_categorical['country_categorical'] == 'Australia']

In [None]:
# Rows for South Africa only.
# An exact comparison replaces `.str.startswith`: it also copes with the 0
# placeholder that country_categorical holds for unknown codes, for which the
# string accessor produces a NaN mask that cannot be used to select rows.
# (The empty placeholder frame created first was dropped; it was overwritten immediately.)
South_Africa = frog_data_categorical.loc[frog_data_categorical['country_categorical'] == 'South Africa']

In [None]:
# Rows for Costa Rica only.
# An exact comparison replaces `.str.startswith`: it also copes with the 0
# placeholder that country_categorical holds for unknown codes, for which the
# string accessor produces a NaN mask that cannot be used to select rows.
# (The empty placeholder frame created first was dropped; it was overwritten immediately.)
Costa_Rica = frog_data_categorical.loc[frog_data_categorical['country_categorical'] == 'Costa Rica']

In [None]:
# installing needing package
pip install plotly # may need to run in command line to avoid invlaid syntax error

In [None]:
# Packages used by the map cells.
import pandas as pd
import plotly.express as px

# Interactive scatter map of the Australian records: one dot per observation,
# coloured by species, with the species name shown on hover. The OpenStreetMap
# tiles are applied in the same expression (update_layout returns the figure).
fig = px.scatter_mapbox(
    data_frame=Australia,
    lat=Australia["decimalLatitude"],
    lon=Australia["decimalLongitude"],
    color=Australia["species_categorical"],
    hover_data=["species_categorical"],
    center=dict(lat=-26, lon=134),  # roughly the middle of the continent
    zoom=3,
    width=800,
    height=600,
).update_layout(mapbox_style="open-street-map")

fig.show()

In [None]:
# Interactive scatter map of the South African records: one dot per observation,
# coloured by species, with the species name shown on hover. The OpenStreetMap
# tiles are applied in the same expression (update_layout returns the figure).
fig = px.scatter_mapbox(
    data_frame=South_Africa,
    lat=South_Africa["decimalLatitude"],
    lon=South_Africa["decimalLongitude"],
    color=South_Africa["species_categorical"],
    hover_data=["species_categorical"],
    center=dict(lat=-29, lon=25),  # where the map is centred
    zoom=4,
    width=800,
    height=600,
).update_layout(mapbox_style="open-street-map")

fig.show()

In [None]:
# Interactive scatter map of the Costa Rican records: one dot per observation,
# coloured by species, with the species name shown on hover. The OpenStreetMap
# tiles are applied in the same expression (update_layout returns the figure).
fig = px.scatter_mapbox(
    data_frame=Costa_Rica,
    lat=Costa_Rica["decimalLatitude"],
    lon=Costa_Rica["decimalLongitude"],
    color=Costa_Rica["species_categorical"],
    hover_data=["species_categorical"],
    center=dict(lat=10, lon=-84),  # where the map is centred
    zoom=6,
    width=800,
    height=600,
).update_layout(mapbox_style="open-street-map")

fig.show()

# Preparing Submission

First, I have to read in the testing data and add state and country data to the dataset so that the KNN model can make predictions.

In [None]:
# Load the submission template and add a 1-based running row number in an 'index' column.
test_template = (
    pd.read_csv('Level_2_challenge_2_submission_template.csv')
    .assign(index=lambda frame: range(1, len(frame) + 1))
)
test_template.head()

In [None]:
# pip install geopy
from geopy.geocoders import Nominatim

In [None]:
# Initialize the Nominatim geocoder client used by the reverse-geocoding cells.
# The user_agent string is passed to the client to identify this notebook.
geolocator = Nominatim(user_agent="geoapiTest_DataChallenge")

In [None]:
def city_state_country(row):
    """Reverse-geocode one row and attach its city, state and country.

    Parameters
    ----------
    row : mapping-like (e.g. a pandas Series from ``DataFrame.apply(axis=1)``)
        Must provide 'decimalLatitude' and 'decimalLongitude'.

    Returns
    -------
    The same ``row`` object with 'city', 'state' and 'country' set. Each value is
    the matching entry of the geocoder's address, or '' when that entry is missing
    or when the geocoder finds nothing at all for the coordinates.
    """
    coord = f"{row['decimalLatitude']}, {row['decimalLongitude']}"
    location = geolocator.reverse(coord, exactly_one=True)

    # reverse() returns None when no result exists for the coordinates (for
    # example a point in open water); fall back to an empty address instead of
    # failing on `None.raw`.
    address = location.raw.get('address', {}) if location is not None else {}

    row['city'] = address.get('city', '')
    row['state'] = address.get('state', '')
    row['country'] = address.get('country', '')
    return row

# Reverse-geocode every row of the template (one geolocator.reverse call per row),
# which adds the 'city', 'state' and 'country' columns.
# NOTE(review): each row triggers a request to the geocoding service, so this cell is
# presumably slow for large templates — consider caching the result; confirm.
test_template = test_template.apply(city_state_country, axis=1)
test_template.head()

In [None]:
# Encode the geocoded country name as an integer feature:
# Australia -> 1, Costa Rica -> 2, South Africa -> 3, anything else -> 0.
test_template['country_numeric'] = (
    test_template['country']
    .map({'Australia': 1, 'Costa Rica': 2, 'South Africa': 3})
    .fillna(0)          # names outside the table fall back to 0
    .astype('int64')
)

print(test_template[['country','country_numeric']].head())

In [None]:
# Print the distinct country names produced by the reverse geocoding, so they can be
# checked against the names the numeric encoding recognises.
print(test_template['country'].unique())

In [None]:
# Encode the geocoded state/province name of every test row as an integer feature.
#
# The table below reproduces the original name -> code assignments exactly. Names that
# are not listed get code 0. Code 15 is not assigned to any name here.
# NOTE(review): the codes presumably mirror the encoding applied to the training data —
# confirm before changing any of them.
#
# Fix: the 'San José' branch used to write its code into `frog_data` instead of
# `test_template`, so those test rows silently kept code 0 (and an unrelated frame was
# modified). All codes are now written to `test_template` in one vectorised step.
STATE_PROVINCE_CODES = {
    'New South Wales': 1,
    'Puntarenas': 2,
    'Heredia': 3,
    'Queensland': 4,
    'Limón': 5,
    'South Australia': 6,
    'Gauteng': 7,
    'Western Cape': 8,
    'Tasmania': 9,
    'Alajuela': 10,
    'KwaZulu-Natal': 11,
    'Northern Territory': 12,
    'Australian Capital Territory': 13,
    'Western Australia': 14,
    'Free State': 16,
    'Limon Province': 17,
    'Cape Province': 18,
    'Free State Province': 19,
    'KwaZulu-Natal Province': 20,
    'Limon': 21,
    'Limpopo Province': 22,
    'Limpopo': 23,
    'Eastern Cape': 24,
    'Mpumalanga': 25,
    'Cartago': 26,
    'San José': 27,
    'Guanacaste': 28,
    'North West': 29,
    'Northern Cape': 30,
    'San Jose': 31,
    'Capitol Territory': 32,
    'Cartago Province': 33,
    'Puntarenas Province': 34,
    'Eastern Cape Province': 35,
    'Transvaal': 36,
    'Cape': 37,
    'Jervis Bay Territory': 38,
    'Gauteng Province': 39,
    'Alajuela Province': 40,
    'Heredia Province': 41,
    'Transvaal Province': 42,
    'New south wales': 43,
    'Western australia': 44,
    'Provincia de Puntarenas': 45,
    'Cape Prov.': 46,
    'Natal Prov.': 47,
    'Limon Prov.': 48,
    'Transvaal Prov.': 49,
}

test_template['stateProvince_numeric'] = (
    test_template['state']
    .map(STATE_PROVINCE_CODES)
    .fillna(0)          # unknown or empty state names fall back to 0
    .astype('int64')
)

print(test_template[['state','stateProvince_numeric']].head())


In [None]:
# Print the distinct state/province names produced by the reverse geocoding, so they can
# be checked against the names the numeric encoding recognises.
print(test_template['state'].unique())

In [None]:
# Split the template into model inputs (X) and the species indicator columns (Y).
# X: coordinates plus the two encoded location features.
sub_x = test_template.loc[
    :,
    ['decimalLatitude', 'decimalLongitude', 'stateProvince_numeric', 'country_numeric'],
]

# Y: the nine per-species columns of the submission template.
sub_y = test_template.loc[
    :,
    [
        'Agalychnis Callidryas',
        'Austrochaperina Pluvialis',
        'Chiromantis Xerampelina',
        'Crinia Glauerti',
        'Crinia Signifera',
        'Cyclorana Australis',
        'Dendrobates Auratus',
        'Litoria Fallax',
        'Xenopus Laevis',
    ],
]

In [None]:
# Predict a numeric species code for every submission row with the `knn` model and
# store the codes next to the features.
# NOTE(review): `knn` is defined elsewhere in the notebook; the column order of sub_x
# presumably has to match the order used when the model was fitted — confirm.
test_predictions = knn.predict(sub_x)
test_template['numeric_predictions'] = test_predictions

In [None]:
# Print the distinct predicted species codes to see which classes the model actually used.
print(test_template['numeric_predictions'].unique())

In [None]:
# Decode the numeric predictions back into species names.
#
# Codes 1-9 map to the species column names of the submission template (code 7 is
# spelled 'Cyclorana Australis' here to match the template column). Any other
# prediction keeps the placeholder 0, as before.
#
# The names are assigned in one vectorised step rather than row by row into a column
# initialised with the integer 0: writing strings into an int64 column is deprecated
# in recent pandas versions and emits a FutureWarning.
SPECIES_BY_CODE = {
    1: 'Litoria Fallax',
    2: 'Agalychnis Callidryas',
    3: 'Dendrobates Auratus',
    4: 'Crinia Signifera',
    5: 'Xenopus Laevis',
    6: 'Chiromantis Xerampelina',
    7: 'Cyclorana Australis',
    8: 'Austrochaperina Pluvialis',
    9: 'Crinia Glauerti',
}

decoded_species = test_template['numeric_predictions'].map(SPECIES_BY_CODE)
# Object dtype lets the column hold the species names and the 0 placeholder side by side.
test_template['species_categorical'] = (
    decoded_species.astype(object).where(decoded_species.notna(), 0)
)

print(test_template[['species_categorical','numeric_predictions']].head())

In [None]:
# Preview the first rows of the template with all columns added so far.
test_template.head()

In [None]:
# Turn the decoded species label into the per-species indicator columns of the
# submission: the column named after a row's predicted species is set to 1; all
# other cells are left exactly as they are.
#
# Fix: the row-wise loop used to `break` on the first row whose label was not one of
# the nine species, which left every later row unflagged. Rows with an unrecognised
# label are now simply skipped and the remaining rows are still processed.
for species_name in (
    'Litoria Fallax',
    'Agalychnis Callidryas',
    'Dendrobates Auratus',
    'Crinia Signifera',
    'Xenopus Laevis',
    'Chiromantis Xerampelina',
    'Cyclorana Australis',
    'Austrochaperina Pluvialis',
    'Crinia Glauerti',
):
    predicted_here = test_template['species_categorical'] == species_name
    test_template.loc[predicted_here, species_name] = 1

test_template.head(n = 5)

In [None]:
# Independent copy holding only the columns that go into the submission file:
# the row id, the coordinates and the nine species indicator columns.
test_template2 = test_template.loc[
    :,
    [
        'id',
        'decimalLatitude',
        'decimalLongitude',
        'Agalychnis Callidryas',
        'Austrochaperina Pluvialis',
        'Chiromantis Xerampelina',
        'Crinia Glauerti',
        'Crinia Signifera',
        'Cyclorana Australis',
        'Dendrobates Auratus',
        'Litoria Fallax',
        'Xenopus Laevis',
    ],
].copy()

In [None]:
# Write the submission file without the DataFrame index column.
test_template2.to_csv('Swearingen_Taylor_Level_2_submission.csv', index=False)

# Submission File Completed!

In [None]:
pip freeze > requirements.txt 

In [None]:
import pickle

# Persist the `knn` model to model.pkl. The `with` block guarantees the file handle is
# flushed and closed even if pickling fails (the bare open() call was never closed).
with open('model.pkl', 'wb') as model_file:
    pickle.dump(knn, model_file)