# Creating the training and test sets from previous research

This notebook creates the training and test data sets from the master data files. The resulting data should be the same as the data used in the original research.

In [1]:
import pandas as pd
from pathlib import Path

In [2]:
# Input locations, all relative to the data folder of the sdm-asian-elephants checkout.
datadir = Path("../sdm-asian-elephants/data/")
elephasf = datadir.joinpath("Elephas_Maximus")
climatedir = datadir.joinpath("Climate")

# ETCCDI index tables for the 1950-2000 and 2061-2080 periods.
etccdi_presentf = climatedir / "ISEA3H09_CCSM4_Y1950_Y2000_ETCCDI_IDW1N10.txt"
etccdi_futuref = climatedir / "ISEA3H09_CCSM4_Y2061_Y2080_ETCCDI_IDW1N10.txt"
# WorldClim BIO tables: baseline and the CMIP5 CCSM4 RCP85 2070 projection.
wc_presentf = climatedir / "ISEA3H09_WC30AS_V14_BIO.txt"
wc_futuref = climatedir / "ISEA3H09_WC30AS_V14_CMIP5_CCSM4_RCP85_2070_BIO.txt"

In [3]:
# Load the four tab-separated climate tables (present/future ETCCDI indices
# and present/future WorldClim BIO variables), in that order.
etccdi_present, etccdi_future, wc_present, wc_future = (
    pd.read_csv(climate_file, sep="\t")
    for climate_file in (etccdi_presentf, etccdi_futuref, wc_presentf, wc_futuref)
)

# Presence labels used for training, and the GLM prediction table whose HIDs
# are used further down to select the test rows.
labels = pd.read_csv(elephasf.joinpath("Elephas_Maximus_PA_Natural_O20.txt"), sep="\t")
testarea = pd.read_csv(elephasf.joinpath("Elephas_Maximus_Predictions_GLM_Y1950_Y2000.txt"), sep="\t")

In [4]:
# WorldClim columns whose stored values have to be divided by 10
# (BIO01, 02, 08-11 and 04-07, kept in the original column order).
need_to_convert = [f"BIO{n:02d}_Mean" for n in (1, 2, 8, 9, 10, 11, 4, 5, 6, 7)]
# NOTE(review): BIO04 (temperature seasonality) is documented by WorldClim 1.4
# as scaled by 100, not 10 - confirm that /10 matches the original research.

# Rescale both the present and the future table.
# The result is written back into the same frames, so run this cell only once
# per kernel: a second run would divide the values by 10 again.
wc_present[need_to_convert] = wc_present[need_to_convert].div(10)
wc_future[need_to_convert] = wc_future[need_to_convert].div(10)

In [5]:
# Combine the present-day ETCCDI and WorldClim tables into one frame,
# matching rows on the shared HID column (default inner join).
traindata = etccdi_present.merge(wc_present, on="HID")

In [6]:
# Keep only the rows whose HID also occurs in the label table.
has_label = traindata["HID"].isin(labels["HID"])
traindata = traindata.loc[has_label]

In [7]:
# Attach the Elephas presence labels to the climate variables by HID.
# Run once per kernel: traindata is rebound here, so a second run would
# merge the label columns in again.
traindata = traindata.merge(labels, on="HID")

In [8]:
# Show every training row in which any column equals -1000, the value this
# notebook treats as the out-of-scope marker for the wc variables.
# The saved output is empty: no such rows.
traindata.loc[traindata.eq(-1000.0).any(axis=1)]

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean,PA


In [9]:
# Peek at five random training rows; the fixed seed makes a fresh
# Restart & Run All display the same rows every time.
traindata.sample(5, random_state=0)

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean,PA
718,62007,149.263874,8.861205,4.146406,13.622922,8.16091,363.117868,0.001306,92.70889,1.826428,...,12.393269,195.860758,38.131288,0.0,78.871708,88.968206,0.546254,0.546254,74.988093,0
4753,79544,22.747456,1.46136,35.818103,11.483061,152.980035,228.17466,19.847755,2402.697122,73.46635,...,0.888997,985.940163,202.186719,6.794101,83.58458,535.597097,26.963292,524.112467,26.963292,0
4645,78467,48.655568,7.919767,46.345906,7.348333,0.0,365.0,0.0,1705.340289,53.499299,...,25.229646,1254.772864,271.847081,6.584963,80.427483,638.310916,29.965706,249.592737,66.654477,1
1719,65737,112.172535,9.691667,4.864167,11.124041,7.279107,361.363506,0.121176,181.152155,4.56379,...,15.367135,126.102925,31.65949,0.067865,103.582885,83.17798,2.439195,2.439195,83.069418,1
3372,71839,69.611347,7.146021,7.008987,11.105138,167.190816,203.874005,65.598217,240.382724,2.695085,...,-4.179729,66.285126,14.932937,0.116391,81.725111,38.732728,1.601172,38.732728,2.830109,0


In [10]:
# Variables used by the models of the previous research.
# Both models share these predictors; each adds four of its own in front,
# with HID first and the PA label last.
shared_train_predictors = ["ID_IDW1N10", "BIO14_Mean", "BIO18_Mean",
                           "CWD_IDW1N10", "RX1DAY_IDW1N10", "WSDI_IDW1N10"]
glm_train_columns = ["HID", "BIO03_Mean", "TN10P_IDW1N10", "GSL_IDW1N10", "TNX_IDW1N10",
                     *shared_train_predictors, "PA"]
sglm_train_columns = ["HID", "BIO08_Mean", "TXX_IDW1N10", "BIO02_Mean", "TN90P_IDW1N10",
                      *shared_train_predictors, "PA"]

train_GLM = traindata[glm_train_columns]
train_SGLM = traindata[sglm_train_columns]

In [25]:
# summary statistics (count, mean, std, min, quartiles, max) of the SGLM training columns
train_SGLM.describe()

Unnamed: 0,HID,BIO08_Mean,TXX_IDW1N10,BIO02_Mean,TN90P_IDW1N10,ID_IDW1N10,BIO14_Mean,BIO18_Mean,CWD_IDW1N10,RX1DAY_IDW1N10,WSDI_IDW1N10,PA
count,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0
mean,87185.705088,20.040758,38.769448,11.783774,11.052974,16.931569,17.404982,284.159281,20.188185,53.494516,6.963563,0.513573
std,34406.803105,7.850105,5.732023,2.653852,0.566438,34.132812,38.80665,312.837901,16.826197,29.050981,3.061232,0.49985
min,48763.0,-9.540702,22.829643,5.241565,9.092302,0.0,0.0,0.0,2.458859,7.346992,0.768632,0.0
25%,66229.5,13.891883,34.375288,9.611574,10.66186,0.0,0.075074,13.953416,6.420884,24.46215,4.452838,0.0
50%,72695.0,22.399398,38.224824,11.866559,11.031791,0.134977,2.806513,176.938485,12.327726,60.304804,6.794981,1.0
75%,83199.5,26.307707,44.085135,13.871231,11.45737,11.781081,12.773083,482.204501,33.526784,76.01288,9.467519,1.0
max,162870.0,36.092831,51.419834,18.18636,12.777314,158.660532,278.746428,3020.802083,92.68243,177.918199,15.2352,1.0


In [12]:
# write out the training data
# These writes were commented out, so a fresh run produced only the test CSVs
# even though the notebook is meant to create both sets; they now mirror the
# test-data export at the end of the notebook.
# index=False keeps the DataFrame index out of the files.
train_GLM.to_csv("traindata_GLM.csv", index=False)
train_SGLM.to_csv("traindata_SGLM.csv", index=False)

In [13]:
# Combine the future ETCCDI and WorldClim tables into one frame,
# matching rows on the shared HID column (default inner join).
testdata = etccdi_future.merge(wc_future, on="HID")

In [15]:
# Keep only the rows whose HID also occurs in the test-area table.
in_test_area = testdata["HID"].isin(testarea["HID"])
testdata = testdata.loc[in_test_area]

In [18]:
# Show every test row in which any column equals -1000, the value this
# notebook treats as the out-of-scope marker for the wc variables.
# The saved output is empty: no such rows.
testdata.loc[testdata.eq(-1000.0).any(axis=1)]

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO10_Mean,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean


In [19]:
# Peek at five random test rows; the fixed seed makes a fresh
# Restart & Run All display the same rows every time.
testdata.sample(5, random_state=0)

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO10_Mean,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean
158665,158666,29.821824,0.146827,26.135645,7.556161,0.042231,365.0,0.0,1570.663895,42.497514,...,30.47579,20.746958,1693.575629,408.449655,16.490914,85.378949,942.891389,66.764694,657.858292,74.09621
80813,80814,15.87524,0.168534,10.484239,9.844799,195.632939,164.855586,122.83562,669.158571,10.0691,...,16.907621,-13.917046,621.325675,74.264072,28.584175,26.764273,204.499207,99.61352,204.008633,111.027895
120070,120071,18.223077,0.074318,13.559308,10.405515,1.023838,365.0,0.0,1518.809177,47.719293,...,28.384518,16.812033,1442.599495,151.07264,72.152057,20.197696,428.013374,254.314001,380.480282,273.623202
15452,15453,19.909543,0.0,9.18122,10.391591,83.360017,277.85061,25.13274,863.109055,25.390243,...,27.25798,-0.418179,946.52421,114.83222,34.555859,36.443823,330.188135,111.371876,314.167877,111.371876
65271,65272,63.913911,0.07998,7.756304,2.700287,4.629433,358.344942,1.405856,309.17586,4.960609,...,29.524045,-1.473013,123.844545,13.171993,6.741111,18.990314,36.338206,24.596471,27.40759,29.579462


In [21]:
# Variables used by the models of the previous research.
# Both models share these predictors; each adds four of its own in front,
# with HID first. The test tables carry no PA label column.
shared_test_predictors = ["ID_IDW1N10", "BIO14_Mean", "BIO18_Mean",
                          "CWD_IDW1N10", "RX1DAY_IDW1N10", "WSDI_IDW1N10"]
glm_test_columns = ["HID", "BIO03_Mean", "TN10P_IDW1N10", "GSL_IDW1N10", "TNX_IDW1N10",
                    *shared_test_predictors]
sglm_test_columns = ["HID", "BIO08_Mean", "TXX_IDW1N10", "BIO02_Mean", "TN90P_IDW1N10",
                     *shared_test_predictors]

test_GLM = testdata[glm_test_columns]
test_SGLM = testdata[sglm_test_columns]

In [27]:
# summary statistics (count, mean, std, min, quartiles, max) of the GLM test columns
test_GLM.describe()

Unnamed: 0,HID,BIO03_Mean,TN10P_IDW1N10,GSL_IDW1N10,TNX_IDW1N10,ID_IDW1N10,BIO14_Mean,BIO18_Mean,CWD_IDW1N10,RX1DAY_IDW1N10,WSDI_IDW1N10
count,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0
mean,65498.293066,43.713777,0.688865,291.973816,26.002702,42.242325,21.473698,223.489826,20.203475,47.147951,127.998674
std,43472.279822,20.348339,0.597581,102.287009,6.352885,72.965724,35.46209,211.340723,19.168752,29.014818,70.667772
min,75.0,7.237353,0.0,0.0,-2.078189,0.0,0.0,0.0,1.857493,4.791341,12.760962
25%,37378.25,25.0,0.168401,215.593079,22.946993,0.0,1.304898,81.031857,8.194309,24.716783,69.421049
50%,59335.5,42.071638,0.573106,364.355921,26.983166,0.0,9.625291,177.489067,12.203885,40.473165,109.815795
75%,82805.75,59.090257,1.059406,365.0,30.364863,65.205926,25.977391,301.591554,26.009921,62.202034,177.525166
max,190595.0,93.255617,4.252464,365.0,40.671134,362.243492,499.798616,3751.166026,181.446663,211.404602,364.81105


In [28]:
# Export both test tables as CSV files in the working directory.
# index=False keeps the DataFrame index out of the files.
for test_frame, csv_name in ((test_GLM, "testdata_GLM.csv"),
                             (test_SGLM, "testdata_SGLM.csv")):
    test_frame.to_csv(csv_name, index=False)