# Creating the training and test sets from previous research

This notebook creates the training and test data from the master data files. The resulting sets should be the same data that were used in the original research.

In [38]:
import pandas as pd
from pathlib import Path

In [39]:
# Input locations, all relative to the parent directory of this notebook.
datadir = Path("../")
elephasf = datadir / "Elephas_Maximus"
climatedir = datadir / "Climate"

# ETCCDI index files (present: Y1950-Y2000, future: Y2061-Y2080)
etccdi_presentf = climatedir / "ISEA3H09_CCSM4_Y1950_Y2000_ETCCDI_IDW1N10.txt"
etccdi_futuref = climatedir / "ISEA3H09_CCSM4_Y2061_Y2080_ETCCDI_IDW1N10.txt"

# WorldClim BIO files (present and the RCP85 / 2070 projection)
wc_presentf = climatedir / "ISEA3H09_WC30AS_V14_BIO.txt"
wc_futuref = climatedir / "ISEA3H09_WC30AS_V14_CMIP5_CCSM4_RCP85_2070_BIO.txt"

In [40]:
def _read_tsv(path):
    """Load one tab-separated text file into a DataFrame."""
    return pd.read_csv(path, sep="\t")


# climate predictors: ETCCDI and WorldClim, present and future
etccdi_present = _read_tsv(etccdi_presentf)
etccdi_future = _read_tsv(etccdi_futuref)
wc_present = _read_tsv(wc_presentf)
wc_future = _read_tsv(wc_futuref)

# Elephas maximus tables: presence labels, CV fold assignments, and the
# prediction file whose HIDs are used further down to select the test rows
labels = _read_tsv(elephasf / "Elephas_Maximus_PA_Natural_O20.txt")
CVfolds = _read_tsv(elephasf / "Elephas_Maximus_Folds_S10_Natural_O20.txt")
testarea = _read_tsv(elephasf / "Elephas_Maximus_Predictions_GLM_Y1950_Y2000.txt")

In [41]:
# WorldClim columns that have to be divided by 10 before use.
# NOTE(review): presumably these are stored as value x 10 in the source files;
# confirm the scale factor for each variable (BIO04 in particular).
need_to_convert = [
    "BIO01_Mean", "BIO02_Mean",
    "BIO08_Mean", "BIO09_Mean", "BIO10_Mean", "BIO11_Mean",
    "BIO04_Mean", "BIO05_Mean", "BIO06_Mean", "BIO07_Mean",
]

# Rescale the present and the future frame in place.
# NOTE(review): this cell is not idempotent - running it a second time without
# re-reading the files divides the same columns by 10 again.
for wc_frame in (wc_present, wc_future):
    wc_frame[need_to_convert] = wc_frame[need_to_convert] / 10

In [42]:
# training predictors: present-day ETCCDI and WorldClim columns matched on HID
traindata = etccdi_present.merge(wc_present, on="HID")

In [43]:
# keep only the rows whose HID has a presence label
is_labelled = traindata["HID"].isin(labels["HID"])
traindata = traindata[is_labelled]

In [44]:
# attach the CV fold assignment (column I005) and the Elephas presence labels
# to the predictors, then give the fold column a readable name
fold_assignment = CVfolds[["HID", "I005"]]
traindata = (
    fold_assignment
    .merge(traindata, on="HID")
    .merge(labels, on="HID")
    .rename(columns={"I005": "Folds"})
)

In [45]:
# Check whether out-of-scope values of the wc variables (marked with -1000)
# exist in the training data - expected result: no rows.
# The columns listed in `need_to_convert` were divided by 10 in an earlier cell,
# so a -1000 marker in one of those columns is now -100; comparing the whole
# frame against -1000 alone would miss it. Both forms are tested here.
out_of_scope_marker = -1000.0
rescaled_cols = [col for col in need_to_convert if col in traindata.columns]

has_marker = (traindata == out_of_scope_marker).any(axis=1)
has_rescaled_marker = (traindata[rescaled_cols] == out_of_scope_marker / 10).any(axis=1)

traindata[has_marker | has_rescaled_marker]

Unnamed: 0,HID,Folds,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,...,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean,PA


In [46]:
# Preview five random rows; the seed is fixed so that a fresh
# "Restart & Run All" shows the same rows every time.
traindata.sample(n=5, random_state=0)

Unnamed: 0,HID,Folds,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,...,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean,PA
2047,67360,6,110.19356,9.785968,8.299338,11.933886,7.025889,363.452143,0.023199,273.217126,...,14.075103,122.092233,24.554257,2.0,73.015235,55.754425,6.907795,33.114551,51.606193,1
7246,162137,4,6.736193,3.937255,48.827918,5.728062,0.0,365.0,0.0,2957.975147,...,22.824125,3357.335312,363.180718,220.912807,12.818783,945.535645,707.613726,872.083136,776.039183,1
1291,63570,6,33.399751,6.647204,6.714438,6.401224,40.259077,310.030832,9.941737,331.160653,...,-3.705134,257.712807,31.602284,13.22293,28.372784,89.627749,43.677009,85.079687,48.850332,0
5893,84746,5,56.378552,3.21966,10.789917,9.175158,146.053233,219.001023,70.637324,771.917421,...,-8.180185,647.723241,199.142715,2.946219,117.814449,459.127398,10.071576,459.127398,10.071576,0
665,61845,1,147.520172,9.110276,3.966389,13.418182,9.120871,362.4002,0.001847,83.674714,...,11.356894,158.580236,30.346538,0.0,81.478759,71.67688,0.0,0.0,58.785937,0


In [47]:
# number of data points per CV fold - the folds should be of similar size
fold_sizes = traindata.groupby("Folds").size()
fold_sizes

Folds
1     732
2     732
3     732
4     732
5     732
6     734
7     734
8     734
9     734
10    735
dtype: int64

In [48]:
# Predictors used in the models of the previous research. The last six
# predictors are the same for the GLM and the SGLM.
shared_predictors = ["ID_IDW1N10", "BIO14_Mean", "BIO18_Mean",
                     "CWD_IDW1N10", "RX1DAY_IDW1N10", "WSDI_IDW1N10"]
glm_predictors = ["BIO03_Mean", "TN10P_IDW1N10", "GSL_IDW1N10", "TNX_IDW1N10"] + shared_predictors
sglm_predictors = ["BIO08_Mean", "TXX_IDW1N10", "BIO02_Mean", "TN90P_IDW1N10"] + shared_predictors

# identifier and fold first, then the predictors, presence label (PA) last
train_GLM = traindata[["HID", "Folds", *glm_predictors, "PA"]]
train_SGLM = traindata[["HID", "Folds", *sglm_predictors, "PA"]]

In [49]:
# summary statistics of the SGLM training set
train_SGLM_summary = train_SGLM.describe()
train_SGLM_summary

Unnamed: 0,HID,Folds,BIO08_Mean,TXX_IDW1N10,BIO02_Mean,TN90P_IDW1N10,ID_IDW1N10,BIO14_Mean,BIO18_Mean,CWD_IDW1N10,RX1DAY_IDW1N10,WSDI_IDW1N10,PA
count,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0,7331.0
mean,87185.705088,5.504024,20.040758,38.769448,11.783774,11.052974,16.931569,17.404982,284.159281,20.188185,53.494516,6.963563,0.513573
std,34406.803105,2.872759,7.850105,5.732023,2.653852,0.566438,34.132812,38.80665,312.837901,16.826197,29.050981,3.061232,0.49985
min,48763.0,1.0,-9.540702,22.829643,5.241565,9.092302,0.0,0.0,0.0,2.458859,7.346992,0.768632,0.0
25%,66229.5,3.0,13.891883,34.375288,9.611574,10.66186,0.0,0.075074,13.953416,6.420884,24.46215,4.452838,0.0
50%,72695.0,6.0,22.399398,38.224824,11.866559,11.031791,0.134977,2.806513,176.938485,12.327726,60.304804,6.794981,1.0
75%,83199.5,8.0,26.307707,44.085135,13.871231,11.45737,11.781081,12.773083,482.204501,33.526784,76.01288,9.467519,1.0
max,162870.0,10.0,36.092831,51.419834,18.18636,12.777314,158.660532,278.746428,3020.802083,92.68243,177.918199,15.2352,1.0


In [50]:
# write out the training data
# Writing is opt-in: it stays switched off by default (the export was previously
# commented out), so running the notebook writes no files unless the flag
# below is set to True.
WRITE_TRAIN_CSV = False

if WRITE_TRAIN_CSV:
    train_GLM.to_csv("traindata_GLM.csv", index=False)
    train_SGLM.to_csv("traindata_SGLM.csv", index=False)

In [18]:
# test predictors: future ETCCDI and WorldClim columns matched on HID
testdata = etccdi_future.merge(wc_future, on="HID")

# second test set built from the present-day files, used to check
# performance in present conditions
testpresent = etccdi_present.merge(wc_present, on="HID")

In [19]:
# keep only the rows whose HID occurs in the test area
test_hids = testarea["HID"]
testdata = testdata[testdata["HID"].isin(test_hids)]
testpresent = testpresent[testpresent["HID"].isin(test_hids)]

In [18]:
# Check whether out-of-scope values of the wc variables (marked with -1000)
# exist in the test data - expected result: no rows.
# The columns listed in `need_to_convert` were divided by 10 in an earlier cell,
# so a -1000 marker in one of those columns is now -100; comparing the whole
# frame against -1000 alone would miss it. Both forms are tested here.
out_of_scope_marker = -1000.0
rescaled_cols = [col for col in need_to_convert if col in testdata.columns]

has_marker = (testdata == out_of_scope_marker).any(axis=1)
has_rescaled_marker = (testdata[rescaled_cols] == out_of_scope_marker / 10).any(axis=1)

testdata[has_marker | has_rescaled_marker]

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO10_Mean,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean


In [21]:
# Check whether out-of-scope values of the wc variables (marked with -1000)
# exist in the present-day test data - expected result: no rows.
# The columns listed in `need_to_convert` were divided by 10 in an earlier cell,
# so a -1000 marker in one of those columns is now -100; comparing the whole
# frame against -1000 alone would miss it. Both forms are tested here.
out_of_scope_marker = -1000.0
rescaled_cols = [col for col in need_to_convert if col in testpresent.columns]

has_marker = (testpresent == out_of_scope_marker).any(axis=1)
has_rescaled_marker = (testpresent[rescaled_cols] == out_of_scope_marker / 10).any(axis=1)

testpresent[has_marker | has_rescaled_marker]

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO10_Mean,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean


In [22]:
# Preview five random rows of the future test set; the seed is fixed so that
# a fresh "Restart & Run All" shows the same rows every time.
testdata.sample(n=5, random_state=0)

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO10_Mean,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean
52468,52469,36.040322,0.035217,61.763045,9.510214,0.0,365.0,0.0,2366.618288,90.838204,...,28.510809,26.996947,1896.932303,274.461995,22.886447,56.45915,786.578883,95.909349,326.40925,773.208668
40000,40001,14.528375,0.422214,15.85957,9.438359,11.120743,359.667261,0.159477,967.612428,24.554825,...,22.389066,5.247957,694.271373,73.995663,43.505819,15.632669,205.532987,147.930572,153.875375,198.275598
82487,82488,26.290566,0.130773,11.462262,12.678976,206.703961,174.379857,110.832628,589.591584,11.606384,...,17.737398,-16.924259,716.186459,164.711832,13.999026,81.174343,401.903776,47.994674,401.903776,48.540222
5046,5047,13.66297,0.287723,14.353363,8.45145,221.049172,130.079229,116.013716,985.212656,21.140442,...,16.362631,-13.80727,323.426501,56.920858,8.339492,52.254516,144.837065,37.709658,144.316497,60.033687
165100,165101,37.552172,0.0,7.157132,11.305616,0.0,365.0,0.0,495.470761,13.418774,...,26.676192,15.101206,200.660532,23.26957,12.031331,20.401893,58.424274,40.500924,57.921606,50.43112


In [23]:
# Preview five random rows of the present-day test set; the seed is fixed so
# that a fresh "Restart & Run All" shows the same rows every time.
testpresent.sample(n=5, random_state=0)

Unnamed: 0,HID,CDD_IDW1N10,CSDI_IDW1N10,CWD_IDW1N10,DTR_IDW1N10,FD_IDW1N10,GSL_IDW1N10,ID_IDW1N10,PRCPTOT_IDW1N10,R10MM_IDW1N10,...,BIO10_Mean,BIO11_Mean,BIO12_Mean,BIO13_Mean,BIO14_Mean,BIO15_Mean,BIO16_Mean,BIO17_Mean,BIO18_Mean,BIO19_Mean
6768,6769,26.396491,3.678816,5.907739,9.103508,290.800283,46.614952,240.056885,273.546547,2.850864,...,5.20018,-32.578759,153.908562,30.513181,3.360411,70.59736,81.360031,11.360027,72.64833,11.360027
63461,63462,146.286739,6.728983,4.738813,10.546002,0.0,365.0,0.0,113.828503,2.755602,...,34.803122,18.491918,63.878843,15.262049,0.0,95.243693,40.28934,0.0,0.0,32.95015
52040,52041,82.957998,8.960111,28.908867,8.616918,0.0,365.0,0.0,1281.795166,44.795896,...,24.903483,20.668909,991.324571,217.789802,0.0,87.295542,482.62801,1.046959,481.572191,1.046959
60115,60116,66.743255,9.873656,16.243258,9.231759,0.0,365.0,0.0,573.216539,11.886623,...,24.002825,20.639499,408.720188,82.908826,1.974763,74.984557,188.07914,11.183605,187.987708,18.106666
9513,9514,15.34422,3.096573,15.379549,10.435348,240.136666,122.151234,131.138117,816.354004,14.593778,...,7.4617,-12.622265,614.124211,88.619386,28.496676,39.249671,246.33201,99.962257,246.33201,113.207331


In [26]:
# Predictors used in the models of the previous research. The last six
# predictors are the same for the GLM and the SGLM.
shared_predictors = ["ID_IDW1N10", "BIO14_Mean", "BIO18_Mean",
                     "CWD_IDW1N10", "RX1DAY_IDW1N10", "WSDI_IDW1N10"]
glm_predictors = ["BIO03_Mean", "TN10P_IDW1N10", "GSL_IDW1N10", "TNX_IDW1N10"] + shared_predictors
sglm_predictors = ["BIO08_Mean", "TXX_IDW1N10", "BIO02_Mean", "TN90P_IDW1N10"] + shared_predictors

# the future test sets carry the identifier and the predictors only
test_GLM = testdata[["HID", *glm_predictors]]
test_SGLM = testdata[["HID", *sglm_predictors]]

In [28]:
# Same column selection for the present-day test data. The last six
# predictors are the same for the GLM and the SGLM.
shared_predictors = ["ID_IDW1N10", "BIO14_Mean", "BIO18_Mean",
                     "CWD_IDW1N10", "RX1DAY_IDW1N10", "WSDI_IDW1N10"]
glm_predictors = ["BIO03_Mean", "TN10P_IDW1N10", "GSL_IDW1N10", "TNX_IDW1N10"] + shared_predictors
sglm_predictors = ["BIO08_Mean", "TXX_IDW1N10", "BIO02_Mean", "TN90P_IDW1N10"] + shared_predictors

testpres_GLM = testpresent[["HID", *glm_predictors]]
testpres_SGLM = testpresent[["HID", *sglm_predictors]]

In [29]:
# summary statistics of the future GLM test set
test_GLM_summary = test_GLM.describe()
test_GLM_summary

Unnamed: 0,HID,BIO03_Mean,TN10P_IDW1N10,GSL_IDW1N10,TNX_IDW1N10,ID_IDW1N10,BIO14_Mean,BIO18_Mean,CWD_IDW1N10,RX1DAY_IDW1N10,WSDI_IDW1N10
count,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0
mean,65498.293066,43.713777,0.688865,291.973816,26.002702,42.242325,21.473698,223.489826,20.203475,47.147951,127.998674
std,43472.279822,20.348339,0.597581,102.287009,6.352885,72.965724,35.46209,211.340723,19.168752,29.014818,70.667772
min,75.0,7.237353,0.0,0.0,-2.078189,0.0,0.0,0.0,1.857493,4.791341,12.760962
25%,37378.25,25.0,0.168401,215.593079,22.946993,0.0,1.304898,81.031857,8.194309,24.716783,69.421049
50%,59335.5,42.071638,0.573106,364.355921,26.983166,0.0,9.625291,177.489067,12.203885,40.473165,109.815795
75%,82805.75,59.090257,1.059406,365.0,30.364863,65.205926,25.977391,301.591554,26.009921,62.202034,177.525166
max,190595.0,93.255617,4.252464,365.0,40.671134,362.243492,499.798616,3751.166026,181.446663,211.404602,364.81105


In [30]:
# summary statistics of the present-day GLM test set
testpres_GLM_summary = testpres_GLM.describe()
testpres_GLM_summary

Unnamed: 0,HID,BIO03_Mean,TN10P_IDW1N10,GSL_IDW1N10,TNX_IDW1N10,ID_IDW1N10,BIO14_Mean,BIO18_Mean,CWD_IDW1N10,RX1DAY_IDW1N10,WSDI_IDW1N10
count,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0,51514.0
mean,65498.293066,44.236101,10.074964,273.286782,22.072859,55.369352,20.858891,225.913868,21.283695,38.854412,6.966664
std,43472.279822,20.620452,0.466503,117.094126,6.364149,86.765263,33.740514,203.341948,22.178228,22.030322,3.018538
min,75.0,9.0,7.797225,0.0,-6.386312,0.0,0.0,0.0,1.650057,4.752302,0.355019
25%,37378.25,25.14592,9.761226,180.02307,18.846297,0.0,1.369253,80.017627,7.782282,21.749304,4.864594
50%,59335.5,42.99539,10.025021,359.746837,23.398786,0.01692,9.04993,186.088032,11.570322,34.873486,6.78137
75%,82805.75,58.952827,10.339781,365.0,26.481839,104.484313,24.985047,306.213966,26.968509,51.059543,8.737834
max,190595.0,92.999668,12.680949,365.0,36.102259,364.997745,477.689918,3020.802083,192.748037,177.918199,27.631467


In [28]:
# write out the test data
# Writing is opt-in: it stays switched off by default (the export was previously
# commented out), so running the notebook writes no files unless the flag
# below is set to True.
WRITE_TEST_CSV = False

if WRITE_TEST_CSV:
    test_GLM.to_csv("testdata_GLM.csv", index=False)
    test_SGLM.to_csv("testdata_SGLM.csv", index=False)

In [31]:
# write out the present-day test data
# Writing is opt-in: it stays switched off by default (the export was previously
# commented out), so running the notebook writes no files unless the flag
# below is set to True.
WRITE_TEST_PRESENT_CSV = False

if WRITE_TEST_PRESENT_CSV:
    testpres_GLM.to_csv("testdata_pres_GLM.csv", index=False)
    testpres_SGLM.to_csv("testdata_pres_SGLM.csv", index=False)