# Decision Tree Regression

## Imports

In [None]:
import io
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import files
from sklearn.tree import DecisionTreeRegressor

## Load Data

In [None]:
# Opens Colab's upload widget. The cell output below shows the three dengue
# CSVs being saved to the working directory; the next cell reads them by name.
archive = files.upload()

Saving dengue_features_test.csv to dengue_features_test.csv
Saving dengue_features_train.csv to dengue_features_train.csv
Saving dengue_labels_train.csv to dengue_labels_train.csv


In [None]:
# Read the three uploaded CSVs from the working directory.
df_features = pd.read_csv('dengue_features_train.csv', sep=',')
df_labels = pd.read_csv('dengue_labels_train.csv', sep=',')
test = pd.read_csv('dengue_features_test.csv', sep=',')

# Join features and labels on their shared (city, year, weekofyear) key so
# each training row also carries the label columns.
train = df_features.merge(df_labels, on=['city', 'year', 'weekofyear'])

train.head()

Unnamed: 0,city,year,weekofyear,week_start_date,ndvi_ne,ndvi_nw,ndvi_se,ndvi_sw,precipitation_amt_mm,reanalysis_air_temp_k,reanalysis_avg_temp_k,reanalysis_dew_point_temp_k,reanalysis_max_air_temp_k,reanalysis_min_air_temp_k,reanalysis_precip_amt_kg_per_m2,reanalysis_relative_humidity_percent,reanalysis_sat_precip_amt_mm,reanalysis_specific_humidity_g_per_kg,reanalysis_tdtr_k,station_avg_temp_c,station_diur_temp_rng_c,station_max_temp_c,station_min_temp_c,station_precip_mm,total_cases
0,sj,1990,18,1990-04-30,0.1226,0.103725,0.198483,0.177617,12.42,297.572857,297.742857,292.414286,299.8,295.9,32.0,73.365714,12.42,14.012857,2.628571,25.442857,6.9,29.4,20.0,16.0,4
1,sj,1990,19,1990-05-07,0.1699,0.142175,0.162357,0.155486,22.82,298.211429,298.442857,293.951429,300.9,296.4,17.94,77.368571,22.82,15.372857,2.371429,26.714286,6.371429,31.7,22.2,8.6,5
2,sj,1990,20,1990-05-14,0.03225,0.172967,0.1572,0.170843,34.54,298.781429,298.878571,295.434286,300.5,297.3,26.1,82.052857,34.54,16.848571,2.3,26.714286,6.485714,32.2,22.8,41.4,4
3,sj,1990,21,1990-05-21,0.128633,0.245067,0.227557,0.235886,15.36,298.987143,299.228571,295.31,301.4,297.0,13.9,80.337143,15.36,16.672857,2.428571,27.471429,6.771429,33.3,23.3,4.0,3
4,sj,1990,22,1990-05-28,0.1962,0.2622,0.2512,0.24734,7.52,299.518571,299.664286,295.821429,301.9,297.5,12.2,80.46,7.52,17.21,3.014286,28.942857,9.371429,35.0,23.9,5.8,6


In [None]:
def remove_null_values(dataframe):
  """Forward-fill missing values in ``dataframe`` in place.

  Each null is replaced by the most recent non-null value above it in the
  same column. Nulls at the very top of a column have nothing to copy from
  and are left untouched.

  Args:
    dataframe: pandas DataFrame or Series to clean; it is modified in place.

  Returns:
    None. The caller keeps using the object it passed in.
  """
  # Reducing the null mask through NumPy yields a single boolean for both a
  # DataFrame and a Series, unlike the chained `.any().any()`.
  if dataframe.isnull().to_numpy().any():
    # `fillna(method='ffill')` is deprecated in recent pandas; `ffill` is the
    # direct equivalent.
    dataframe.ffill(inplace=True)

# Clean each frame with one direct call. Routing the helper through
# `DataFrame.apply` hands it one column at a time and relies on in-place
# edits of those per-column objects reaching the parent frame, which pandas
# does not support for functions that mutate their argument.
remove_null_values(train)
remove_null_values(test)

# Sanity check: both lines print False once no nulls remain.
print(pd.isnull(train).any().any())
print(pd.isnull(test).any().any())

False
False


## Decision Tree Algorithm

### Both cities

#### Select Features

In [None]:
# Predictors used by the model trained on both cities together.
features = ['weekofyear', 'ndvi_sw', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_min_air_temp_k']

# Tree depth is capped at the number of selected features.
# random_state fixes how ties between equally good splits are broken, so the
# importances shown here are identical on every run.
regressor = DecisionTreeRegressor(max_depth=len(features), random_state=0)
regressor.fit(train[features], train['total_cases'])

# One row per feature with the importance the fitted tree assigns to it.
pd.DataFrame({'Feature': features, 'Decision Tree': regressor.feature_importances_})

Unnamed: 0,Feature,Decision Tree
0,weekofyear,0.177123
1,ndvi_sw,0.512721
2,reanalysis_specific_humidity_g_per_kg,0.056211
3,reanalysis_min_air_temp_k,0.253945


##### Apply Algorithm

In [None]:
# Fit on the whole training set and predict total_cases for every test row.
# random_state fixes tie-breaking between equally good splits so repeated
# runs give the same predictions.
regressor = DecisionTreeRegressor(max_depth=len(features), random_state=0)
regressor.fit(train[features], train['total_cases'])
Y_predict = regressor.predict(test[features])

# The tree outputs floats; round to the nearest integer and store as int.
Y = np.rint(Y_predict).astype(int)
# predict() already returns a flat array for a single target, so no
# np.hstack is needed to flatten it.
result = Y

print(result)

[ 10  10  10  10  13  14  14  14  14  14  14  14  43  43  61  93  43  61
  76  61  61  43  43  43  43  43  32  43  43  43  43  48  48  43  11  64
  64  41  26  26  26  26  26  26  26  14  14  14   8  10  10  10  10  10
  10  10  14  14  13  14  14  14  14  14  43  43  43  43  43  61  43  43
  32  43  61  61  61  43  43  43  43  43  43  43  43  43  43  43  26  26
  26  26  26  16  26  26  14   8  14  14  14  10  10  10  31  14  14  14
  14  13  42  42  42  42  42  42  93  61  61  93  61 229  76  76 229  61
  43  61 229  43  43  43  43  48  43  48  11  19  19  48  26  26  16  26
  26  26  26  26  26  26   8  14  14  10  10  10  10  10  10  10  10  14
  31  42  42  42  42  42  14  61  43  76  61  76  61  43  61  43  61  61
  43  43  43  43  43  43  43  43  43  43  48  48  41  26  26  26  26  26
  26  16  26  26  14  14  14  10  10  10  10  13  10  31  14  14  14  14
  14  14  14  14  14  43  43  43  93  43  43  43  43  43  43  43  43  61
  43  43  43  43  32  43  43  34  34  26  41  26  2

##### Output

In [None]:
# generate output
# One row per test row: its city/year/weekofyear plus the predicted total_cases.
output = pd.DataFrame({ 'city': test['city'], 'year': test['year'], 'weekofyear': test['weekofyear'], 'total_cases': result})

# Let pandas open the file itself. Passing to_csv a text handle opened
# without newline='' can produce doubled line endings on some platforms.
output.to_csv('result.csv', index=False)

# Offer the written file as a browser download from Colab.
files.download('result.csv')

output.head()

### Divided By Cities

In [None]:
# Partition both datasets by city code ('sj' / 'iq') so the sections below
# can fit and predict for each city separately.
train_sj = train.loc[train['city'].eq('sj')]
test_sj = test.loc[test['city'].eq('sj')]

train_iq = train.loc[train['city'].eq('iq')]
test_iq = test.loc[test['city'].eq('iq')]

#### San Juan

##### Select Features

In [None]:
# Predictors used by the San Juan ('sj') model.
features_sj = ['weekofyear', 'ndvi_sw', 'reanalysis_specific_humidity_g_per_kg']

# Tree depth is capped at the number of selected features.
# random_state fixes how ties between equally good splits are broken, so the
# importances shown here are identical on every run.
regressor_sj = DecisionTreeRegressor(max_depth=len(features_sj), random_state=0)
regressor_sj.fit(train_sj[features_sj], train_sj['total_cases'])

# One row per feature with the importance the fitted tree assigns to it.
pd.DataFrame({'Feature': features_sj, 'Decision Tree': regressor_sj.feature_importances_})

Unnamed: 0,Feature,Decision Tree
0,weekofyear,0.214504
1,ndvi_sw,0.722324
2,reanalysis_specific_humidity_g_per_kg,0.063173


##### Apply Algorithm

In [None]:
# Fit on the 'sj' training rows and predict total_cases for the 'sj' test rows.
# random_state fixes tie-breaking between equally good splits so repeated
# runs give the same predictions.
regressor_sj = DecisionTreeRegressor(max_depth=len(features_sj), random_state=0)
regressor_sj.fit(train_sj[features_sj], train_sj['total_cases'])
Y_predict_sj = regressor_sj.predict(test_sj[features_sj])

# The tree outputs floats; round to the nearest integer and store as int.
Y_sj = np.rint(Y_predict_sj).astype(int)
# predict() already returns a flat array for a single target, so no
# np.hstack is needed to flatten it.
result_sj = Y_sj

[ 15  15  15  15  15  15  15  15  15  15  15  15  15  43  44  43  44  44
 168  43  44  44  44  44  43  44  43  43  43  43  44  43  44  44  43  27
  27  27  27  43  15  15  15  15  15  15  15  15  15  15  15  15  15  15
  15  15  15  15  15  15  15  15  15  15  44  43  43  43  44  44  43  43
  43  43  43  44  44  44  44  43  43  43  43  44  43  44  43  43  43  43
  43  43  27  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15
  15  44  44  44  44  44  44  44  44  44  44 168  43  44 168 168 168  43
  43  43 168  44  44  43  43  43  44  43  44  44  43  43  27  43  43  27
  27  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15
  15  44  44  44  44  44  15  44  44  44  44 168  43  43  43  43  44  44
  43  43  43  43  43  43  43  44  44  43  43  43  27  27  27  27  27  15
  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15  15
  15  15  15  15  15  44  43  43  43  43  43  44  44  44  43  43  43  43
  43  43  43  43  43  43  43  43  43  43  27  27  2

#### Iquitos

##### Select Features

In [None]:
# Predictors used by the Iquitos ('iq') model.
features_iq = ['weekofyear', 'ndvi_sw', 'reanalysis_specific_humidity_g_per_kg', 'station_min_temp_c']

# Tree depth is capped at the number of selected features.
# random_state fixes how ties between equally good splits are broken, so the
# importances shown here are identical on every run.
regressor_iq = DecisionTreeRegressor(max_depth=len(features_iq), random_state=0)
regressor_iq.fit(train_iq[features_iq], train_iq['total_cases'])

# One row per feature with the importance the fitted tree assigns to it.
pd.DataFrame({'Feature': features_iq, 'Decision Tree': regressor_iq.feature_importances_})

Unnamed: 0,Feature,Decision Tree
0,weekofyear,0.23342
1,ndvi_sw,0.196276
2,reanalysis_specific_humidity_g_per_kg,0.441176
3,station_min_temp_c,0.129128


##### Apply Algorithm

In [None]:
# Fit on the 'iq' training rows and predict total_cases for the 'iq' test rows.
# random_state fixes tie-breaking between equally good splits so repeated
# runs give the same predictions.
regressor_iq = DecisionTreeRegressor(max_depth=len(features_iq), random_state=0)
regressor_iq.fit(train_iq[features_iq], train_iq['total_cases'])
Y_predict_iq = regressor_iq.predict(test_iq[features_iq])

# The tree outputs floats; round to the nearest integer and store as int.
Y_iq = np.rint(Y_predict_iq).astype(int)
# predict() already returns a flat array for a single target, so no
# np.hstack is needed to flatten it.
result_iq = Y_iq

[ 4  3  7  4  3  3  3  4  3  3  4  3  9  9  9  9  7  8  8  8  8  8  8  8
  8  2  2  6  6  6  6 14 13 13 13 13  7  7  7  7  7  7  7  7  7  3  4  3
  4  3  3  4  4  4  3  3  3  3  3  3  3  3  4  4  9  9  9  7  7  8  2  8
  8  8  8  8  8 41  8 13 13 14  6 13 13  6 14  6  7  4  3  7  4  7  7  7
  7  3  7  4  4  4  4  3  4  4  4  3  3  3  3  3  3  4  3  3  9  9  9  7
  9  8  8  8 10  8  8  8  8  8 13 14  6 13  6 13 13 13 29  7  7  7  7  7
  7  7  7  3  7  7  3  3  3  3  3  3]


#### Result

In [None]:
# Label each city's predictions with the index of the test rows they were
# made for, then restore the row order of `test`. Plain concatenation is only
# correct when `test` lists every 'sj' row before every 'iq' row.
# Assumes every row of `test` belongs to one of the two cities.
final_result = pd.concat([
    pd.Series(result_sj, index=test_sj.index),
    pd.Series(result_iq, index=test_iq.index),
]).reindex(test.index).to_numpy()
final_result

array([ 15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
        43,  44,  43,  44,  44, 168,  43,  44,  44,  44,  44,  43,  44,
        43,  43,  43,  43,  44,  43,  44,  44,  43,  27,  27,  27,  27,
        43,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
        15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  44,
        43,  43,  43,  44,  44,  43,  43,  43,  43,  43,  44,  44,  44,
        44,  43,  43,  43,  43,  44,  43,  44,  43,  43,  43,  43,  43,
        43,  27,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
        15,  15,  15,  15,  15,  44,  44,  44,  44,  44,  44,  44,  44,
        44,  44, 168,  43,  44, 168, 168, 168,  43,  43,  43, 168,  44,
        44,  43,  43,  43,  44,  43,  44,  44,  43,  43,  27,  43,  43,
        27,  27,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,  15,
        15,  15,  15,  15,  15,  15,  15,  44,  44,  44,  44,  44,  15,
        44,  44,  44,  44, 168,  43,  43,  43,  43,  44,  44,  4

#### Output

In [None]:
# generate output
# One row per test row: its city/year/weekofyear plus the per-city prediction.
output = pd.DataFrame({ 'city': test['city'], 'year': test['year'], 'weekofyear': test['weekofyear'], 'total_cases': final_result})

# Let pandas open the file itself. Passing to_csv a text handle opened
# without newline='' can produce doubled line endings on some platforms.
output.to_csv('result.csv', index=False)

# Offer the written file as a browser download from Colab.
files.download('result.csv')

output.head()

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,city,year,weekofyear,total_cases
0,sj,2008,18,15
1,sj,2008,19,15
2,sj,2008,20,15
3,sj,2008,21,15
4,sj,2008,22,15


## Points


---
POINTS: 25.7284

FEATURES: ['weekofyear', 'station_max_temp_c', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_min_air_temp_k']

---
POINTS: 25.2788

FEATURES: ['weekofyear', 'ndvi_sw', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_min_air_temp_k']

---
POINTS: 25.6058

FEATURES: ['weekofyear', 'ndvi_sw', 'ndvi_se', 'reanalysis_specific_humidity_g_per_kg', 'reanalysis_min_air_temp_k']

---

POINTS: 25.1587

FEATURES:
  
>  SAN JUAN: ['weekofyear', 'ndvi_sw', 'reanalysis_specific_humidity_g_per_kg']
  
>  IQUITOS: ['weekofyear', 'ndvi_sw', 'reanalysis_specific_humidity_g_per_kg', 'station_min_temp_c']