# Practica APA - predicció superfície cremada d'un incendi
Sergi Curto Panisello,
Joan Melchor Lladó

## Imports

In [17]:
import pandas as pd
import numpy as np

## Obtenim les dades
Procedim a agafar les dades en format csv descarregades de https://datos.civio.es/dataset/todos-los-incendios-forestales/

In [2]:
df = pd.read_csv('fires-all.csv')
print(df.describe())

                 id    superficie           lat           lng  \
count  8.264000e+04  82640.000000  82616.000000  82616.000000   
mean   2.007259e+09     19.888085     41.763721     -5.664360   
std    4.247106e+06    223.787536      2.019672      4.394031   
min    2.001010e+09      1.000000      0.490720  -1000.000000   
25%    2.003390e+09      1.500000     40.876544     -7.257298   
50%    2.006330e+09      3.000000     42.371123     -6.019478   
75%    2.011150e+09      7.150000     43.143145     -4.303078   
max    2.015510e+09  28879.100000     87.824157    242.755603   

       latlng_explicit   idcomunidad   idprovincia   idmunicipio  \
count     82640.000000  82640.000000  82640.000000  82640.000000   
mean          0.741626      7.850278     28.411375     77.795946   
std           0.437743      5.419922     11.727155     98.867769   
min           0.000000      1.000000      1.000000      1.000000   
25%           0.000000      3.000000     21.000000     27.000000   
50%   

In [9]:
# Si separem solament Catalunya
cat = df.loc[df['idcomunidad'] == 2]
print(cat.head)
print('Shape')
print(cat.shape)


<bound method NDFrame.head of                id  superficie       fecha        lat       lng  \
423    2001080017        1.00  2001-02-22  41.667672  2.066639   
424    2001080032        6.00  2001-03-29  41.300741  1.666971   
425    2001080039        1.00  2001-04-07  41.474125  1.973610   
426    2001080041        1.33  2001-04-08  41.483501  1.584177   
427    2001080043        2.00  2001-04-08  41.511833  2.321732   
...           ...         ...         ...        ...       ...   
82298  2015430102        1.63  2015-11-25  41.266186  1.154032   
82299  2015430116       23.23  2015-06-09  41.175501  0.813588   
82300  2015430117       45.03  2015-08-24  40.940658  0.826569   
82301  2015430118       29.00  2015-08-29  41.268525  0.883916   
82302  2015430119        6.76  2015-10-22  41.004475  0.491856   

       latlng_explicit  idcomunidad  idprovincia  idmunicipio  \
423                  1            2            8          223   
424                  1            2            

## Preprocessing

### Tractament de missing values

In [14]:
# Columnes amb missing values, també es pot veure al describe del
# dataset a les variables que no tinguin un count de 82640
print(df.columns[df.isnull().any()].tolist())

# Sobre la "causa supuesta" és normal que hi hagin instàncies sense valor ja que 
# idmunicipio 999, i 998 ens carreguem els que no tinguin lat i long ja que són incendis originats fora d'Espanya

# Comencem per emplenar els missing values de muertos i heridos ja que
# si no hi han dades suposarem que són 0.
df['muertos'] = df['muertos'].fillna(0)
df['heridos'] = df['heridos'].fillna(0)

df.describe()

['lat', 'lng', 'causa_supuesta', 'gastos', 'perdidas']


Unnamed: 0,id,superficie,lat,lng,latlng_explicit,idcomunidad,idprovincia,idmunicipio,causa,causa_supuesta,causa_desc,muertos,heridos,time_ctrl,time_ext,personal,medios,gastos,perdidas
count,82640.0,82640.0,82616.0,82616.0,82640.0,82640.0,82640.0,82640.0,82640.0,46465.0,82640.0,82640.0,82640.0,82640.0,82640.0,82640.0,82640.0,11624.0,34349.0
mean,2007259000.0,19.888085,41.763721,-5.66436,0.741626,7.850278,28.411375,77.795946,3.695051,1.0,12.216917,0.000702,0.007357,236.247302,523.010733,21.188093,3.143224,7500.955,32306.66
std,4247106.0,223.787536,2.019672,4.394031,0.437743,5.419922,11.727155,98.867769,1.032923,0.0,24.631048,0.054333,0.143154,901.18681,2560.985781,48.210975,6.180303,39392.54,431591.2
min,2001010000.0,1.0,0.49072,-1000.0,0.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-28961.0
25%,2003390000.0,1.5,40.876544,-7.257298,0.0,3.0,21.0,27.0,4.0,1.0,0.0,0.0,0.0,64.0,134.0,5.0,1.0,356.0,102.0
50%,2006330000.0,3.0,42.371123,-6.019478,1.0,5.0,32.0,52.0,4.0,1.0,2.0,0.0,0.0,118.0,221.0,11.0,2.0,1107.0,1323.0
75%,2011150000.0,7.15,43.143145,-4.303078,1.0,14.0,36.0,92.0,4.0,1.0,10.0,0.0,0.0,210.0,415.0,23.0,3.0,3682.75,5738.0
max,2015510000.0,28879.1,87.824157,242.755603,1.0,18.0,51.0,999.0,6.0,1.0,99.0,11.0,12.0,132555.0,529682.0,3979.0,310.0,1426641.0,30640110.0


In [16]:
aux = df.loc[df['idmunicipio'] == 999]
latMissing = df.loc[np.isnan(df['lat'])]
print(aux.shape)
print(latMissing.shape)
print(latMissing)

(89, 21)
(24, 21)
               id  superficie       fecha  lat  lng  latlng_explicit  \
62     2001030064        1.00  2001-06-22  NaN  NaN                0   
1072   2001130024        4.00  2001-06-23  NaN  NaN                0   
1134   2001130126        1.00  2001-09-02  NaN  NaN                0   
2714   2001260056        5.00  2001-07-18  NaN  NaN                0   
13579  2002370039        2.00  2002-06-26  NaN  NaN                0   
13908  2002390277       27.50  2002-02-03  NaN  NaN                0   
14397  2002480039        3.50  2002-03-19  NaN  NaN                0   
14657  2002490315      140.90  2002-08-15  NaN  NaN                0   
16537  2003190144      122.00  2003-07-31  NaN  NaN                0   
17794  2003280197      207.70  2003-08-01  NaN  NaN                0   
17950  2003310067        1.00  2003-03-24  NaN  NaN                0   
17971  2003310097        2.00  2003-04-17  NaN  NaN                0   
28556  2004480064        1.38  2004-09-01  NaN

## Separem target del dataset
Volem predir la superfície cremada donat un incendi per tant separem aquesta variable de la resta.

In [5]:
X = df.drop('superficie', axis=1)
y = df['superficie']

# Comprovem que al target no hi hagin nulls
print(y.isnull().values.any())


False
