# Exploratory Data Analysis

In [329]:
import pandas as pd
import numpy as np
import altair as alt
from sklearn.model_selection import train_test_split

## Formulate your question

- It’s usually a good idea to spend a few minutes to figure out
what is the question you’re really interested in, and narrow
it down to be as specific as possible (without becoming
uninteresting).
-  In particular, a sharp
question or hypothesis can serve as a dimension reduction
tool that can eliminate variables that are not immediately
relevant to the question.

Can we predict the burned areas of forests?

---

## Read in the data

In [226]:
# Load the forest fires dataset from a CSV file in the working directory
forest_data = pd.read_csv("forestfires.csv")
# Bare last expression so the notebook renders a (truncated) preview of the frame
forest_data

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.00
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.00
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.00
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.00
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.00


---

## Check the packaging

In [227]:
# Dimensions of the raw data as a (rows, columns) tuple
forest_data.shape

(517, 13)

In [228]:
# Dtype and non-null count for each column
forest_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 13 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   X       517 non-null    int64  
 1   Y       517 non-null    int64  
 2   month   517 non-null    object 
 3   day     517 non-null    object 
 4   FFMC    517 non-null    float64
 5   DMC     517 non-null    float64
 6   DC      517 non-null    float64
 7   ISI     517 non-null    float64
 8   temp    517 non-null    float64
 9   RH      517 non-null    int64  
 10  wind    517 non-null    float64
 11  rain    517 non-null    float64
 12  area    517 non-null    float64
dtypes: float64(8), int64(3), object(2)
memory usage: 52.6+ KB


In [229]:
# Number of missing values in each column
forest_data.isnull().sum()

X        0
Y        0
month    0
day      0
FFMC     0
DMC      0
DC       0
ISI      0
temp     0
RH       0
wind     0
rain     0
area     0
dtype: int64

---

## Looking at the top and the bottom of the data

In [230]:
# First rows of the raw data, to check that the file was parsed as expected
forest_data.head()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [231]:
# Last rows of the raw data, to check for trailing junk or a truncated file
forest_data.tail()

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
512,4,3,aug,sun,81.6,56.7,665.6,1.9,27.8,32,2.7,0.0,6.44
513,2,4,aug,sun,81.6,56.7,665.6,1.9,21.9,71,5.8,0.0,54.29
514,7,4,aug,sun,81.6,56.7,665.6,1.9,21.2,70,6.7,0.0,11.16
515,1,4,aug,sat,94.4,146.0,614.7,11.3,25.6,42,4.0,0.0,0.0
516,6,3,nov,tue,79.5,3.0,106.7,1.1,11.8,31,4.5,0.0,0.0


---

## Check the "n"s

In [232]:
# Hold out 20% of the rows as a test set; the exploration below only looks at
# train_set. The fixed random_state makes the split reproducible.
train_set, test_set = train_test_split(
    forest_data, test_size = 0.2, random_state = 123
)

In [233]:
# Size of the training split as a (rows, columns) tuple
train_set.shape

(413, 13)

In [234]:
# Number of training observations for each value of the X coordinate
train_set["X"].value_counts()

6    74
4    71
2    56
7    49
8    46
3    46
1    40
5    24
9     7
Name: X, dtype: int64

In [235]:
train_set["Y"].value_counts() # Y never takes the values 1 or 7 in the training set

4    164
5    104
6     53
3     52
2     37
9      2
8      1
Name: Y, dtype: int64

In [236]:
train_set["month"].value_counts() # mostly August and September; jan, nov and may appear only once each

aug    149
sep    138
mar     45
jul     22
feb     17
jun     13
oct     12
apr      8
dec      6
jan      1
nov      1
may      1
Name: month, dtype: int64

In [237]:
train_set["day"].value_counts() # Fri-Sun have the highest counts, but the differences between days are modest

sun    73
fri    70
sat    69
mon    57
thu    51
tue    48
wed    45
Name: day, dtype: int64

In [238]:
# Number of training observations on "weekend" days, which this analysis takes
# to be Friday, Saturday and Sunday.
# Computed from the data instead of re-typing figures from the value_counts()
# output, so the total stays correct if the split or the dataset changes.
int(train_set["day"].isin(["fri", "sat", "sun"]).sum())

212

In [239]:
# Number of training observations on weekdays (Monday to Thursday; Friday is
# counted with the weekend in this analysis).
# Computed from the data instead of re-typing figures from the value_counts()
# output, so the total stays correct if the split or the dataset changes.
int(train_set["day"].isin(["mon", "tue", "wed", "thu"]).sum())

201

In [240]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric
# columns, used to check the range of values of each variable
train_set.describe()

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
count,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0,413.0
mean,4.62954,4.237288,90.771429,109.854237,546.031235,8.971671,18.819613,44.353511,4.085714,0.026634,13.868329
std,2.278178,1.164551,4.655424,63.576254,251.835608,4.581362,5.789594,16.476107,1.813679,0.330882,69.84273
min,1.0,2.0,50.4,2.4,7.9,0.4,4.2,15.0,0.4,0.0,0.0
25%,3.0,4.0,90.2,61.1,433.3,6.4,15.4,33.0,2.7,0.0,0.0
50%,4.0,4.0,91.7,108.0,664.5,8.4,19.3,42.0,4.0,0.0,0.52
75%,6.0,5.0,92.9,141.3,713.9,10.7,22.9,53.0,5.4,0.0,6.58
max,9.0,9.0,96.2,291.3,860.6,56.1,33.3,99.0,9.4,6.4,1090.84


In [308]:
# Correlation matrix (Spearman rank correlation).
# Spearman is more robust to outliers than Pearson's correlation, which matters
# here because `area` has a very long right tail.
# Only numeric columns are passed in: `month` and `day` hold strings, and
# pandas >= 2.0 raises on non-numeric columns instead of silently dropping them.
# NOTE(review): the saved output includes `log_area`, which is only created in
# a later cell -- this cell was last executed out of order.
train_set.select_dtypes(include = "number").corr(method = "spearman")

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area,log_area
X,1.0,0.462903,-0.036634,-0.063781,-0.062645,0.00096,-0.091185,0.110238,0.054414,0.115815,0.116381,0.115853
Y,0.462903,1.0,0.05825,0.017757,-0.085801,0.034649,-0.030783,0.040472,-0.00124,0.07272,0.052622,0.052715
FFMC,-0.036634,0.05825,1.0,0.523017,0.257547,0.776041,0.613044,-0.33096,-0.049079,0.121794,0.058037,-0.034666
DMC,-0.063781,0.017757,0.523017,1.0,0.566408,0.429883,0.535974,0.017812,-0.121018,0.121047,0.082359,-0.001323
DC,-0.062645,-0.085801,0.257547,0.566408,1.0,0.094943,0.316728,0.041631,-0.231783,0.023028,0.074541,-0.027442
ISI,0.00096,0.034649,0.776041,0.429883,0.094943,1.0,0.426436,-0.190883,0.113262,0.122755,0.051099,-0.108711
temp,-0.091185,-0.030783,0.613044,0.535974,0.316728,0.426436,1.0,-0.496152,-0.16438,0.054985,0.091627,-0.022034
RH,0.110238,0.040472,-0.33096,0.017812,0.041631,-0.190883,-0.496152,1.0,0.011832,0.182781,-0.050826,-0.121734
wind,0.054414,-0.00124,-0.049079,-0.121018,-0.231783,0.113262,-0.16438,0.011832,1.0,0.110012,0.05218,0.068623
rain,0.115815,0.07272,0.121794,0.121047,0.023028,0.122755,0.054985,0.182781,0.110012,1.0,-0.058119,-0.010624


---

## Validate with at least one external data source

- Check whether the ranges of the data make sense.

---

## Making plots

### Dependent Variable (area)

In [242]:
# Histogram of the response variable (burnt area)
area_bars = alt.Chart(train_set).mark_bar()
area_bars.encode(
    x = alt.X("area", bin = alt.Bin(maxbins = 30)),
    y = alt.Y("count()"),
)
# The distribution is extremely right-skewed with high kurtosis, and a large
# share of the observations are exactly 0.
# Idea: a two-part model -- a binomial part for zero vs. non-zero area, plus a
# strictly positive distribution for the non-zero areas.

In [243]:
# Log transformation of the burnt area.
# Zeros are replaced with NaN first because log(0) is undefined, so rows with
# area == 0 are left out of this histogram.
# `assign` returns a new frame instead of writing a column into the frame that
# came out of train_test_split; this keeps the cell idempotent and avoids a
# possible SettingWithCopyWarning from pandas.
train_set = train_set.assign(
    log_area = np.log(train_set["area"].replace(0, np.nan))
)

alt.Chart(train_set).mark_bar().encode(
    alt.X("log_area",
           bin = alt.Bin(maxbins = 20),
           title = "Area Burnt (Log Transformation)"),
    y = alt.Y("count()")
)
# The non-zero areas look approximately normal on the log scale

### Predictors

#### Categorical Columns

In [244]:
# A simple count per category is not informative here because many
# observations have area == 0, so the plots below compare the distribution
# of area within each category instead

In [328]:
# Burnt area by day of week; the x-axis uses a square-root scale because of
# the outliers in area
day_x = alt.X(
    "area",
    scale = alt.Scale(type = "sqrt"),
    title = "Area Burnt (Square Root Transformation)",
)
day_y = alt.Y("day", sort = "x", title = "Day of Week")
day_color = alt.Color("day", legend = None)

(
    alt.Chart(train_set)
    .mark_boxplot(size = 15)
    .encode(x = day_x, y = day_y, color = day_color)
    .properties(width = 450, height = 250)
)

# The burnt areas appear to be distributed similarly across the days of the week

In [326]:
# Burnt area by month; the x-axis uses a square-root scale because of the
# outliers in area
month_x = alt.X(
    "area",
    scale = alt.Scale(type = "sqrt"),
    title = "Area Burnt (Square Root Transformation)",
)
month_y = alt.Y("month", sort = "x", title = "Month")
month_color = alt.Color("month", legend = None)

(
    alt.Chart(train_set)
    .mark_boxplot(size = 15)
    .encode(x = month_x, y = month_y, color = month_color)
    .properties(width = 450, height = 250)
)
# As seen in the value counts earlier, some months (January, May and November)
# have very few observations.
# Because the month variable is this unbalanced, grouping months into a
# season variable is a good way to reduce the risk of overfitting.

In [304]:
# Burnt area by season; the x-axis uses a square-root scale because of the
# outliers in area.
# Months are grouped into seasons (Dec-Feb winter, Mar-May spring,
# Jun-Aug summer, Sep-Nov fall).
season_mapping = {
    "dec" : "winter",
    "jan" : "winter",
    "feb" : "winter",
    "mar" : "spring",
    "apr" : "spring",
    "may" : "spring",
    "jun" : "summer",
    "jul" : "summer",
    "aug" : "summer",
    "sep" : "fall",
    "oct" : "fall",
    "nov" : "fall"
}
# `assign` returns a new frame instead of writing a column into the frame that
# came out of train_test_split; this keeps the cell idempotent and avoids a
# possible SettingWithCopyWarning from pandas.
train_set = train_set.assign(season = train_set["month"].map(season_mapping))

# Series.map yields NaN for any label missing from the mapping; fail loudly
# rather than silently dropping those rows from the plots that use `season`.
assert train_set["season"].notna().all(), "month label missing from season_mapping"

alt.Chart(train_set).mark_boxplot(size = 20).encode(
    x = alt.X("area",
              scale = alt.Scale(type = "sqrt"),
              title = "Area Burnt (Square Root Transformation)"),
    y = alt.Y("season",
              sort = "x",
              title = "Season"),
    color = alt.Color("season",
                      legend = None)
).properties(
    height = 200,
    width = 450
)

#### Numeric Columns

In [248]:
# Location of the burnt areas on the X/Y grid; circle size encodes burnt area.
# A couple of cells stand out with very large fires, e.g. (6, 5).
# NOTE(review): the original note also cited (8, 7), but Y never equals 7 in
# train_set (see the Y value counts) -- probably (8, 6); confirm on the plot.
alt.Chart(train_set).mark_circle().encode(
    x = alt.X("X",
              title = "X-axis Spatial Coordinate"),
    y = alt.Y("Y",
              title = "Y-axis Spatial Coordinate"),
    size = alt.Size("area",
                    scale = alt.Scale(range = (20, 1500)),
                    title = "Burnt Area")
).configure_mark(
    color = "red",
    opacity = 0.7
)

In [322]:
# Pair plot of the numeric predictors, coloured by season, to inspect pairwise
# relationships and outliers
# (keep these in mind when building your model)
# The x field follows the row repeater and the y field the column repeater,
# so every chart in a grid row shares its x variable.
pair_fields = ["FFMC", "DMC", "DC", "ISI", "temp", "RH", "wind", "rain"]

pair_panel = (
    alt.Chart(train_set)
    .mark_circle()
    .encode(
        x = alt.X(alt.repeat("row"), type = "quantitative"),
        y = alt.Y(alt.repeat("column"), type = "quantitative"),
        color = "season",
    )
    .properties(width = 100, height = 100)
)

(
    pair_panel
    .repeat(column = pair_fields, row = pair_fields)
    .configure_mark(opacity = 0.4)
    .interactive()
)



In [319]:
# Keep only true numeric measurements for the correlation plot:
# - X and Y are dropped because they are numeric codes that are categorical
#   in nature;
# - month and day are strings, and select_dtypes also removes any other
#   non-numeric column added along the way (e.g. season), which pandas >= 2.0
#   would otherwise raise on inside corr().
train_df_numeric = (
    train_set
    .drop(["X", "Y", "month", "day"], axis = 1)
    .select_dtypes(include = "number")
)

# Long format: one row per (level_0, level_1) variable pair with its Spearman
# correlation in "corr"
corr_df = train_df_numeric.corr("spearman").stack().reset_index(name = "corr")

# Blank out the diagonal (each variable against itself) so it does not dominate
# the size/colour scales. The mask compares the variable labels rather than
# testing corr == 1, which would also erase genuinely perfect off-diagonal
# rank correlations such as area vs. log_area (log is monotonic).
corr_df.loc[corr_df["level_0"] == corr_df["level_1"], "corr"] = 0
corr_df["abs"] = corr_df["corr"].abs()

# Bubble chart: size shows the strength of the correlation, colour its sign
(
    alt.Chart(corr_df)
    .mark_circle()
    .encode(x = "level_0",
            y = "level_1",
            size = "abs",
            color = alt.Color('corr',
                            scale=alt.Scale(scheme = 'blueorange',
                                            domain = (-1, 1))))
).properties(
    width = 300,
    height = 300
)

---

## Trying the easy question first

---

## Follow up

1. Do you have the right data?
2. Do you need other data?
3. Do you have the right question?

---