In [16]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pointbiserialr
from sampling import pps, accept_reject

PROPERTY = 'fire_size'

dfo = pd.read_csv('./data/FW_Veg_Rem_Combined.csv')

cont_columns_prefix = ['Prec_pre_', "Temp_pre_", "Wind_pre_", "Hum_pre_"]
columns_suffix = [7, 15, 30]

weather_columns = [ x + str(y) for x in cont_columns_prefix for y in columns_suffix]

def preprocess_missing_weather_data(df):
    df_subsample = df[df['Temp_pre_7'] == -1]
    df_clean = df[df['Temp_pre_7'] != -1]
    mean_weather_data = df_clean.groupby(['state', 'discovery_month'])[weather_columns].mean().reset_index()
    df_subsample.drop(weather_columns, axis=1, inplace=True)
    df_subsample = df_subsample.merge(mean_weather_data)
    df_clean = df_clean.append(df_subsample, ignore_index=True)
    return df_clean

df = preprocess_missing_weather_data(dfo)

causes = df.stat_cause_descr.unique()
bayesian_net_columns_cont = ['fire_size', 'latitude', 'longitude', 'Temp_pre_30', 'Temp_pre_15', 'Temp_pre_7', 
        'Temp_cont', 'Wind_pre_30', 'Wind_pre_15', 'Wind_pre_7', 'Wind_cont',
       'Hum_pre_30', 'Hum_pre_15', 'Hum_pre_7', 'Hum_cont', 'Prec_pre_30',
       'Prec_pre_15', 'Prec_pre_7', 'Prec_cont', 'remoteness']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [20]:
ress= []

for col in bayesian_net_columns_cont:
    row = { 'property': col }
    s = 0
    abs_sum = 0
    for cause in causes:
        df[cause] = df.stat_cause_descr.apply(lambda x: x == cause)
        r,p = pointbiserialr(df[cause], df[col])
        s += r
        abs_sum += abs(r)
        row[cause] = r
    row['mean'] = s/len(causes)
    row['abs_sum'] = abs_sum
    ress.append(row)

df_res = pd.DataFrame(ress)
print(df_res)

       property  Missing/Undefined     Arson  Debris Burning  Campfire  \
0     fire_size          -0.020008 -0.050966       -0.077812  0.000499   
1      latitude          -0.146258 -0.051947       -0.104620  0.017823   
2     longitude           0.073169  0.137974        0.196612  0.003758   
3   Temp_pre_30           0.018885 -0.100629       -0.152301 -0.031213   
4   Temp_pre_15           0.011799 -0.097339       -0.157080 -0.033186   
5    Temp_pre_7           0.008170 -0.095068       -0.156220 -0.034331   
6     Temp_cont           0.004033 -0.111757       -0.077161 -0.015318   
7   Wind_pre_30           0.091449 -0.049976       -0.068323 -0.015564   
8   Wind_pre_15           0.080753 -0.039965       -0.072475 -0.014304   
9    Wind_pre_7           0.074346 -0.030052       -0.067739 -0.013438   
10    Wind_cont           0.014486 -0.091204       -0.035546 -0.007180   
11   Hum_pre_30           0.048636  0.067344        0.129945  0.005554   
12   Hum_pre_15           0.040397  0.

In [19]:
df_res.sort_values(by='abs_sum').head(10).property

17     Prec_pre_7
18      Prec_cont
16    Prec_pre_15
15    Prec_pre_30
14       Hum_cont
9      Wind_pre_7
10      Wind_cont
13      Hum_pre_7
8     Wind_pre_15
7     Wind_pre_30
Name: property, dtype: object