# Pre-processing statistics

In [None]:
# Mount Google Drive at /content/drive/ so files stored there can be read by path.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
import polars as pl
import glob
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Directory on the mounted Drive that holds the CSV files analysed below.
data_dir = '/content/drive/MyDrive/LIAM/Fire_Project/AI_project/data/raw/csv_NASA/modis/instrument_modis_2000-2024'
# Paths of every CSV file located directly inside that directory.
files = glob.glob(f'{data_dir}/*.csv')

## Type 0 rate

### Per year

In [None]:
# Percentage of rows whose `type` column equals 0, one value per file.
# Values are collected in plain lists and the frame is built once at the end,
# instead of re-allocating it with `vstack` on every iteration.
years = []
rates = []
for file in files:
    # Only the `type` column is used, so skip parsing the other columns.
    df = pl.read_csv(file, separator=';', columns=['type'])
    # The year is the first 4 of the last 19 characters of the file path.
    years.append(int(file[-19:][:4]))
    if df.height == 0:
        # An empty file has no defined rate; store a null rather than
        # dividing by zero.
        rates.append(None)
        continue
    rates.append((df.filter(pl.col('type') == 0).height / df.height) * 100)

type0_rate = pl.DataFrame({
    'year': pl.Series(years, dtype=pl.Int64),
    'type0_rate': pl.Series(rates, dtype=pl.Float64),
})
# Raise the display limit so every row of the table is printed.
pl.Config.set_tbl_rows(25)
type0_rate.sort('year')

year,type0_rate
i64,f64
2000,99.833611
2001,99.965351
2002,99.977501
2003,99.98919
2004,99.986755
2005,99.986838
2006,99.98951
2007,99.992586
2008,99.99273
2009,99.992775


### Across all data

In [None]:
# Rate of type-0 rows over all files pooled together. Averaging the per-file
# percentages would give every file the same weight regardless of how many
# rows it holds, so the row counts are summed before dividing.
total_rows = 0
type0_rows = 0
for file in files:
    # Only the `type` column is needed for the counts.
    type_col = pl.read_csv(file, separator=';', columns=['type'])
    total_rows += type_col.height
    type0_rows += type_col.filter(pl.col('type') == 0).height
# With no rows at all (e.g. no file found) the rate is undefined.
type0 = type0_rows / total_rows * 100 if total_rows else None
print(type0)

99.95480035653023


## Above-50% confidence rate

### Per year

In [None]:
# Percentage of rows whose `confidence` is strictly greater than 50, one
# value per file. Values are collected in plain lists and the frame is built
# once at the end, instead of re-allocating it with `vstack` on every iteration.
years = []
rates = []
for file in files:
    # Only the `confidence` column is used, so skip parsing the other columns.
    df = pl.read_csv(file, separator=';', columns=['confidence'])
    # The year is the first 4 of the last 19 characters of the file path.
    years.append(int(file[-19:][:4]))
    if df.height == 0:
        # An empty file has no defined rate; store a null rather than
        # dividing by zero.
        rates.append(None)
        continue
    rates.append((df.filter(pl.col('confidence') > 50).height / df.height) * 100)

confidence_rate = pl.DataFrame({
    'year': pl.Series(years, dtype=pl.Int64),
    'confidence_rate': pl.Series(rates, dtype=pl.Float64),
})
# Raise the display limit so every row of the table is printed.
pl.Config.set_tbl_rows(25)
confidence_rate.sort('year')

year,confidence_rate
i64,f64
2000,75.956739
2001,81.142535
2002,82.138393
2003,81.981314
2004,81.430054
2005,81.421765
2006,82.506993
2007,82.14953
2008,82.037441
2009,81.493469


### Across all data

In [None]:
# Rate of rows with `confidence` strictly greater than 50 over all files
# pooled together. Averaging the per-file percentages would give every file
# the same weight regardless of how many rows it holds, so the row counts are
# summed before dividing.
total_rows = 0
confident_rows = 0
for file in files:
    # Only the `confidence` column is needed for the counts.
    confidence_col = pl.read_csv(file, separator=';', columns=['confidence'])
    total_rows += confidence_col.height
    confident_rows += confidence_col.filter(pl.col('confidence') > 50).height
# With no rows at all (e.g. no file found) the rate is undefined.
fiftyp = confident_rows / total_rows * 100 if total_rows else None
print(fiftyp)

81.80796062388767


## Correlation Analysis: FRP and brightness

### Per year

In [None]:
# Correlation between the `brightness` and `frp` columns, one value per file.
# Values are collected in plain lists and the frame is built once at the end,
# instead of re-allocating it with `vstack` on every iteration.
years = []
coefficients = []
for file in files:
    # Only the two correlated columns are used, so skip parsing the others.
    df = pl.read_csv(file, separator=';', columns=['brightness', 'frp'])
    # `select` yields a 1x1 frame; `.item()` unwraps it to a Python scalar.
    corr = df.select(pl.corr("brightness", "frp")).item()
    # The year is the first 4 of the last 19 characters of the file path.
    years.append(int(file[-19:][:4]))
    coefficients.append(corr)

correlation = pl.DataFrame({
    'year': pl.Series(years, dtype=pl.Int64),
    'correlation': pl.Series(coefficients, dtype=pl.Float64),
})
# Raise the display limit so every row of the table is printed.
pl.Config.set_tbl_rows(25)
correlation.sort('year')

year,correlation
i64,f64
2000,0.503093
2001,0.551153
2002,0.602616
2003,0.579244
2004,0.591733
2005,0.612958
2006,0.610973
2007,0.576625
2008,0.584302
2009,0.612626


### Across all data

In [None]:
# Pearson correlation between `brightness` and `frp` over all files pooled
# together. The mean of per-file coefficients is not the correlation of the
# combined data, so the sums required by the Pearson formula are accumulated
# one file at a time (only one file is loaded at any moment).
n_rows = 0
sum_b = sum_f = sum_bb = sum_ff = sum_bf = 0.0
for file in files:
    # Rows missing either value cannot contribute to the pairwise sums.
    pairs = pl.read_csv(
        file, separator=';', columns=['brightness', 'frp']
    ).drop_nulls()
    if pairs.height == 0:
        continue
    brightness = pairs['brightness'].cast(pl.Float64)
    frp = pairs['frp'].cast(pl.Float64)
    n_rows += pairs.height
    sum_b += brightness.sum()
    sum_f += frp.sum()
    sum_bb += (brightness * brightness).sum()
    sum_ff += (frp * frp).sum()
    sum_bf += (brightness * frp).sum()

# r = (n*Sxy - Sx*Sy) / sqrt((n*Sxx - Sx^2) * (n*Syy - Sy^2))
covariance_term = n_rows * sum_bf - sum_b * sum_f
spread_term = (n_rows * sum_bb - sum_b ** 2) * (n_rows * sum_ff - sum_f ** 2)
# With no rows, or a column that never varies, the coefficient is undefined.
corrcoef = covariance_term / spread_term ** 0.5 if spread_term > 0 else None
print(corrcoef)

0.5984747205105904
