# Exploratory Data Analysis

In [1]:
import os
import sys

# Put the parent of the current working directory on the import path so the
# `utils.*` imports below resolve; the guard keeps repeated runs of this cell
# from adding the same entry twice.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)
    
from utils.data_loader import DataLoader
from utils.calculate_indices import CalculateIndices
from utils.basic_analysis import BasicDataAnalysis


In [2]:
# Location of the raw training CSV (relative path, so it is resolved against
# the kernel's current working directory).
raw_trainset_path = "../../data/raw_trainset.csv"

# The loader instance is kept around: the feature-engineering section calls
# its date feature step on the frame loaded here.
dataloader = DataLoader()
df_base = dataloader.load_transform(raw_trainset_path)

## Basic Data Analysis

In [3]:
# Wrap the raw frame in the project's BasicDataAnalysis helper; this same
# instance is reused by the summary cells that follow.
basicanalysis = BasicDataAnalysis(df_base)
# Bare last expression so the notebook renders the result (the recorded
# output lists one dtype per column).
basicanalysis.get_dtypes()

time                datetime64[ns]
id                           int64
disturbance_year             int64
doy                          int64
b2                           int64
b3                           int64
b4                           int64
b5                           int64
b6                           int64
b7                           int64
b8                           int64
b8a                          int64
b11                          int64
b12                          int64
species                     object
dtype: object

In [4]:
# Report the raw frame's column and row counts, one per line.
print(
    f"Raw Dataset cols:{basicanalysis.get_num_cols()}",
    f"Raw Dataset rows:{basicanalysis.get_num_rows()}",
    sep="\n",
)

Raw Dataset cols:15
Raw Dataset rows:4074354


In [5]:
# Summary statistics for the raw frame; the recorded output has
# count/mean/min/quartiles/max/std rows for the time and numeric columns.
# NOTE(review): "get_desricption" looks like a misspelling of
# "get_description". The cell executed, so this presumably matches the
# method's actual name in utils.basic_analysis — rename both together, if at all.
basicanalysis.get_desricption()

Unnamed: 0,time,id,disturbance_year,doy,b2,b3,b4,b5,b6,b7,b8,b8a,b11,b12
count,4074354,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0,4074354.0
mean,2020-02-05 05:16:36.360551936,14086.75,252.2988,176.3385,272.6073,430.9495,346.5769,676.4691,1683.092,2040.237,2166.979,2290.828,1158.527,601.5298
min,2017-01-01 00:00:00,1.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,2018-09-24 00:00:00,7038.0,0.0,106.0,156.0,281.0,175.0,460.0,1268.0,1529.0,1629.0,1741.0,730.0,336.0
50%,2020-03-15 00:00:00,14063.0,0.0,176.0,223.0,371.0,257.0,588.0,1547.0,1856.0,1971.0,2092.0,1011.0,481.0
75%,2021-06-16 00:00:00,21133.0,0.0,245.0,322.0,503.0,399.0,784.0,1924.0,2327.0,2464.0,2610.0,1441.0,709.0
max,2022-11-02 00:00:00,28212.0,2020.0,366.0,20087.0,17534.0,17524.0,17940.0,19827.0,21439.0,21811.0,24694.0,18554.0,17587.0
std,,8145.028,667.5001,85.21531,220.2539,260.2215,308.6128,348.2985,645.7464,802.4451,853.2377,859.7645,609.2414,419.6677


In [6]:
# Per-column missing-value counts; the recorded output is 0 for every column.
basicanalysis.get_missing_counts()

time                0
id                  0
disturbance_year    0
doy                 0
b2                  0
b3                  0
b4                  0
b5                  0
b6                  0
b7                  0
b8                  0
b8a                 0
b11                 0
b12                 0
species             0
dtype: int64

## Feature Engineering

### Datetime Feature Engineering

In [7]:
# Create the working frame `df` by running the loader's date feature step on
# the raw frame.
# NOTE(review): whether `df_base` is modified in place or copied is not
# visible from this notebook — confirm in utils.data_loader before relying on
# `df_base` still being the untouched raw data.
df = dataloader.date_feature_extraction(df_base)

### Advanced Feature Engineering

In [8]:
# Add the derived index columns; the preview further down shows ndvi, gndvi,
# wdvi, tndvi, savi, ipvi, mcari, reip, masvi2 and dvi among the columns.
calcindices = CalculateIndices()
# NOTE(review): `df` is rebound from itself, so re-running only this cell
# feeds an already-augmented frame back into add_all_indices. Whether that is
# harmless depends on the helper — confirm, or re-run from the date feature
# cell to start from a clean frame.
df = calcindices.add_all_indices(df)

In [9]:
# Preview the engineered frame as the cell's last expression; the recorded
# output is elided (rows and columns shown as "...").
# NOTE(review): the row labels in the output start at 808190 rather than 0, so
# the frame was presumably reordered or filtered without resetting the index —
# confirm before using positional and label-based indexing interchangeably.
df

Unnamed: 0,time,id,disturbance_year,doy,b2,b3,b4,b5,b6,b7,...,ndvi,gndvi,wdvi,tndvi,savi,ipvi,mcari,reip,masvi2,dvi
808190,2017-03-13,1,0,72,147,204,145,400,1193,1435,...,0.833142,0.751068,1520.5,1.154617,1.249353,0.916571,562.758621,-32.156368,-0.999372,1448
808191,2017-03-27,1,0,86,137,222,154,389,1247,1561,...,0.819988,0.750981,1480.0,1.148907,1.229623,0.909994,474.883117,-32.434732,-0.999358,1403
808192,2017-04-09,1,0,99,160,248,169,395,1209,1454,...,0.801059,0.708578,1445.5,1.140640,1.201236,0.900530,422.579882,-31.197789,-0.999347,1361
808193,2017-04-22,1,0,112,146,247,152,391,1354,1628,...,0.833333,0.736533,1596.0,1.154701,1.249657,0.916667,491.836842,-30.333333,-0.999402,1520
808194,2017-04-29,1,0,119,171,264,186,419,1353,1731,...,0.813253,0.735338,1713.0,1.145973,1.219573,0.906627,419.901075,-32.782655,-0.999446,1620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3311855,2022-09-05,28212,0,248,318,450,338,619,1664,2012,...,0.740897,0.634444,2102.0,1.113956,1.111132,0.870448,411.689941,-31.960766,-0.999560,1933
3311856,2022-09-24,28212,0,267,202,334,229,513,1666,2136,...,0.815024,0.729555,2132.5,1.146745,1.222290,0.907512,508.967686,-32.916739,-0.999555,2018
3311857,2022-10-03,28212,0,276,184,311,205,504,1580,1910,...,0.813382,0.719946,1889.5,1.146029,1.219795,0.906691,588.081951,-31.509294,-0.999498,1787
3311858,2022-10-27,28212,0,300,353,560,362,732,1944,2254,...,0.749654,0.601990,2349.0,1.117879,1.124287,0.874827,598.541436,-31.247525,-0.999605,2168
