In [4]:
import geopandas as gpd
import glob
import pandas as pd
import rasterio
from rasterio.mask import mask
import sys
sys.path.insert(0, './satellite-images-nso-datascience/src/nso_ds_classes/')
from nso_ds_normalize_scaler import scaler_class_all
from statsmodels.stats.outliers_influence import variance_inflation_factor

## Voornes Duin

### Prepare data

In [5]:
def extract_dataframe_pixels_values_from_tif_and_polygons(path_to_tif, path_to_polygons):
    """
    Build a per-pixel table from the annotated polygons that belong to one raster.

    Only polygons whose 'name' attribute equals the raster file name (up to its
    first dot) are used. Each matching polygon is cut out of the raster and every
    pixel of the cut-out becomes one row.

    @param path_to_tif: Path to the raster file, using "/" as separator. The raster
                        is expected to hold six bands (r, g, b, i, ndvi, height).
    @param path_to_polygons: Path to a vector file readable by geopandas that has
                             'name' and 'Label' attributes.
    @return A DataFrame with the columns r, g, b, i, ndvi, height, label and image.
            Rows whose 'r' value is 0 are dropped. The frame is empty (same
            columns) when no polygon matches the raster.
    """
    band_columns = ["r", "g", "b", "i", "ndvi", "height"]
    image_name = path_to_tif.split("/")[-1]
    name_tif = image_name.split('.')[0]

    geo_file = gpd.read_file(path_to_polygons)
    # Reproject once, BEFORE selecting rows. Doing it inside the row loop has no
    # effect on the geometry being masked, because the row was already taken from
    # the un-projected frame.
    if geo_file.crs != 'epsg:28992':
        geo_file = geo_file.to_crs(epsg=28992)

    matching_polygons = geo_file[geo_file['name'] == name_tif]

    frames = []
    # The context manager closes the raster even when masking raises.
    with rasterio.open(path_to_tif) as src:
        for _, row in matching_polygons.iterrows():
            # mask() expects an iterable of shapes, so the geometry is wrapped in a list.
            out_image, _ = mask(src, [row['geometry']], crop=True)

            # out_image is indexed (band, row, col); flatten to one row per pixel.
            df_row = pd.DataFrame(out_image.reshape(out_image.shape[0], -1).T,
                                  columns=band_columns)
            df_row['label'] = row["Label"]
            df_row['image'] = image_name
            frames.append(df_row)

    if frames:
        # DataFrame.append was removed in pandas 2.0; concatenate once instead.
        df = pd.concat(frames)
        df = df[df['r'] != 0].reset_index(drop=True)
    else:
        df = pd.DataFrame(columns=band_columns + ['label', 'image'])
    print(len(df))

    return df

In [6]:
def __get_season_for_month(month):
    """
        Map a month number to the name of its meteorological season.

        December, January and February give "Winter"; March to May "Spring";
        June to August "Summer"; September to November "Fall".

        @param month: The month as a number, or anything int() accepts (e.g. "03").
        @return The season name as a string.
    """

    season_names = ("Winter", "Spring", "Summer", "Fall")

    # The modulo folds December (12) onto 0 so it lands in the same
    # three-month bucket as January and February.
    bucket = int(month) % 12 // 3
    return season_names[bucket]

In [None]:
# The annotation polygons are shared by every image, so the path is set once.
path_to_polygons = './data/annotations/annotaties_VoornesDuin_polygoon.geojson' #path to the annotations file

pixel_frames = []
for file in glob.glob("./data/Voornes Duin/*/*ndvi_height.tif"): #path to the satellite image
    print(file)
    # glob returns backslashes on Windows; the extraction function splits paths on "/".
    path_to_tif = file.replace("\\","/")
    pixel_frames.append(extract_dataframe_pixels_values_from_tif_and_polygons(path_to_tif, path_to_polygons))

# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop is
# quadratic. Concatenate once; ignore_index gives the unique row index that the
# discarded reset_index() call was meant to produce.
df = pd.concat(pixel_frames, ignore_index=True) if pixel_frames else pd.DataFrame([])

In [None]:
# Mean band value per raw label. numeric_only skips the string column 'image',
# which makes mean() raise a TypeError on pandas >= 2.0.
df.groupby("label").mean(numeric_only=True)

In [9]:
# Harmonise the free-text annotation labels (capitalisation, spelling variants).
# Each entry is (pattern, replacement, is_regex); the entries are applied in order
# as substring replacements.
label_fixes = [
    ("zand", "Zand", False),
    ("struweel", "Struweel", False),
    ("bos", "Bos", False),
    # The negative look-behind keeps this idempotent: without it, re-running the
    # cell (or an already correct label) yields "Vochtige Vochtige duinvallei".
    (r"(?<!Vochtige )duinvallei", "Vochtige duinvallei", True),
    ("laag gras", "Laag gras", False),
    ("water", "Water", False),
    ("struwee", "Struweel", False),
    (r"(?<!Vochtige )duin vallei", "Vochtige duinvallei", True),
    ("laag vegetatie", "Laag gras", False),
    ("lssg gras", "Laag gras", False),
    ("laag vegatatie", "Laag gras", False),
]

cleaned_label = df['label']
for pattern, replacement, is_regex in label_fixes:
    # regex is passed explicitly because its default changed in pandas 2.0.
    cleaned_label = cleaned_label.str.replace(pattern, replacement, regex=is_regex)
df['label'] = cleaned_label

In [None]:
# Mean band value per harmonised label. numeric_only skips the string column
# 'image', which makes mean() raise a TypeError on pandas >= 2.0.
df.groupby("label").mean(numeric_only=True)

In [11]:
# The first 15 characters of the image file name are kept as the 'date' key.
df['date'] = df['image'].str.slice(0, 15)

In [12]:
# Make sure the 'date' key is stored as text.
date_as_text = df['date'].astype(str)
df['date'] = date_as_text

In [None]:
# Characters 4-5 of the image file name are passed to the season helper as the month.
month_codes = df['image'].str.slice(4, 6)
df['season'] = month_codes.apply(__get_season_for_month)
df

In [15]:
# Persist the labelled pixel table so the next section can start from disk.
pixel_table_pickle = "VoornesDuin_polyg2pixel_new.pkl"
df.to_pickle(pixel_table_pickle)

### Normalise dataframe

In [None]:
# Reload the labelled pixel table written by the save cell above.
# Pickle files execute code on load: only read files produced by this notebook.
pixel_table_pickle = "VoornesDuin_polyg2pixel_new.pkl"
df = pd.read_pickle(pixel_table_pickle)
df

In [16]:
# Scale a deep copy so the unscaled table stays available in `df`.
df_scaled = df.copy(deep=True)

In [None]:
# The height scaler file is the same for every acquisition date.
ahn_type = "./scalers/ahn4.save"

for date in df_scaled['date'].unique():
    print(date)

    # Scaler files are looked up by the part of the date key before the first space.
    date_prefix = date.split(" ")[0]

    # One scaler file per band 1-5; fail with a clear message instead of a bare
    # IndexError when a file is missing.
    band_scalers = {}
    for band in range(1, 6):
        pattern = "./scalers/" + date_prefix + "*band" + str(band) + "*"
        matches = glob.glob(pattern)
        if not matches:
            raise FileNotFoundError("No scaler file matches " + pattern)
        band_scalers["scaler_file_band" + str(band)] = matches[0].replace("\\", "/")

    a_normalize_scaler_class_all = scaler_class_all(scaler_file_band6=ahn_type, **band_scalers)

    # Only the rows of this date are transformed and written back.
    rows_for_date = df_scaled['date'] == date
    df_scaled[rows_for_date] = a_normalize_scaler_class_all.transform(df_scaled[rows_for_date], col_names=["r","g","b","i",'ndvi','height'])

In [None]:
# Median scaled band value per label. numeric_only skips the string columns
# (image, date, season), which make median() raise a TypeError on pandas >= 2.0.
df_scaled.groupby("label").median(numeric_only=True)

In [22]:
# Persist the scaled pixel table so the next section can start from disk.
scaled_table_pickle = "VoornesDuin_polyg2pixel_scaled_new.pkl"
df_scaled.to_pickle(scaled_table_pickle)

### Check multicollinearity between independent variables

In [None]:
# Reload the scaled pixel table written by the save cell above.
# Pickle files execute code on load: only read files produced by this notebook.
scaled_table_pickle = "VoornesDuin_polyg2pixel_scaled_new.pkl"
df_scaled = pd.read_pickle(scaled_table_pickle)
df_scaled


In [None]:
# Pairwise correlation coefficients between the six features.
feature_columns = ['r','g','b','i','ndvi','height']
df_scaled[feature_columns].corr()

In [None]:
# One-hot encode the label (the dependent variable) next to the six features so
# it can be correlated with them. All get_dummies options are left at their defaults.
features_and_label = df_scaled[['r','g','b','i','ndvi','height','label']]
data_dummy = pd.get_dummies(features_and_label)
data_dummy

In [None]:
# Pearson correlation (the pandas default) between features and label indicators.
data_dummy.corr(method='pearson')

In [None]:
# Variance inflation factor (VIF) for all six features.
# NOTE(review): the feature matrix is passed as-is, without an added constant
# column - confirm that this is intended for these scaled features.
X = df_scaled[['r','g','b','i','ndvi','height']]
feature_matrix = X.values

# One VIF per feature, in column order.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(feature_matrix, column_position)
            for column_position in range(X.shape[1])],
})

print(vif_data)

In [None]:
# VIF with green dropped: r, b, i, ndvi, height.
X = df_scaled[['r','b','i','ndvi','height']]
feature_matrix = X.values

# One VIF per feature, in column order.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(feature_matrix, column_position)
            for column_position in range(X.shape[1])],
})

print(vif_data)

In [None]:
# VIF with green and red dropped: b, i, ndvi, height.
X = df_scaled[['b','i','ndvi','height']]
feature_matrix = X.values

# One VIF per feature, in column order.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(feature_matrix, column_position)
            for column_position in range(X.shape[1])],
})

print(vif_data)

In [None]:
# VIF with green, red and infra-red dropped: b, ndvi, height (best results).
X = df_scaled[['b','ndvi','height']]
feature_matrix = X.values

# One VIF per feature, in column order.
vif_data = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(feature_matrix, column_position)
            for column_position in range(X.shape[1])],
})

print(vif_data)