# Landslides

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

In [None]:
# Load the training data from the CSV file into a DataFrame
train = pd.read_csv('data/Train.csv')

In [None]:
# Show the first rows of the data
train.head()

In [None]:
# Print a summary of the columns: dtypes, non-null counts and memory usage
train.info()

In [None]:
# Check for missing values: total number of missing entries over all columns
train.isna().sum().sum()
# No missing values

In [None]:
# Check for duplicates: True if any row repeats an earlier row in every column
train.duplicated().any()
# No duplicates

In [None]:
# Distribution of the target value as relative class frequencies
train['Label'].value_counts(normalize=True)
# The dataset is unbalanced; this has to be taken into account

## Overview plots

In [None]:
def feature_pair_plot(feature, position, title, plotkind, data=None):
    """
    Creates a seaborn-pairplot for several positions of the same feature.

    feature: name of the feature-column without number and underscore
    position: list of positions of the measured fields, 13 is the central position with a possible landslide
    title: Superior title of the diagram
    plotkind: kind of plot of the comparison plots. Possible values: 'scatter', 'kde', 'hist', 'reg'
    data: DataFrame that holds the '<position>_<feature>' columns; if omitted, the
        notebook-level DataFrame `train` is used
    """
    if data is None:
        data = train
    feature_cols = [f'{i}_{feature}' for i in position]
    # pairplot is a figure-level function and creates its own figure. A preceding
    # plt.figure() call only leaves an additional empty figure behind for every plot.
    sns.pairplot(data[feature_cols], kind=plotkind)
    # The pairplot figure is the current figure at this point, so the title lands on it.
    plt.suptitle(title, y=1.01)
    plt.show()

In [None]:
# Corner, middle and adjacent to middle positions
# The 25 cells of a sample are numbered column by column:
# 1 6  11 16 21
# 2 7  12 17 22
# 3 8  13 18 23
# 4 9  14 19 24
# 5 10 15 20 25
# 1 and 25 are opposite corners, 13 is the middle cell, 17 and 9 are diagonal neighbours of 13

position = [1, 17, 13, 9, 25]

In [None]:
# Compare the elevation at the selected positions with each other
title='Digital elevation of the terrain surface in meter'
feature_pair_plot(feature='elevation', position=position, title=title, plotkind='hist')

# Perfect correlation between different locations

In [None]:
# Compare the slope at the selected positions with each other
title='Angle of the slope inclination in degree'
feature_pair_plot(feature='slope', position=position, title=title, plotkind='hist')

# Slope varies a lot in these 25 x 25 m² samples
# Positions closer together vary less

In [None]:
# Compare the aspect at the selected positions with each other
title='Exposition of the slope in degree'
feature_pair_plot(feature='aspect', position=position, title=title, plotkind='hist')

# High values in far edges due to circular behavior of the feature.
# Transformation of the feature to take this into account is needed.
# Most aspects are close to each other though

In [None]:
# Compare the planform curvature at the selected positions with each other
title='Planform curvature'
feature_pair_plot(feature='placurv', position=position, title=title, plotkind='hist')

# No correlation between far away positions
# Positions closer together are still quite similar

In [None]:
# Compare the profile curvature at the selected positions with each other
title='Profile curvature'
feature_pair_plot(feature='procurv', position=position, title=title, plotkind='hist')

# No correlation between far away positions
# Positions closer together are still quite similar

In [None]:
# Compare the length-slope factor at the selected positions with each other
title='Length-slope factor'
feature_pair_plot(feature='lsfactor', position=position, title=title, plotkind='hist')

# Some correlations throughout, but a lot more for close positions

In [None]:
# Compare the topographic wetness index at the selected positions with each other
title='Topographic wetness index'
feature_pair_plot(feature='twi', position=position, title=title, plotkind='hist')

# Highly skewed
# Logarithmic transformation should help

In [None]:
# Compare the geology class at the selected positions with each other
# (density plots instead of histograms for this feature)
title='Lithology of the surface material'
feature_pair_plot(feature='geology', position=position, title=title, plotkind='kde')

# Soil material differs in the area
# It is mostly Jurassic tuff and lava

In [None]:
# Compare the step duration orographic intensification factor at the selected positions
title='Step duration orographic intensification factor'
feature_pair_plot(feature='sdoif', position=position, title=title, plotkind='hist')

# Perfect correlation all over

## Feature Engineering

In [None]:
# log-transform of columns
def log_transform(df, column, floor=1.0e-5):
    """
    Adds a natural-log version of every column whose name contains `column`.

    The result is stored in place in a new column named '<original name>_log';
    nothing is returned.

    df: DataFrame that is modified in place
    column: substring that selects the columns to transform, e.g. 'twi'
    floor: smallest value handed to the logarithm; entries that are not greater than it
        (including NaN, for which the comparison is False) are replaced by it
    """
    # Work on a snapshot of the names and skip '_log' columns created by an earlier call,
    # so that re-running the cell does not pile up '<name>_log_log' columns.
    for col in list(df.columns):
        if column in col and not col.endswith('_log'):
            result = np.where(df[col] > floor, df[col], floor)
            df[col + '_log'] = np.log(result)

In [None]:
# Add a '_log' column for every column of train whose name contains 'twi'
log_transform(train, 'twi')

In [None]:
# Compare the log-transformed topographic wetness index at the selected positions
# NOTE(review): the title is the same as for the untransformed twi plot, and the
# observations below are worded exactly like those of the curvature plots - confirm
# that they were re-checked for the log-transformed data.
title='Topographic wetness index'
feature_pair_plot(feature='twi_log', position=position, title=title, plotkind='hist')

# No correlation between far away positions
# Positions closer together are still quite similar

In [None]:
# sine/cosine of angle-columns
def angle_transform(df, column):
    """
    Adds the sine and the cosine of every column whose name contains `column`.

    The values are treated as angles in degree (converted with np.deg2rad). The results
    are stored in place in the new columns '<original name>_sin' and
    '<original name>_cos'; nothing is returned.

    df: DataFrame that is modified in place
    column: substring that selects the columns to transform, e.g. 'aspect'
    """
    # Work on a snapshot of the names and skip the '_sin'/'_cos' columns created by an
    # earlier call, so that re-running the cell does not add '<name>_sin_sin' and friends.
    for col in list(df.columns):
        if column in col and not col.endswith(('_sin', '_cos')):
            radians = np.deg2rad(df[col])
            df[col + '_sin'] = np.sin(radians)
            df[col + '_cos'] = np.cos(radians)

In [None]:
# Add '_sin' and '_cos' columns for every column of train whose name contains 'aspect'
angle_transform(train, 'aspect')

In [None]:
# Plot the sine of the aspect at each selected position (x) against the cosine of the
# aspect at each selected position (y).
feature1 = 'aspect_sin'
feature_cols1 = [f'{i}_{feature1}' for i in position]

feature2 = 'aspect_cos'
feature_cols2 = [f'{i}_{feature2}' for i in position]

# pairplot is a figure-level function and creates its own figure. A preceding
# plt.figure() call only leaves an additional empty figure behind.
sns.pairplot(train, x_vars=feature_cols1, y_vars=feature_cols2, kind='hist')
plt.suptitle('aspect cos/sin-trans', y=1.01)
plt.show()

# Perfect correlation will show a circle: sine and cosine of the same angle lie on the unit circle

## Geology Distribution

In [None]:
# Count how often each geology class occurs at each of the 25 grid positions.
# Building the frame from a dict aligns the per-position counts on the union of all
# classes. Assigning the columns one by one to an initially empty frame would fix the
# index to the classes seen at position 1 and silently drop every other class.
geology = pd.DataFrame(
    {f'{i}_geology': train[f'{i}_geology'].value_counts() for i in range(1, 26)}
)
# A class that never occurs at a position has a count of zero, not a missing value
geology = geology.fillna(0).astype(int)
# One column per geology class, summarised over the 25 positions
geology.T.describe()

# Mostly Jurassic tuff and lava
# Rarest material is fill
# Very little variation between the materials

## Dependence of target on feature distribution

In [None]:
# One step histogram per feature at grid position 1, split by the target 'Label',
# arranged row by row on a 3 x 3 grid.
fig, axs = plt.subplots(3, 3, figsize=(13, 10))

panel_features = [
    "1_elevation", "1_slope", "1_aspect",
    "1_placurv", "1_procurv", "1_lsfactor",
    "1_twi", "1_geology", "1_sdoif",
]

# Settings that deviate from the shared defaults for individual panels
panel_overrides = {
    "1_elevation": {"legend": True},      # only the first panel carries the legend
    "1_twi": {"log_scale": True},
    "1_geology": {"multiple": "dodge"},
}

for panel, feature_name in zip(axs.flat, panel_features):
    settings = {"element": "step", "fill": False, "legend": False}
    settings.update(panel_overrides.get(feature_name, {}))
    sns.histplot(data=train, x=feature_name, hue="Label", ax=panel, **settings)

fig.tight_layout()
plt.show()

# no clear differentiation between landslides and no landslide possible for any feature

## Correlations

In [None]:
# Pairwise relationships between all nine features at grid position 1
cols = [
    f'1_{name}'
    for name in ('elevation', 'slope', 'aspect', 'placurv', 'procurv',
                 'lsfactor', 'twi', 'geology', 'sdoif')
]
sns.pairplot(train[cols])

# Cretaceous tuff and lava only in very high areas; fill and sandstone, siltstone and mudstone only in low areas
# Little correlation between most features

In [None]:
# Quantify the correlations between the position-1 features and show them as a heatmap
corr = train.loc[:, cols].corr()
plt.figure(figsize=(13, 8))
# heatmap draws on the current axes and returns them, so the title is set on the result
sns.heatmap(data=corr, annot=True, center=0, cmap='RdYlGn').set_title(
    'Correlogram', fontsize=15, color='darkgreen'
)
plt.show()

# Highest correlation between slope and lsfactor, procurv and placurv, placurv and twi