In [None]:
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Location of the landslide workbook. The original machine-specific path is
# kept as the default so existing runs behave exactly as before; set the
# LANDSLIDE_DATA environment variable to run the notebook on another machine.
import os

DATA_PATH = os.environ.get("LANDSLIDE_DATA", "C:/Users/Lenovo/Desktop/landslide.xlsx")
mdata = pd.read_excel(DATA_PATH)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Separate the 'S/N' column (cast to int and used as the class label `y`)
# from the remaining columns, then rescale those columns with a default
# MinMaxScaler. `x` keeps the original column names and gets a fresh
# 0..n-1 index, so row i of `x` corresponds to y[i].
target_column = "S/N"
y = mdata[target_column].values.astype(int)
x_data = mdata.drop(columns=[target_column])

scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(x_data)
x = pd.DataFrame(scaled_values, columns=x_data.columns)
x

In [None]:
import seaborn as sns

# Correlation heatmap of the feature columns, restricted to rows whose
# 'S/N' value is 1. The colour scale is clipped to [-0.5, 1].
positive_rows = mdata.loc[mdata['S/N'] == 1]
s_n_1_data = positive_rows.drop(columns=["S/N"])
corr_matrix = s_n_1_data.corr()

heatmap_options = dict(
    annot=True,          # write the coefficient in every cell
    fmt=".2f",
    cmap="coolwarm",
    vmin=-0.5,
    vmax=1,
    linewidths=0.5,
    linecolor="white",
)

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.show()

In [None]:
import seaborn as sns

# Correlation heatmap over every row of the scaled feature table `x`.
# The colour scale is clipped to [-0.5, 1].
corr_matrix = x.corr()

heatmap_options = dict(
    annot=True,          # write the coefficient in every cell
    fmt=".2f",
    cmap="coolwarm",
    vmin=-0.5,
    vmax=1,
    linewidths=0.5,
    linecolor="white",
)

plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, **heatmap_options)
plt.show()

In [None]:
from sklearn.cluster import DBSCAN

# Cluster the scaled features with DBSCAN and count the clusters it found.
# DBSCAN marks noise points with the label -1, which is not a cluster and
# is therefore excluded from the count.
dbscan = DBSCAN(eps=0.5, min_samples=5)
labels = dbscan.fit_predict(x)

cluster_ids = {label for label in labels if label != -1}
cluster_count = len(cluster_ids)
print("c_count:", cluster_count)

In [None]:
import numpy as np
from sklearn.mixture import GaussianMixture

# Fit a Gaussian mixture with one component per DBSCAN cluster, assign every
# row to a component, and compute each component's imbalance ratio
#     IR = (#rows with y == 1) / (#rows with y != 1).
cnum = cluster_count

gmm = GaussianMixture(n_components=cnum, random_state=42)
gmm.fit(x)

clusters = gmm.predict(x)

c = gmm.means_

# Per-component row counts, computed in one vectorised pass instead of a
# Python loop over every row. minlength keeps a slot for components that
# received no rows at all.
total_per_cluster = np.bincount(clusters, minlength=cnum)
minority_per_cluster = np.bincount(clusters[y == 1], minlength=cnum)

cluster_total_count = {i: int(total_per_cluster[i]) for i in range(cnum)}
cluster_minority_count = {i: int(minority_per_cluster[i]) for i in range(cnum)}

cluster_ir = {}
for cluster in range(cnum):
    minority_count = cluster_minority_count[cluster]
    majority_count = cluster_total_count[cluster] - minority_count
    if majority_count > 0:
        cluster_ir[cluster] = minority_count / majority_count
    elif minority_count > 0:
        # Only y == 1 rows in this component: the ratio is unbounded.
        cluster_ir[cluster] = float('inf')
    else:
        # Empty component. Reporting inf here would get it selected for
        # oversampling although it has no sample to oversample from.
        cluster_ir[cluster] = 0.0

for cluster, ir in cluster_ir.items():
    print(f"C {cluster + 1} IR: {ir:.4f}")


In [None]:
from config import α
# Choose the clusters to oversample: every cluster whose imbalance ratio
# exceeds the threshold α. If none does, fall back to the single cluster
# with the highest ratio (cluster 0 when every ratio is 0).
#
# The running maximum is deliberately NOT called `max`: binding that name at
# notebook level replaces the builtin max() for every later cell. If an older
# version of this cell already ran in the current kernel, restart the kernel
# to get the builtin back.
selected_clusters = [cluster for cluster, ir in cluster_ir.items() if ir > α]

if not selected_clusters:
    best_cluster, best_ir = 0, 0
    for cluster, ir in cluster_ir.items():
        if ir > best_ir:  # strict '>' keeps the first cluster on ties
            best_cluster, best_ir = cluster, ir
    selected_clusters = [best_cluster]

unselected_clusters = [cluster for cluster in range(cnum) if cluster not in selected_clusters]

print("\n IR > α:")
print(selected_clusters)

print("\n IR <= α:")
print(unselected_clusters)


In [None]:
# Row positions of all y == 1 samples that fall in clusters which were not
# selected, grouped by cluster (in `unselected_clusters` order) and ascending
# within each cluster.
unselected_minority_samples = [
    row
    for cluster in unselected_clusters
    for row in range(len(y))
    if clusters[row] == cluster and y[row] == 1
]

In [None]:
from imblearn.over_sampling import SMOTE

# Cluster-wise SMOTE oversampling.
#
# `num` synthetic y == 1 rows are needed to match the size of class 0. They
# are shared among the selected clusters in proportion to each cluster's
# number of y == 1 rows, and SMOTE is run once per cluster on
# (that cluster's y == 1 rows + all y == 0 rows).
counts = np.bincount(y, minlength=2)
num = int(counts[0] - counts[1])
if num <= 0:
    raise ValueError(
        f"Nothing to oversample: class 0 has {counts[0]} rows, class 1 has {counts[1]}."
    )

majority_samples = np.flatnonzero(y == 0)
X_majority = x.iloc[majority_samples].values
y_majority = y[majority_samples]

# Row positions of the y == 1 samples of every selected cluster, computed once.
cluster_minority_indices = {
    cluster: np.flatnonzero((clusters == cluster) & (y == 1))
    for cluster in selected_clusters
}
cluster_minority_counts = {
    cluster: len(rows) for cluster, rows in cluster_minority_indices.items()
}

# SMOTE interpolates between a sample and one of its nearest same-class
# neighbours, so a cluster needs at least two y == 1 rows to be usable.
usable_clusters = [
    cluster for cluster in selected_clusters if cluster_minority_counts[cluster] >= 2
]
skipped_clusters = [
    cluster for cluster in selected_clusters if cluster not in usable_clusters
]
if skipped_clusters:
    print("Skipped clusters with fewer than 2 minority samples:", skipped_clusters)
if not usable_clusters:
    raise ValueError("No selected cluster has at least 2 minority samples for SMOTE.")

total_minority_samples = sum(cluster_minority_counts[cluster] for cluster in usable_clusters)

# Proportional quota per cluster. Truncating with int() alone can leave the
# total short of `num`, so the rows lost to truncation are handed out one by
# one to the clusters with the largest fractional remainder.
exact_quota = {
    cluster: num * (cluster_minority_counts[cluster] / total_minority_samples)
    for cluster in usable_clusters
}
quota = {cluster: int(value) for cluster, value in exact_quota.items()}
shortfall = num - sum(quota.values())
by_remainder = sorted(
    usable_clusters,
    key=lambda cluster: exact_quota[cluster] - quota[cluster],
    reverse=True,
)
for cluster in by_remainder[:shortfall]:
    quota[cluster] += 1

SMOTE_K_NEIGHBORS = 5  # imbalanced-learn's default

new_samples = []
for cluster in usable_clusters:
    num_samples_to_generate = quota[cluster]
    if num_samples_to_generate == 0:
        continue

    minority_samples = cluster_minority_indices[cluster]
    X_minority = x.iloc[minority_samples].values
    y_minority = y[minority_samples]

    X_combined = np.vstack([X_minority, X_majority])
    y_combined = np.concatenate([y_minority, y_majority])

    smote = SMOTE(
        random_state=42,
        # The neighbour search needs k_neighbors + 1 minority rows; shrink k
        # for small clusters instead of letting fit_resample raise.
        k_neighbors=min(SMOTE_K_NEIGHBORS, len(minority_samples) - 1),
        sampling_strategy={1: num_samples_to_generate + len(minority_samples)},
    )
    X_resampled, y_resampled = smote.fit_resample(X_combined, y_combined)

    # fit_resample returns the input rows first and the synthetic rows after
    # them, so everything past len(X_combined) is newly generated.
    new_samples.append(X_resampled[len(X_combined):])

new_samples = np.vstack(new_samples)

# Preview only the first few synthetic rows instead of printing all of them.
for new_sample in new_samples[:5]:
    print(new_sample)
print(len(new_samples))

In [None]:
# Append the synthetic rows to the scaled feature table.
# The column names are taken from `x` itself rather than from a hand-written
# list: pd.concat aligns on column names, so any spelling or order mismatch
# would silently produce NaN-filled columns.
new_columns = list(x.columns)
new_samples_df = pd.DataFrame(new_samples, columns=new_columns)

X_balanced = pd.concat([x, new_samples_df], ignore_index=True)

X_balanced

In [None]:
from collections import Counter

# Labels for the synthetic rows (all class 1), appended after the original
# labels. The count comes from the rows actually generated, not from the
# requested amount `num`, so features and labels can never differ in length.
new_labels = np.ones(len(new_samples), dtype=int)
y_balanced = np.concatenate([y, new_labels])

assert len(y_balanced) == len(X_balanced), "feature/label row counts differ"

print(Counter(y))
print(Counter(y_balanced))

In [None]:
# Names under which the following cells refer to the oversampled data.
x_smo, y_smo = X_balanced, y_balanced

In [None]:
from sklearn.manifold import TSNE

# t-SNE view of the data before (left) and after (right) oversampling.
#
# t-SNE has no transform(): two separate fit_transform calls give two
# unrelated coordinate systems. All rows are therefore embedded together in a
# single fit and split by mask afterwards, so both classes and both panels
# share the same layout.
#
# Row layout of x_smo / y_smo: the first len(y) rows are the original samples
# in their original order, followed by the synthetic class-1 rows.
n_original = len(y)
row_positions = np.arange(len(y_smo))

is_non_landslide = (y_smo == 0)
is_original_landslide = (y_smo == 1) & (row_positions < n_original)
is_synthetic_landslide = (y_smo == 1) & (row_positions >= n_original)

print(int(is_original_landslide.sum()))
print(int((y_smo == 1).sum()))

tsne = TSNE(n_components=2, random_state=42, perplexity=30)
X_embedded = tsne.fit_transform(x_smo)

fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(18, 6))

for ax in (ax_before, ax_after):
    ax.scatter(X_embedded[is_non_landslide, 0], X_embedded[is_non_landslide, 1],
               c='red', label='Non-Landslide', alpha=0.6)
    ax.scatter(X_embedded[is_original_landslide, 0], X_embedded[is_original_landslide, 1],
               c='blue', label='Landslide (Original)', alpha=0.6)

ax_after.scatter(X_embedded[is_synthetic_landslide, 0], X_embedded[is_synthetic_landslide, 1],
                 c='limegreen', marker='^', label='Landslide (Synthetic)', alpha=0.6)

ax_before.set_title('Before oversampling')
ax_after.set_title('After oversampling')
ax_before.legend()
ax_after.legend()

fig.tight_layout()
plt.show()

In [None]:
from sklearn.decomposition import PCA

# PCA view of the data before (left) and after (right) oversampling.
#
# The projection is fitted once, on the original scaled features, and then
# applied to every row. Fitting a separate PCA per class and per panel would
# put each point cloud on its own pair of axes, so their relative positions
# in one plot would carry no meaning.
#
# Row layout of x_smo / y_smo: the first len(y) rows are the original samples
# in their original order, followed by the synthetic class-1 rows.
n_original = len(y)
row_positions = np.arange(len(y_smo))

is_non_landslide = (y_smo == 0)
is_original_landslide = (y_smo == 1) & (row_positions < n_original)
is_synthetic_landslide = (y_smo == 1) & (row_positions >= n_original)

pca = PCA(n_components=2, random_state=42)
pca.fit(x)
X_projected = pca.transform(x_smo)

fig, (ax_before, ax_after) = plt.subplots(1, 2, figsize=(18, 6))

ax_before.scatter(X_projected[is_non_landslide, 0], X_projected[is_non_landslide, 1],
                  c='red', label='Non-Landslide', alpha=0.6)
ax_before.scatter(X_projected[is_original_landslide, 0], X_projected[is_original_landslide, 1],
                  c='blue', label='Landslide', alpha=0.6)
ax_before.set_title('Before oversampling')
ax_before.legend()

ax_after.scatter(X_projected[is_non_landslide, 0], X_projected[is_non_landslide, 1],
                 c='red', label='Non-Landslide', alpha=0.6)
ax_after.scatter(X_projected[is_original_landslide, 0], X_projected[is_original_landslide, 1],
                 c='blue', label='Landslide(Original)', alpha=0.6)
ax_after.scatter(X_projected[is_synthetic_landslide, 0], X_projected[is_synthetic_landslide, 1],
                 c='limegreen', marker='^', label='Landslide (Synthetic)', alpha=0.6)
ax_after.set_title('After oversampling')
ax_after.legend(loc='upper right')

fig.tight_layout()
plt.show()

# Verification

In [None]:
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import cross_val_score,KFold

# Splitter for the score estimates: 10-fold CV repeated 5 times with a fixed
# seed, i.e. 50 train/test splits per metric.
kf = RepeatedKFold(n_splits=10, n_repeats=5, random_state=0)
# Splitter for cross_val_predict, which needs every sample to land in a test
# fold exactly once; a repeated splitter does not satisfy that.
kf2 = KFold(n_splits=10, shuffle=True, random_state=0)
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from statistics import mean

In [None]:
# Baseline classifiers evaluated on the oversampled data.
#
# NOTE(review): x_smo already contains the synthetic rows, and they were
# generated from the full data set before any split. Test folds therefore
# include synthetic points derived from rows that may sit in the training
# folds, so these scores are likely optimistic. Consider oversampling inside
# each training fold instead.
from sklearn.model_selection import cross_validate


def evaluate_classifier(title, estimator, X, y, cv, cv_pred):
    """Cross-validate one classifier and print its summary block.

    All four metrics are computed from a single cross_validate pass, so the
    model is fitted once per fold rather than once per fold per metric.

    Parameters
    ----------
    title : str
        Name printed in the section header.
    estimator : scikit-learn classifier supporting predict_proba.
    X, y : features and binary labels.
    cv : splitter used for accuracy / recall / F1 / ROC-AUC.
    cv_pred : splitter used for the out-of-fold probabilities behind the
        log loss; every sample must appear in exactly one test fold.

    Returns
    -------
    dict
        Per-fold score arrays under 'accuracy', 'recall', 'f1' and 'roc_auc',
        plus the scalar 'log_loss'.
    """
    metric_names = ("accuracy", "recall", "f1", "roc_auc")

    print(f"---------------{title}---------------")
    cv_results = cross_validate(estimator, X, y, cv=cv, scoring=metric_names)
    metrics = {name: cv_results[f"test_{name}"] for name in metric_names}

    proba = cross_val_predict(estimator, X, y, cv=cv_pred, method='predict_proba')
    metrics["log_loss"] = log_loss(y, proba)

    print('Log Loss: %.4f' % metrics["log_loss"])
    print('Mean Accuracy: %.4f' % metrics["accuracy"].mean())
    print('Mean Recall: %.4f' % metrics["recall"].mean())
    print('Mean F1-score: %.4f' % metrics["f1"].mean())
    print('Mean AUC: %.4f' % metrics["roc_auc"].mean())
    # A classifier's default score is accuracy, so this line repeats the mean
    # accuracy; it is kept so the printed report has the same lines as before.
    print('Mean scores: %.4f' % metrics["accuracy"].mean())
    return metrics


lr = LogisticRegression()
svm = SVC(random_state=42, probability=True)  # probability=True enables predict_proba
nb = GaussianNB()
dt = DecisionTreeClassifier(random_state=42)
rf = RandomForestClassifier(n_estimators=100, random_state=42)

baseline_results = {
    title: evaluate_classifier(title, estimator, x_smo, y_smo, kf, kf2)
    for title, estimator in [
        ("Logistic", lr),
        ("SVM", svm),
        ("NB", nb),
        ("DT", dt),
        ("RF", rf),
    ]
}

# Per-model score arrays under their previous notebook-level names.
acc_lr, auc_lr = baseline_results["Logistic"]["accuracy"], baseline_results["Logistic"]["roc_auc"]
acc_svm, auc_svm = baseline_results["SVM"]["accuracy"], baseline_results["SVM"]["roc_auc"]
acc_nb, auc_nb = baseline_results["NB"]["accuracy"], baseline_results["NB"]["roc_auc"]
acc_dt, auc_dt = baseline_results["DT"]["accuracy"], baseline_results["DT"]["roc_auc"]
acc_rf, auc_rf = baseline_results["RF"]["accuracy"], baseline_results["RF"]["roc_auc"]


In [None]:
import xgboost as xgb
from config import xgb_params

# XGBoost baseline with the parameters from config, scored with the repeated
# 10-fold splitter `kf`. All four metrics come from one cross_validate pass,
# so the model is fitted once per fold instead of once per fold per metric.
from sklearn.model_selection import cross_validate

params = xgb_params
model = xgb.XGBClassifier(**params)

cv_results = cross_validate(
    model, x_smo, y_smo, cv=kf, scoring=("accuracy", "recall", "f1", "roc_auc")
)
acc_xgb = cv_results["test_accuracy"]
recall = cv_results["test_recall"]
f1 = cv_results["test_f1"]
auc_xgb = cv_results["test_roc_auc"]

# Section header, matching the CatBoost and LightGBM reports.
print("---------------XGBoost---------------")
print(f"Mean Accuracy: {acc_xgb.mean():.4f}")
print(f"Mean Recall: {recall.mean():.4f}")
print(f"Mean F1-score: {f1.mean():.4f}")
print(f"Mean AUC: {auc_xgb.mean():.4f}")

In [None]:
from catboost import CatBoostClassifier
from config import cat_params
# CatBoost baseline with the parameters from config, scored with the repeated
# 10-fold splitter `kf`. All four metrics come from one cross_validate pass,
# so the model is fitted once per fold instead of once per fold per metric.
from sklearn.model_selection import cross_validate

params = cat_params

model = CatBoostClassifier(**params)

cv_results = cross_validate(
    model, x_smo, y_smo, cv=kf, scoring=("accuracy", "recall", "f1", "roc_auc"), verbose=0
)
acc_cat = cv_results["test_accuracy"]
recall = cv_results["test_recall"]
f1 = cv_results["test_f1"]
auc_cat = cv_results["test_roc_auc"]

print("---------------CatBoost---------------")
print(f"Mean Accuracy: {acc_cat.mean():.4f}")
print(f"Mean Recall: {recall.mean():.4f}")
print(f"Mean F1-score: {f1.mean():.4f}")
print(f"Mean AUC: {auc_cat.mean():.4f}")

In [None]:
import lightgbm as lgb
from config import lgb_params
# LightGBM baseline with the parameters from config, scored with the repeated
# 10-fold splitter `kf`. All four metrics come from one cross_validate pass,
# so the model is fitted once per fold instead of once per fold per metric.
from sklearn.model_selection import cross_validate

params = lgb_params

model = lgb.LGBMClassifier(**params)

cv_results = cross_validate(
    model, x_smo, y_smo, cv=kf, scoring=("accuracy", "recall", "f1", "roc_auc")
)
acc_lgb = cv_results["test_accuracy"]
recall = cv_results["test_recall"]
f1 = cv_results["test_f1"]
auc_lgb = cv_results["test_roc_auc"]

print("---------------LightGBM---------------")
print(f"Mean Accuracy: {acc_lgb.mean():.4f}")
print(f"Mean Recall: {recall.mean():.4f}")
print(f"Mean F1-score: {f1.mean():.4f}")
print(f"Mean AUC: {auc_lgb.mean():.4f}")

# Hyperparameter optimization

In [None]:
from sklearn.model_selection import train_test_split

# Hold-out split of the oversampled data (30 % held out, fixed seed). The
# Optuna objectives below fit on the training part and score on the held-out
# part.
X_train, X_test, y_train, y_test = train_test_split(
    x_smo,
    y_smo,
    test_size=0.3,
    random_state=420,
)

In [None]:
from config import opt_xgb_param
import optuna
from sklearn.metrics import accuracy_score, recall_score, f1_score, roc_auc_score
import xgboost as xgb


def objective(trial):
    """Optuna objective: hold-out accuracy of an XGBoost classifier.

    Builds an XGBClassifier from ``opt_xgb_param``, fits it on
    ``X_train``/``y_train`` and returns its accuracy on ``X_test``/``y_test``.

    NOTE(review): ``trial`` is not referenced in this function. Unless
    ``opt_xgb_param`` draws values from the trial in some way not visible
    here, every trial evaluates the same configuration -- confirm against
    config.
    """
    classifier = xgb.XGBClassifier(**opt_xgb_param)
    classifier.fit(X_train, y_train)
    return accuracy_score(y_test, classifier.predict(X_test))


# Run the Optuna search (400 trials, maximising the objective's accuracy),
# then re-evaluate an XGBoost model built from the best recorded parameters
# with the same cross-validation protocol as the baselines.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=400)

best_params = study.best_params
print('best_params:', best_params)
print('best_value:', study.best_value)

model = xgb.XGBClassifier(**best_params)

# One cross_val_score run per metric, in this order.
tuned_scores = {
    metric: cross_val_score(model, x_smo, y_smo, scoring=metric, cv=kf)
    for metric in ('accuracy', 'recall', 'f1', 'roc_auc')
}
acc_op_xgb = tuned_scores['accuracy']
recall = tuned_scores['recall']
f1 = tuned_scores['f1']
auc_op_xgb = tuned_scores['roc_auc']

# Log loss from out-of-fold probabilities on the single 10-fold split.
y_pred_prob = cross_val_predict(model, x_smo, y_smo, cv=kf2, method='predict_proba')
log_loss_score = log_loss(y_smo, y_pred_prob)
print(f"Log Loss: {log_loss_score:.4f}")

print("---------------Optuna-XGBoost---------------")
for report_label, fold_scores in (
    ("Accuracy", acc_op_xgb),
    ("Recall", recall),
    ("F1-score", f1),
    ("AUC", auc_op_xgb),
):
    print(f"{report_label}: {fold_scores.mean():.4f}")

In [None]:
from config import opt_cat_param
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
import optuna.visualization as vis
from sklearn.model_selection import cross_val_score

def objective(trial):
    """Optuna objective: hold-out accuracy of a CatBoost classifier.

    Builds a CatBoostClassifier from ``opt_cat_param``, fits it on
    ``X_train``/``y_train`` with ``(X_test, y_test)`` as the evaluation set
    for early stopping (100 rounds), and returns its accuracy on that same
    hold-out set.

    NOTE(review): ``trial`` is not referenced in this function. Unless
    ``opt_cat_param`` draws values from the trial in some way not visible
    here, every trial evaluates the same configuration -- confirm against
    config.
    """
    classifier = CatBoostClassifier(**opt_cat_param)
    classifier.fit(
        X_train,
        y_train,
        eval_set=(X_test, y_test),
        early_stopping_rounds=100,
        verbose=False,
    )
    return accuracy_score(y_test, classifier.predict(X_test))


# Run the Optuna search (400 trials, maximising the objective's accuracy),
# re-evaluate a CatBoost model built from the best recorded parameters with
# the same cross-validation protocol as the baselines, then draw the Optuna
# diagnostic plots.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=400)
best_params = study.best_params

model = CatBoostClassifier(**best_params)

# One cross_val_score run per metric, in this order.
tuned_scores = {
    metric: cross_val_score(model, x_smo, y_smo, scoring=metric, cv=kf)
    for metric in ('accuracy', 'recall', 'f1', 'roc_auc')
}
acc_op_cat = tuned_scores['accuracy']
recall = tuned_scores['recall']
f1 = tuned_scores['f1']
auc_op_cat = tuned_scores['roc_auc']

# Log loss from out-of-fold probabilities on the single 10-fold split.
y_pred_prob = cross_val_predict(model, x_smo, y_smo, cv=kf2, method='predict_proba')
log_loss_score = log_loss(y_smo, y_pred_prob)

print('best_params:', best_params)
print('best_value:', study.best_value)
print(f"Log Loss: {log_loss_score:.4f}")

print("---------------Optuna-CatBoost---------------")
for report_label, fold_scores in (
    ("Accuracy", acc_op_cat),
    ("Recall", recall),
    ("F1-score", f1),
    ("AUC", auc_op_cat),
):
    print(f"{report_label}: {fold_scores.mean():.4f}")

# Hyperparameter importance plot.
fig_importance = vis.plot_param_importances(study)
fig_importance.update_layout(width=800, height=500)
fig_importance.show()

# Table of the sampled values of every tuned parameter, one row per trial,
# with the 'params_' prefix stripped from the column names.
param_columns = [f'params_{name}' for name in study.best_params]
importance_data = study.trials_dataframe()[param_columns]
importance_data.columns = [column.replace('params_', '') for column in importance_data.columns]

# The first three parameters in Optuna's importance ranking.
param_importance = optuna.importance.get_param_importances(study)
top_params = list(param_importance)[:3]

# Objective value over the course of the search.
fig_history = vis.plot_optimization_history(study)
fig_history.update_layout(width=800, height=400)
fig_history.show()

# Objective value against each of the top parameters.
fig_slice = vis.plot_slice(study, params=top_params)
fig_slice.update_layout(width=1200, height=500)
fig_slice.show()

In [None]:
from config import opt_lgb_param
import optuna
import lightgbm as lgb
from sklearn.metrics import accuracy_score, log_loss
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold

def objective(trial):
    """Optuna objective: hold-out accuracy of a LightGBM booster.

    Trains with ``lgb.train`` using ``opt_lgb_param`` for up to 500 rounds,
    with early stopping (100 rounds) monitored on ``(X_test, y_test)``, and
    returns the accuracy of the thresholded predictions on that same
    hold-out set.

    NOTE(review): ``trial`` is not referenced in this function. Unless
    ``opt_lgb_param`` draws values from the trial in some way not visible
    here, every trial evaluates the same configuration -- confirm against
    config.
    """
    train_set = lgb.Dataset(X_train, label=y_train)
    valid_set = lgb.Dataset(X_test, label=y_test)

    booster = lgb.train(
        opt_lgb_param,
        train_set,
        valid_sets=[valid_set],
        num_boost_round=500,
        callbacks=[lgb.early_stopping(100, verbose=False)],
    )

    # Threshold of 0.5 -- presumably predict() returns class-1 probabilities,
    # i.e. opt_lgb_param sets a binary objective; TODO confirm.
    y_pred_binary = [int(score >= 0.5) for score in booster.predict(X_test)]
    return accuracy_score(y_test, y_pred_binary)

# Run the Optuna search (400 trials, maximising the objective's accuracy),
# then re-evaluate a LightGBM classifier built from the best recorded
# parameters with the same cross-validation protocol as the baselines.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=400, show_progress_bar=False)

best_params = study.best_params
print('best_params:', best_params)
print('best_value:', study.best_value)

model = lgb.LGBMClassifier(**best_params, verbosity=-1)

# One cross_val_score run per metric, in this order.
tuned_scores = {
    metric: cross_val_score(model, x_smo, y_smo, scoring=metric, cv=kf, n_jobs=-1)
    for metric in ('accuracy', 'recall', 'f1', 'roc_auc')
}
acc_op_lgb = tuned_scores['accuracy']
recall = tuned_scores['recall']
f1 = tuned_scores['f1']
auc_op_lgb = tuned_scores['roc_auc']

# Log loss from out-of-fold probabilities on the single 10-fold split.
y_pred_prob = cross_val_predict(model, x_smo, y_smo, cv=kf2, method='predict_proba')
log_loss_score = log_loss(y_smo, y_pred_prob)

print(f"Log Loss: {log_loss_score:.4f}")
print("---------------Optuna-LightGBM---------------")
for report_label, fold_scores in (
    ("Accuracy", acc_op_lgb),
    ("Recall", recall),
    ("F1-score", f1),
    ("AUC", auc_op_lgb),
):
    print(f"{report_label}: {fold_scores.mean():.4f}")