In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [23]:
df=pd.read_csv('./data/regenerated_landslide_risk_dataset.csv')


In [24]:
df.head()

Unnamed: 0,Temperature (°C),Humidity (%),Precipitation (mm),Soil Moisture (%),Elevation (m),Landslide Risk Prediction
0,17,68,176,64,120,Low
1,26,33,65,24,672,Low
2,16,81,56,52,36,Low
3,25,53,136,70,583,Low
4,34,77,23,37,130,Low


In [25]:
df.columns

Index(['Temperature (°C)', 'Humidity (%)', 'Precipitation (mm)',
       'Soil Moisture (%)', 'Elevation (m)', 'Landslide Risk Prediction'],
      dtype='object')

In [26]:
df['Landslide Risk Prediction'].value_counts()

Landslide Risk Prediction
Low          4591
Moderate      334
High           63
Very High      12
Name: count, dtype: int64

In [27]:
df['Landslide Risk Prediction'] = df['Landslide Risk Prediction'].apply(lambda x: 0 if x == 'Low' else 1)

In [28]:
df['Landslide Risk Prediction'].value_counts()

Landslide Risk Prediction
0    4591
1     409
Name: count, dtype: int64

In [29]:
df.to_csv("final_dataset.csv", index=False)

In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Assuming df is loaded from previous script
# Replace with your data loading if running separately

# Set up plot style
plt.style.use('ggplot')
sns.set(font_scale=1.2)

# Class distribution visualization
plt.figure(figsize=(10, 6))
sns.countplot(x='Landslide Risk Prediction', data=df)
plt.title('Class Distribution (Imbalance Visualization)')
plt.savefig('./images/class_imbalance.png')
plt.close()

# Distribution of features by class
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.flatten()

for i, feature in enumerate(df.columns[:-1]):
    sns.boxplot(x='Landslide Risk Prediction', y=feature, data=df, ax=axes[i])
    axes[i].set_title(f'{feature} by Landslide Risk')

plt.tight_layout()
plt.savefig('./images/feature_distribution_by_class.png')
plt.close()

# Correlation heatmap
plt.figure(figsize=(12, 10))
correlation = df.corr()
mask = np.triu(correlation)
sns.heatmap(correlation, annot=True, cmap='coolwarm', mask=mask, vmin=-1, vmax=1)
plt.title('Feature Correlation Matrix')
plt.tight_layout()
plt.savefig('./images/correlation_heatmap.png')
plt.close()

# Feature distributions
fig, axes = plt.subplots(2, 3, figsize=(20, 12))
axes = axes.flatten()

for i, feature in enumerate(df.columns[:-1]):
    sns.histplot(df[feature], kde=True, ax=axes[i])
    axes[i].set_title(f'Distribution of {feature}')

plt.tight_layout()
plt.savefig('./images/feature_distributions.png')
plt.close()

# Pairplot for feature relationships
plt.figure(figsize=(15, 15))
sns.pairplot(df, hue='Landslide Risk Prediction', corner=True)
plt.savefig('./images/pairplot.png')
plt.close()


<Figure size 1500x1500 with 0 Axes>

In [31]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from scipy import stats

# Assuming df is loaded from previous scripts
# Replace with your data loading if running separately

# Feature scaling for PCA
X = df.drop('Landslide Risk Prediction', axis=1)
y = df['Landslide Risk Prediction']

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# PCA for dimensionality reduction visualization
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

pca_df = pd.DataFrame({
    'PC1': X_pca[:, 0],
    'PC2': X_pca[:, 1],
    'Landslide Risk': y
})

plt.figure(figsize=(10, 8))
sns.scatterplot(x='PC1', y='PC2', hue='Landslide Risk', 
                data=pca_df, palette=['skyblue', 'salmon'], s=100)
plt.title('PCA: First Two Principal Components')
plt.savefig('pca_visualization.png')
plt.close()

# Statistical tests to compare features between classes
print("Feature differences between classes:")
for feature in X.columns:
    class0 = df[df['Landslide Risk Prediction'] == 0][feature]
    class1 = df[df['Landslide Risk Prediction'] == 1][feature]
    
    t_stat, p_val = stats.ttest_ind(class0, class1, equal_var=False)
    
    print(f"{feature}: p-value = {p_val:.4f} {'(significant)' if p_val < 0.05 else ''}")


Feature differences between classes:
Temperature (°C): p-value = 0.7000 
Humidity (%): p-value = 0.0000 (significant)
Precipitation (mm): p-value = 0.0000 (significant)
Soil Moisture (%): p-value = 0.0000 (significant)
Elevation (m): p-value = 0.0000 (significant)


In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import classification_report, confusion_matrix, roc_curve, auc, precision_recall_curve, average_precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as ImbPipeline

In [33]:
# Function to create preprocessing pipeline
def create_preprocessing_pipeline(numeric_features, categorical_features=None):
    """
    Create a scikit-learn preprocessing pipeline for numeric and categorical features
    """
    transformers = []
    
    # Numeric features pipeline
    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler())
    ])
    transformers.append(('num', numeric_transformer, numeric_features))
    
    # Categorical features pipeline (if any)
    if categorical_features:
        categorical_transformer = Pipeline(steps=[
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('onehot', OneHotEncoder(handle_unknown='ignore'))
        ])
        transformers.append(('cat', categorical_transformer, categorical_features))
    
    # Create the preprocessor
    preprocessor = ColumnTransformer(transformers=transformers)
    
    return preprocessor

In [34]:
# Function to train and evaluate models with different resampling techniques
def train_evaluate_models(X, y, preprocessor, models_dict, cv=5):
    """
    Train and evaluate different models with different resampling strategies
    """
    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    # Resampling strategies
    resampling_strategies = {
        'No Resampling': None,
        'SMOTE': SMOTE(random_state=42),
        'ADASYN': ADASYN(random_state=42),
        'Undersampling': RandomUnderSampler(random_state=42)
    }
    
    results = {}
    models = {}
    
    # For visualization
    plt.figure(figsize=(20, 15))
    
    for strategy_name, resampler in resampling_strategies.items():
        print(f"\n{'-'*50}\nTraining models with {strategy_name}\n{'-'*50}")
        
        strategy_results = {}
        
        for model_name, model in models_dict.items():
            print(f"\nTraining {model_name}...")
            
            # Create pipeline with or without resampling
            if resampler:
                pipeline = ImbPipeline([
                    ('preprocessor', preprocessor),
                    ('resampler', resampler),
                    ('classifier', model)
                ])
            else:
                pipeline = Pipeline([
                    ('preprocessor', preprocessor),
                    ('classifier', model)
                ])
            
            # Train and evaluate the model
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)
            y_pred_proba = pipeline.predict_proba(X_test)[:, 1]
            
            # Calculate metrics
            f1 = f1_score(y_test, y_pred)
            
            # Calculate ROC curve and AUC
            fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
            roc_auc = auc(fpr, tpr)
            
            # Calculate Precision-Recall curve and AUC
            precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
            pr_auc = average_precision_score(y_test, y_pred_proba)
            
            # Store results
            strategy_results[model_name] = {
                'pipeline': pipeline,
                'f1_score': f1,
                'roc_auc': roc_auc,
                'pr_auc': pr_auc,
                'fpr': fpr,
                'tpr': tpr,
                'precision': precision,
                'recall': recall,
                'y_pred': y_pred,
                'y_pred_proba': y_pred_proba
            }
            
            print(f"F1 Score: {f1:.4f}")
            print(f"ROC AUC: {roc_auc:.4f}")
            print(f"PR AUC: {pr_auc:.4f}")
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred))
            
            # Plot confusion matrix
            plt.subplot(4, len(models_dict), len(models_dict) * list(resampling_strategies.keys()).index(strategy_name) + list(models_dict.keys()).index(model_name) + 1)
            conf_matrix = confusion_matrix(y_test, y_pred)
            sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', cbar=False)
            plt.title(f"{strategy_name} - {model_name}\nConfusion Matrix")
            plt.ylabel('True Label')
            plt.xlabel('Predicted Label')
            
            # Save the model
            models[f"{strategy_name}_{model_name}"] = pipeline
        
        results[strategy_name] = strategy_results
    
    plt.tight_layout()
    plt.savefig('confusion_matrices.png')
    plt.close()
    
    # Plot ROC curves
    plt.figure(figsize=(15, 10))
    for i, (strategy_name, strategy_results) in enumerate(results.items()):
        plt.subplot(2, 2, i+1)
        for model_name, metrics in strategy_results.items():
            plt.plot(metrics['fpr'], metrics['tpr'], label=f"{model_name} (AUC = {metrics['roc_auc']:.2f})")
        
        plt.plot([0, 1], [0, 1], 'k--')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title(f'ROC Curve - {strategy_name}')
        plt.legend(loc='lower right')
    
    plt.tight_layout()
    plt.savefig('roc_curves.png')
    plt.close()
    
    # Plot Precision-Recall curves
    plt.figure(figsize=(15, 10))
    for i, (strategy_name, strategy_results) in enumerate(results.items()):
        plt.subplot(2, 2, i+1)
        for model_name, metrics in strategy_results.items():
            plt.plot(metrics['recall'], metrics['precision'], label=f"{model_name} (AUC = {metrics['pr_auc']:.2f})")
        
        plt.xlabel('Recall')
        plt.ylabel('Precision')
        plt.title(f'Precision-Recall Curve - {strategy_name}')
        plt.legend(loc='lower left')
    
    plt.tight_layout()
    plt.savefig('pr_curves.png')
    plt.close()
    
    # Plot feature importance for tree-based models
    for strategy_name, strategy_results in results.items():
        for model_name, metrics in strategy_results.items():
            if model_name in ['Random Forest', 'Gradient Boosting', 'XGBoost']:
                pipeline = metrics['pipeline']
                
                # Get feature names after preprocessing
                if hasattr(pipeline['preprocessor'], 'get_feature_names_out'):
                    feature_names = pipeline['preprocessor'].get_feature_names_out()
                else:
                    feature_names = [f'feature_{i}' for i in range(pipeline['classifier'].feature_importances_.shape[0])]
                
                # Get feature importances
                if model_name == 'XGBoost':
                    importances = pipeline['classifier'].feature_importances_
                else:
                    importances = pipeline['classifier'].feature_importances_
                
                # Plot
                plt.figure(figsize=(10, 6))
                indices = np.argsort(importances)[::-1]
                plt.title(f'Feature Importances - {strategy_name} - {model_name}')
                plt.barh(range(min(10, len(indices))), importances[indices][:10], align='center')
                plt.yticks(range(min(10, len(indices))), [feature_names[i] for i in indices][:10])
                plt.xlabel('Relative Importance')
                plt.tight_layout()
                plt.savefig(f'feature_importance_{strategy_name}_{model_name}.png')
                plt.close()
    
    return results, models, X_test, y_test


In [36]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 6 columns):
 #   Column                     Non-Null Count  Dtype
---  ------                     --------------  -----
 0   Temperature (°C)           5000 non-null   int64
 1   Humidity (%)               5000 non-null   int64
 2   Precipitation (mm)         5000 non-null   int64
 3   Soil Moisture (%)          5000 non-null   int64
 4   Elevation (m)              5000 non-null   int64
 5   Landslide Risk Prediction  5000 non-null   int64
dtypes: int64(6)
memory usage: 234.5 KB


In [40]:
# Main execution
if __name__ == "__main__":
    # Load the data (replace with your file path)
    
    # Define features and target
    X = df.drop('Landslide Risk Prediction', axis=1)  # Assuming 'Landslide' is the target column
    y = df['Landslide Risk Prediction']
    
    # Check class imbalance
    print("\nClass distribution:")
    print(y.value_counts())
    print(f"Class imbalance ratio: 1:{y.value_counts()[0]/y.value_counts()[1]:.2f}")
    
    # Define feature types
    numeric_features = X.select_dtypes(include=['int64', 'float64']).columns.tolist()
    categorical_features = X.select_dtypes(include=['object', 'category']).columns.tolist()
    
    # Create preprocessing pipeline
    preprocessor = create_preprocessing_pipeline(numeric_features, categorical_features)
    
    # Define models to try
    models_dict = {
        'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=42),
        'Random Forest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
        'Gradient Boosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
        'SVM': SVC(probability=True, class_weight='balanced', random_state=42),
        'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
    }
    
    # Train and evaluate models
    results, models, X_test, y_test = train_evaluate_models(X, y, preprocessor, models_dict)
    
    # Find best model based on F1 score
    best_f1 = 0
    best_model_name = ""
    best_strategy = ""
    
    for strategy_name, strategy_results in results.items():
        for model_name, metrics in strategy_results.items():
            if metrics['f1_score'] > best_f1:
                best_f1 = metrics['f1_score']
                best_model_name = model_name
                best_strategy = strategy_name
    
    print(f"\nBest model: {best_model_name} with {best_strategy}, F1 Score: {best_f1:.4f}")
      # Save the best model
    import joblib
    best_model = models[f"{best_strategy}_{best_model_name}"]
    joblib.dump(best_model, 'best_landslide_model.pkl')
    print("Best model saved as 'best_landslide_model.pkl'")


Class distribution:
Landslide Risk Prediction
0    4591
1     409
Name: count, dtype: int64
Class imbalance ratio: 1:11.22

--------------------------------------------------
Training models with No Resampling
--------------------------------------------------

Training Logistic Regression...
F1 Score: 0.5776
ROC AUC: 0.9713
PR AUC: 0.7446

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.87      0.93       918
           1       0.41      0.98      0.58        82

    accuracy                           0.88      1000
   macro avg       0.70      0.93      0.75      1000
weighted avg       0.95      0.88      0.90      1000


Training Random Forest...
F1 Score: 0.9939
ROC AUC: 0.9999
PR AUC: 0.9984

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       1.00      0.99      0.99        82

    accuracy                           1.00

Parameters: { "use_label_encoder" } are not used.



F1 Score: 0.9939
ROC AUC: 1.0000
PR AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       1.00      0.99      0.99        82

    accuracy                           1.00      1000
   macro avg       1.00      0.99      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Training Gradient Boosting...
F1 Score: 1.0000
ROC AUC: 1.0000
PR AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       1.00      1.00      1.00        82

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Training SVM...
F1 Score: 0.8482
ROC AUC: 0.9963
PR AUC: 0.9606

Classification Report:
              precision    recall  f1-score   support

           0       1.00    

Parameters: { "use_label_encoder" } are not used.



F1 Score: 0.9939
ROC AUC: 1.0000
PR AUC: 0.9999

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       1.00      0.99      0.99        82

    accuracy                           1.00      1000
   macro avg       1.00      0.99      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Training Gradient Boosting...
F1 Score: 1.0000
ROC AUC: 1.0000
PR AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       1.00      1.00      1.00        82

    accuracy                           1.00      1000
   macro avg       1.00      1.00      1.00      1000
weighted avg       1.00      1.00      1.00      1000


Training SVM...
F1 Score: 0.8283
ROC AUC: 0.9959
PR AUC: 0.9573

Classification Report:
              precision    recall  f1-score   support

           0       1.00    

Parameters: { "use_label_encoder" } are not used.




Training Random Forest...
F1 Score: 0.9820
ROC AUC: 1.0000
PR AUC: 1.0000

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       918
           1       0.96      1.00      0.98        82

    accuracy                           1.00      1000
   macro avg       0.98      1.00      0.99      1000
weighted avg       1.00      1.00      1.00      1000


Training Gradient Boosting...
F1 Score: 0.9704
ROC AUC: 1.0000
PR AUC: 0.9996

Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      1.00       918
           1       0.94      1.00      0.97        82

    accuracy                           0.99      1000
   macro avg       0.97      1.00      0.98      1000
weighted avg       1.00      0.99      1.00      1000


Training SVM...
F1 Score: 0.6949
ROC AUC: 0.9937
PR AUC: 0.9352

Classification Report:
              precision    recall  f1-score   support



Parameters: { "use_label_encoder" } are not used.




Best model: Gradient Boosting with SMOTE, F1 Score: 1.0000
Best model saved as 'best_landslide_model.pkl'
