### Objective:
We will generate synthetic data and build a machine learning model to predict the probability of a landslide occurring in a given month.



### Step 1: Import Necessary Libraries 

In [5]:
# Import essential libraries
import pandas as pd  # For data handling
import numpy as np  # For generating random numbers
from sklearn.model_selection import train_test_split  # Splitting data into training/testing sets
from sklearn.preprocessing import StandardScaler  # Standardizing numerical values
from sklearn.linear_model import LogisticRegression  # ML model for classification
from sklearn.metrics import accuracy_score, classification_report  # Model evaluation


### Step 2: Generate Synthetic Data

In [6]:
# Define the range of years and months for the dataset
years = range(2020, 2025)  # From 2020 to 2024
months = range(1, 13)  # January to December

# Create an empty list to store the data
data = []

# Loop through each year and month to generate data
for year in years:
    for month in months:
        # Generate random weather values
        avg_precipitation = np.random.uniform(50, 300)  # Avg rainfall in mm
        max_temp = np.random.uniform(25, 40)  # Max temperature in °C
        min_temp = np.random.uniform(10, 25)  # Min temperature in °C
        avg_temp = (max_temp + min_temp) / 2  # Compute average temperature
        soil_moisture = np.random.uniform(20, 100)  # Soil moisture percentage
        
        # Determine landslide probability based on weather conditions
        landslide_prob = (avg_precipitation * 0.3 + soil_moisture * 0.5) / 100
        
        # If the probability is greater than a random threshold, assume a landslide occurs (1), otherwise no landslide (0)
        landslide_occurrence = 1 if landslide_prob > np.random.rand() else 0

        # Append the generated data
        data.append([year, month, avg_precipitation, max_temp, min_temp, avg_temp, soil_moisture, landslide_occurrence])

# Convert the list into a pandas DataFrame
df = pd.DataFrame(data, columns=['year', 'month', 'avg_precipitation', 'max_temp', 'min_temp', 'avg_temp', 'soil_moisture', 'landslide_occurrence'])

# Display the first few rows of the dataset
df.head()


The history saving thread hit an unexpected error (OperationalError('attempt to write a readonly database')).History will not be written to the database.


Unnamed: 0,year,month,avg_precipitation,max_temp,min_temp,avg_temp,soil_moisture,landslide_occurrence
0,2020,1,50.548829,35.712476,12.916481,24.314479,88.794509,0
1,2020,2,129.25992,39.86379,23.839942,31.851866,29.925975,1
2,2020,3,111.850408,35.010641,10.784164,22.897402,85.734189,1
3,2020,4,216.940505,27.927224,13.778148,20.852686,39.118027,1
4,2020,5,56.133334,31.669008,11.668643,21.668825,75.774917,1


📌 This block of code generates synthetic data for five years (2020-2024) with random weather conditions and assigns a landslide occurrence based on rainfall and soil moisture levels.

### Step 3: Save Dataset to CSV

In [7]:
# Save the dataset as a CSV file for future use
df.to_csv('landslide_data.csv', index=False)

# Confirm that the file has been saved
print("Dataset saved as 'landslide_data.csv'")


Dataset saved as 'landslide_data.csv'


### Step 4: Load and Prepare Data for ML

In [8]:
# Load the dataset from the CSV file
df = pd.read_csv('landslide_data.csv')

# Display dataset information
df.info()

# Display first few rows
df.head()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60 entries, 0 to 59
Data columns (total 8 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   year                  60 non-null     int64  
 1   month                 60 non-null     int64  
 2   avg_precipitation     60 non-null     float64
 3   max_temp              60 non-null     float64
 4   min_temp              60 non-null     float64
 5   avg_temp              60 non-null     float64
 6   soil_moisture         60 non-null     float64
 7   landslide_occurrence  60 non-null     int64  
dtypes: float64(5), int64(3)
memory usage: 3.9 KB


Unnamed: 0,year,month,avg_precipitation,max_temp,min_temp,avg_temp,soil_moisture,landslide_occurrence
0,2020,1,50.548829,35.712476,12.916481,24.314479,88.794509,0
1,2020,2,129.25992,39.86379,23.839942,31.851866,29.925975,1
2,2020,3,111.850408,35.010641,10.784164,22.897402,85.734189,1
3,2020,4,216.940505,27.927224,13.778148,20.852686,39.118027,1
4,2020,5,56.133334,31.669008,11.668643,21.668825,75.774917,1


### Step 5: Split Data into Features (X) and Target (y)

In [9]:
# Define the input features (X) and target variable (y)
X = df[['avg_precipitation', 'max_temp', 'min_temp', 'avg_temp', 'soil_moisture']]  # Features
y = df['landslide_occurrence']  # Target (1: Landslide, 0: No Landslide)

# Split the dataset into 80% training and 20% testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Display dataset shape
print("Training data shape:", X_train.shape)
print("Testing data shape:", X_test.shape)


Training data shape: (48, 5)
Testing data shape: (12, 5)


📌 We split the data into training and testing sets so that our model learns patterns from 80% of the data and is evaluated on the remaining 20%.

### Step 6: Standardize the Data

In [10]:
# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the training data
X_train = scaler.fit_transform(X_train)

# Transform the test data using the same scaler
X_test = scaler.transform(X_test)


📌 We standardize the numerical data so that all features have the same scale, which helps machine learning models perform better.

### Step 7: Train a Machine Learning Model

In [11]:
# Initialize and train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)


📌 We use logistic regression, a simple yet effective algorithm for binary classification.

### Step 8: Evaluate the Model

In [12]:
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

# Display a detailed classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.67

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00         3
           1       0.73      0.89      0.80         9

    accuracy                           0.67        12
   macro avg       0.36      0.44      0.40        12
weighted avg       0.55      0.67      0.60        12



### Step 9: Make Predictions for New Data

In [13]:
# Example: Predicting landslide probability for new weather data
new_data = [[180, 36, 22, 29, 80]]  # avg_precipitation, max_temp, min_temp, avg_temp, soil_moisture

# Scale the new data
new_data = scaler.transform(new_data)

# Predict landslide probability
probability = model.predict_proba(new_data)[:, 1]

# Display prediction
print(f"Landslide Probability: {probability[0] * 100:.2f}%")


Landslide Probability: 91.35%


