In [197]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [198]:
# Load the data
data = pd.read_csv('data.csv')

In [199]:
# Min-Max normalization for Rainfall and Population
data['Normalized Rainfall'] = (data['Average Annual Rainfall (inches)'] - data['Average Annual Rainfall (inches)'].min()) / (data['Average Annual Rainfall (inches)'].max() - data['Average Annual Rainfall (inches)'].min())

data['Normalized Population'] = (data['Population'] - data['Population'].min()) / (data['Population'].max() - data['Population'].min())

# Compute afforestation suitability score using normalized values
data['afforestation_score'] = (data['Normalized Rainfall'] * 0.3 +
                               data['Soil Suitability (0 to 1)'] * 0.4 +
                               data['Wildlife Benefit Potential (0 to 1)'] * 0.2 -
                               data['Normalized Population'] * 0.1)



In [201]:
print(data["afforestation_score"].describe())  # Before normalization

count    264.000000
mean       0.483857
std        0.156027
min        0.063621
25%        0.397851
50%        0.542267
75%        0.590207
max        0.859516
Name: afforestation_score, dtype: float64


In [202]:
# Define a reasonable raw score threshold based on domain knowledge
raw_threshold = 0.5  # Adjust this based on your data
data["good_for_afforestation"] = (data["afforestation_score"] > raw_threshold).astype(int)

In [203]:
# Select features for modeling
features = ['Average Annual Rainfall (inches)', 'Soil Suitability (0 to 1)',
           'Wildlife Benefit Potential (0 to 1)', 'Population']

In [204]:
X = data[features]
y = data['good_for_afforestation']

X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.20, random_state=42
)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.15, random_state=42
)

# Standardize the features
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

In [205]:
print(X_train[:5])  # Show first 5 rows
print(X_test[:5])   # Show first 5 rows

[[0.31411863 0.82352941 0.6875     0.00525367]
 [0.40601504 0.94117647 0.875      0.10319582]
 [0.23809524 0.64705882 0.5625     0.16263498]
 [0.34586466 0.94117647 0.8125     0.00936995]
 [0.35004177 0.88235294 0.75       0.01702871]]
[[0.05597327 0.17647059 0.25       0.00699099]
 [0.35254804 0.94117647 0.8125     0.05294993]
 [0.28738513 0.82352941 0.6875     0.01121874]
 [0.36340852 0.94117647 0.8125     0.07832109]
 [0.35421888 0.88235294 0.8125     0.0073989 ]]


In [206]:
# Set up cross-validation
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

In [207]:
# Define base XGBoost model
base_model = xgb.XGBClassifier(
    objective="binary:logistic",
    random_state=42
)

In [208]:
# Perform cross-validation
cv_scores = cross_val_score(base_model, X_train, y_train, cv=kfold, scoring='accuracy')
print("\nCross-validation scores:", cv_scores)
print(f"Mean CV accuracy: {cv_scores.mean():.4f}")
print(f"CV standard deviation: {cv_scores.std():.4f}")


Cross-validation scores: [0.97222222 0.97222222 0.91666667 1.         0.91428571]
Mean CV accuracy: 0.9551
CV standard deviation: 0.0339


In [209]:
param_grid = {
    'max_depth': [3, 4, 5],
    'learning_rate': [0.05, 0.1, 0.2],
    'n_estimators': [50, 100, 150],
    'subsample': [0.8, 0.9, 1.0]
}

In [210]:
grid_search = GridSearchCV(
    estimator=base_model,
    param_grid=param_grid,
    cv=kfold,
    scoring='accuracy',
    verbose=1
)

grid_search.fit(X_train, y_train)
print("\nBest parameters:", grid_search.best_params_)
print(f"Best cross-validation score: {grid_search.best_score_:.4f}")

Fitting 5 folds for each of 81 candidates, totalling 405 fits

Best parameters: {'learning_rate': 0.2, 'max_depth': 3, 'n_estimators': 100, 'subsample': 0.8}
Best cross-validation score: 0.9719


In [211]:
# Train the final model with the best parameters on the combined training and validation sets
best_model = grid_search.best_estimator_
X_train_val = np.vstack((X_train, X_val))
y_train_val = pd.concat([y_train, y_val])

best_model.fit(X_train_val, y_train_val)

In [212]:
# Validate on the validation set before final testing
val_predictions = best_model.predict(X_val)
print("\nValidation Set Results:")
print(f"Accuracy: {accuracy_score(y_val, val_predictions):.4f}")
print("\nValidation Classification Report:")
print(classification_report(y_val, val_predictions))


Validation Set Results:
Accuracy: 1.0000

Validation Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00        16

    accuracy                           1.00        32
   macro avg       1.00      1.00      1.00        32
weighted avg       1.00      1.00      1.00        32



In [213]:
# Final evaluation on the test set
test_predictions = best_model.predict(X_test)
print("\nTest Set Results:")
print(f"Accuracy: {accuracy_score(y_test, test_predictions):.4f}")
print("\nTest Classification Report:")
print(classification_report(y_test, test_predictions))


Test Set Results:
Accuracy: 1.0000

Test Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        13
           1       1.00      1.00      1.00        40

    accuracy                           1.00        53
   macro avg       1.00      1.00      1.00        53
weighted avg       1.00      1.00      1.00        53



In [214]:
# Function to get afforestation suitability by state
def get_afforestation_locations(state, model, features):
    """
    Input: State name
    Output: List of suitable locations for afforestation in the specified state.
    """
    state_data = data[data["State"] == state].copy()

    if state_data.empty:
        return f"No data available for {state}."

    # Prepare features for prediction
    X_state = state_data[features]

    # Predict suitability
    predictions = model.predict(X_state)
    probabilities = model.predict_proba(X_state)[:, 1]  # Probability of class 1

    # Add predictions to the state data
    state_data["Prediction"] = predictions
    state_data["Probability"] = probabilities

    # Filter for good locations (Prediction == 1)
    good_locations = state_data[state_data["Prediction"] == 1]

    if good_locations.empty:
        return f"No suitable locations found for afforestation in {state}."

    # Return only the location names
    return good_locations[["City", "Probability"]].sort_values(by="Probability", ascending=False)



In [215]:
# Basic Input-Output system
def main():
    state_input = input("Enter the state you want to check for afforestation suitability: ")
    result = get_afforestation_locations(state_input, best_model, features)

    if isinstance(result, str):  # If the result is a message (e.g., "No data available")
        print(result)
    else:
        print(f"Suitable locations for afforestation in {state_input}:")
        for index, row in result.iterrows():
            print(f"- {row['City']} (Probability: {row['Probability']:.4f})")


In [216]:


def predict_afforestation_suitability(model):
    # Create a feature array for the new location
    rainfall = float(input("Enter rainfall in inches : "))
    soil_suitability = float(input("Enter soil suitability (0 to 1) : "))
    wildlife_potential = float(input("Enter wildlife potential (0 to 1) : "))
    population = float(input("Enter population : "))
    new_location = np.array(
        [[rainfall, soil_suitability, wildlife_potential, population]]
    )

    # Make prediction
    prediction = model.predict(new_location)[0]
    probability = model.predict_proba(new_location)[0][1]

    if prediction == 1:
        suitability = "Good"
    else:
        suitability = "Not Good"

    return suitability, probability




In [217]:
if __name__ == "__main__":
    print("1. Select Location From Database")
    print("2. Predict New Location")
    print("3. Exit")

    while True:
        choice = int(input("Enter your choice: "))

        if choice == 1:
            main()  # Calls main() if user selects option 1

        elif choice == 2:
            result = predict_afforestation_suitability(best_model)  # Call the function
            print(f"Prediction Result: {result}")  # Print the returned result

        elif choice == 3:
            print("Exiting...")
            break  # Exit the loop

        else:
            print("Invalid choice! Please enter 1, 2, or 3.")


1. Select Location From Database
2. Predict New Location
3. Exit
Enter your choice: 2
Enter rainfall in inches : 35
Enter soil suitability (0 to 1) : 0.8
Enter wildlife potential (0 to 1) : 0.5
Enter population : 675758
Prediction Result: ('Good', 0.7980798)
Enter your choice: 2
Enter rainfall in inches : 40
Enter soil suitability (0 to 1) : 0.8
Enter wildlife potential (0 to 1) : 0.4
Enter population : 675867
Prediction Result: ('Good', 0.7980798)
Enter your choice: 3
Exiting...
