<a href="https://colab.research.google.com/github/PranitaAnnaldas/A-predictive-model-to-assess-the-environmental-impact-of-urban-expansion/blob/master/ml/Final_Population.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import warnings

warnings.filterwarnings("ignore")

# Load data
data = pd.read_csv('population_data.csv')

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop NaN rows due to lagging
data.dropna(inplace=True)

# OneHotEncoder for ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded_full = encoder.fit_transform(data[['wardno']])

# Store predictions
predictions_dict = {'wardno': data['wardno'].unique()}

# Iterate over each target column
for target_column in target_columns:
    print(f"\nTraining model for '{target_column}'...")

    # Prepare feature matrix
    lagged_features = [f'lag_{col}' for col in target_columns]
    growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
    feature_matrix = data[lagged_features + growth_rate_features]

    if feature_matrix.shape[0] != ward_encoded_full.shape[0]:
        raise ValueError(f"Mismatch in feature rows: {feature_matrix.shape[0]} vs. {ward_encoded_full.shape[0]}")

    # Combine features
    X = np.hstack([ward_encoded_full, data['year'].values.reshape(-1, 1) - 1991, feature_matrix.values])
    y = data[target_column].values

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # StandardScaler
    scaler = StandardScaler()

    # Define models
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Find the best model
    best_model = None
    best_r2 = -np.inf

    for model_name, model in models.items():
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    # Prepare for 2021 and 2031 predictions
    predictions_2021 = []
    predictions_2031 = []

    for ward in data['wardno'].unique():
        ward_encoded = encoder.transform([[ward]])
        year_since_census_2021 = np.array([[2021 - 1991]])
        year_since_census_2031 = np.array([[2031 - 1991]])

        # Get latest data for this ward
        recent_data = data[data['wardno'] == ward].iloc[-1]

        # Lagged and growth rate features
        lagged_features_2021 = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
        growth_rate_features_2021 = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

        # Predict for 2021
        X_2021 = np.hstack([ward_encoded, year_since_census_2021, lagged_features_2021, growth_rate_features_2021])
        population_2021 = np.round(best_model.predict(X_2021)).astype(int)

        # Predict for 2031 using 2021 as lagged feature
        lagged_features_2031 = np.array([[population_2021[0]]])
        X_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features_2031, growth_rate_features_2021])

        # Apply a population growth factor similar to the real data trend
        growth_factor = 1.5  # ~50% increase per decade (based on metropolitan trends)
        population_2031 = np.round(populations_2021 * growth_factor).astype(int)

        # Store predictions
        predictions_2021.append(population_2021[0])
        predictions_2031.append(population_2031[0])

    predictions_dict[f'predicted_2021_{target_column}'] = predictions_2021
    predictions_dict[f'predicted_2031_{target_column}'] = predictions_2031

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions_dict)

# Save predictions
predictions_df.to_csv('population_predictions_2021_2031.csv', index=False)

# Display predictions
print(predictions_df)


Training model for 'totalpop'...
Linear Regression R2 Score: -0.0885
Decision Tree R2 Score: 0.7003
Random Forest R2 Score: 0.8552
Gradient Boosting R2 Score: 0.8817
AdaBoost R2 Score: 0.7121
XGBoost R2 Score: 0.8377
LightGBM R2 Score: 0.7902
K-Nearest Neighbors R2 Score: 0.4367
Support Vector Regressor R2 Score: -0.0563
Best model for 'totalpop': GradientBoostingRegressor with R2 Score: 0.8817

Training model for 'totalmale'...
Linear Regression R2 Score: -0.0966
Decision Tree R2 Score: 0.5612
Random Forest R2 Score: 0.8709
Gradient Boosting R2 Score: 0.8782
AdaBoost R2 Score: 0.7057
XGBoost R2 Score: 0.8324
LightGBM R2 Score: 0.7941
K-Nearest Neighbors R2 Score: 0.4501
Support Vector Regressor R2 Score: -0.0632
Best model for 'totalmale': GradientBoostingRegressor with R2 Score: 0.8782

Training model for 'totalfemale'...
Linear Regression R2 Score: -0.0784
Decision Tree R2 Score: 0.6952
Random Forest R2 Score: 0.8369
Gradient Boosting R2 Score: 0.8793
AdaBoost R2 Score: 0.7317
XGBo

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import warnings

warnings.filterwarnings("ignore")

# Load data
data = pd.read_csv('population_data.csv')

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop NaN rows due to lagging
data.dropna(inplace=True)

# OneHotEncoder for ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded_full = encoder.fit_transform(data[['wardno']])

# Store predictions
predictions_dict = {'wardno': data['wardno'].unique()}

# Iterate over each target column
for target_column in target_columns:
    print(f"\nTraining model for '{target_column}'...")

    # Prepare feature matrix
    lagged_features = [f'lag_{col}' for col in target_columns]
    growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
    feature_matrix = data[lagged_features + growth_rate_features]

    if feature_matrix.shape[0] != ward_encoded_full.shape[0]:
        raise ValueError(f"Mismatch in feature rows: {feature_matrix.shape[0]} vs. {ward_encoded_full.shape[0]}")

    # Combine features
    X = np.hstack([ward_encoded_full, data['year'].values.reshape(-1, 1) - 1991, feature_matrix.values])
    y = data[target_column].values

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # StandardScaler
    scaler = StandardScaler()

    # Define models
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Find the best model
    best_model = None
    best_r2 = -np.inf

    for model_name, model in models.items():
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_test)
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    # Prepare for 2021 and 2031 predictions
    predictions_2021 = []
    predictions_2031 = []

    for ward in data['wardno'].unique():
        ward_encoded = encoder.transform([[ward]])

        # Get latest known data for this ward
        recent_data = data[data['wardno'] == ward].iloc[-1]

        # Prepare lagged and growth rate features for 2021
        lagged_features_input = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
        growth_rate_features_input = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

        # Predict for 2021
        year_since_census_2021 = np.array([[2021 - 1991]])
        X_2021 = np.hstack([ward_encoded, year_since_census_2021, lagged_features_input, growth_rate_features_input])
        population_2021 = np.round(best_model.predict(X_2021)).astype(int)

        # Now use 2021 prediction as new lagged feature for 2031
        lagged_features_2031 = lagged_features_input.copy()
        # Update only the target_column's lagged feature
        index = target_columns.index(target_column)
        lagged_features_2031[0][index] = population_2021[0]

        # Growth rates remain the same
        year_since_census_2031 = np.array([[2031 - 1991]])
        X_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features_2031, growth_rate_features_input])
        population_2031 = np.round(best_model.predict(X_2031)).astype(int)

        # Store predictions
        predictions_2021.append(population_2021[0])
        predictions_2031.append(population_2031[0])

    predictions_dict[f'predicted_2021_{target_column}'] = predictions_2021
    predictions_dict[f'predicted_2031_{target_column}'] = predictions_2031

# Convert predictions to DataFrame
predictions_df = pd.DataFrame(predictions_dict)

# Save predictions
predictions_df.to_csv('population_predictions_2021_2031.csv', index=False)

# Display predictions
print(predictions_df)



Training model for 'totalpop'...
Linear Regression R2 Score: 0.6162
Decision Tree R2 Score: 0.8859
Random Forest R2 Score: 0.9330
Gradient Boosting R2 Score: 0.9457
AdaBoost R2 Score: 0.7650
XGBoost R2 Score: 0.9256
LightGBM R2 Score: 0.9076
K-Nearest Neighbors R2 Score: 0.4425
Support Vector Regressor R2 Score: -0.0326
Best model for 'totalpop': GradientBoostingRegressor with R2 Score: 0.9457

Training model for 'totalmale'...
Linear Regression R2 Score: 0.6135
Decision Tree R2 Score: 0.8890
Random Forest R2 Score: 0.9250
Gradient Boosting R2 Score: 0.9481
AdaBoost R2 Score: 0.7806
XGBoost R2 Score: 0.9278
LightGBM R2 Score: 0.9136
K-Nearest Neighbors R2 Score: 0.4575
Support Vector Regressor R2 Score: -0.0314
Best model for 'totalmale': GradientBoostingRegressor with R2 Score: 0.9481

Training model for 'totalfemale'...
Linear Regression R2 Score: 0.6161
Decision Tree R2 Score: 0.8939
Random Forest R2 Score: 0.9246
Gradient Boosting R2 Score: 0.9465
AdaBoost R2 Score: 0.7645
XGBoost