Total Population

Total Population

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates for each target column
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Feature matrix for training (concatenate ward encoded, year, lagged population and growth rates)
lagged_features = [f'lag_{col}' for col in target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Target variable (for the selected target column)
target_column = 'totalpop'
print(f"\nTraining model for '{target_column}'...")
y = data[target_column].values

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize a StandardScaler for feature scaling
scaler = StandardScaler()

# Define models to evaluate
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(),
    'Random Forest': RandomForestRegressor(),
    'Gradient Boosting': GradientBoostingRegressor(),
    'AdaBoost': AdaBoostRegressor(),
    'XGBoost': XGBRegressor(),
    'LightGBM': LGBMRegressor(verbose=-1),
    'K-Nearest Neighbors': KNeighborsRegressor(),
    'Support Vector Regressor': SVR()
}

# Best model initialization
best_model_total_pop = None
best_r2 = -np.inf

# Evaluate models
for model_name, model in models.items():
    # Create a pipeline with feature scaling and the model
    pipeline = Pipeline([
        ('scaler', scaler),
        ('model', model)
    ])

    # Fit the pipeline to the training data
    pipeline.fit(X_train, y_train)

    # Predict on the test data
    y_pred = pipeline.predict(X_test)

    # Calculate R² score
    r2 = r2_score(y_test, y_pred)
    print(f"{model_name} R2 Score: {r2:.4f}")

    # Track the best model
    if r2 > best_r2:
        best_r2 = r2
        best_model_total_pop = pipeline  # Store the entire pipeline

print(f"Best model for '{target_column}': {best_model_total_pop.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

# Prepare for predictions for 2031 and 2041
# Create a DataFrame for predictions for 2031 and 2041
predictions = []

# Store 2031 predictions in the original dataset for use in 2041 predictions
data['predicted_totalpop_2031'] = np.nan  # Create a column for 2031 predictions

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare the features for predictions in 2031
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

    # Combine features for prediction for 2031
    X_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_2031 has the same number of features as training data
    n_features_expected = X_train.shape[1]  # Number of features model was trained on
    n_features_current = X_2031.shape[1]    # Current number of features for prediction

    if n_features_current < n_features_expected:
        # Pad with zeros if features are missing
        padding = np.zeros((1, n_features_expected - n_features_current))
        X_2031 = np.hstack([X_2031, padding])

    # Now predict population for this ward in 2031
    population_2031 = best_model_total_pop.predict(X_2031)

    # Store the 2031 prediction back into the original dataset
    data.loc[data['wardno'] == ward, 'predicted_totalpop_2031'] = population_2031[0]

    # Now predict for 2041 using the 2031 predicted value as an additional feature
    lagged_features_2041 = np.array([[population_2031[0]]])  # Use 2031 as lag for 2041
    X_2041 = np.hstack([ward_encoded, year_since_census_2041, lagged_features_2041, growth_rate_features])

    # Ensure X_2041 has the same number of features as training data
    n_features_current_2041 = X_2041.shape[1]    # Current number of features for 2041 prediction
    if n_features_current_2041 < n_features_expected:
        # Pad with zeros if features are missing
        padding_2041 = np.zeros((1, n_features_expected - n_features_current_2041))
        X_2041 = np.hstack([X_2041, padding_2041])

    # Predict population for 2041
    population_2041 = best_model_total_pop.predict(X_2041)

    # Ensure that the population for 2041 is greater than 2031 by adding a growth factor
    if population_2041[0] <= population_2031[0]:
        population_2041[0] = population_2031[0] * 1.05  # Enforcing a minimum 5% growth

    # Append predictions for this ward
    predictions.append({
        'wardno': ward,
        'predicted_totalpop_2031': population_2031[0],
        'predicted_totalpop_2041': population_2041[0]
    })

# Convert predictions to a DataFrame
predictions_df_total_pop = pd.DataFrame(predictions)

# Display predictions
print(predictions_df_total_pop)

# Optionally, save predictions to a CSV
predictions_df_total_pop.to_csv('population_predictions_2031_2041.csv', index=False)



Training model for 'totalpop'...
Linear Regression R2 Score: -2165962420704134477905920.0000
Decision Tree R2 Score: 0.8291
Random Forest R2 Score: 0.8058
Gradient Boosting R2 Score: 0.8643
AdaBoost R2 Score: 0.7675
XGBoost R2 Score: 0.8586
LightGBM R2 Score: 0.7965
K-Nearest Neighbors R2 Score: 0.3609
Support Vector Regressor R2 Score: -0.1325
Best model for 'totalpop': GradientBoostingRegressor with R2 Score: 0.8643
     wardno  predicted_totalpop_2031  predicted_totalpop_2041
0         1             43671.669822             45855.253313
1         2             24304.855490             25520.098265
2         3             22940.527735             24087.554122
3         4             25387.588603             26656.968033
4         5             18599.959533             19529.957509
..      ...                      ...                      ...
157     158              6416.398836              6737.218778
158     159              1608.108170              3466.013749
159     160        

Male and Female Population

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates for each target column
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for total male and total female
best_model_total_male = train_and_predict('totalmale')
best_model_total_female = train_and_predict('totalfemale')

# New dataset to store predictions
predicted_data = pd.DataFrame(columns=['wardno', 'year', 'predicted_totalmale', 'predicted_totalfemale'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare for predictions for 2031 and 2041 for males and females
predictions_2031 = []  # List to hold 2031 predictions
predictions_2041 = []  # List to hold 2041 predictions

# Prepare the features for predictions in 2031 and 2041
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

    # Combine features for prediction for 2031
    X_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_2031 has the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_2031 = X_2031.shape[1]

    if n_features_current_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_2031 = np.zeros((1, n_features_expected - n_features_current_2031))
        X_2031 = np.hstack([X_2031, padding_2031])

    # Now predict total male and total female populations for this ward in 2031
    male_population_2031 = best_model_total_male.predict(X_2031)
    female_population_2031 = best_model_total_female.predict(X_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_totalmale': male_population_2031[0],
        'predicted_totalfemale': female_population_2031[0]
    })

    # Calculate the required population for 2041 (5% increase over 2031)
    required_population_2041 = 1.05 * (male_population_2031[0] + female_population_2031[0])

    # Adjust predictions to ensure the population for 2041 is greater than for 2031
    male_population_2041 = male_population_2031[0] * 1.05  # Adjusting male population to ensure total is greater
    female_population_2041 = required_population_2041 - male_population_2041

    # Store the predictions for 2041
    predictions_2041.append({
        'wardno': ward,
        'year': 2041,
        'predicted_totalmale': male_population_2041,
        'predicted_totalfemale': female_population_2041
    })

# Append all 2031 predictions to the predicted_data DataFrame
for prediction in predictions_2031:
    predicted_data = pd.concat([predicted_data, pd.DataFrame([prediction])], ignore_index=True)

# Append all 2041 predictions to the predicted_data DataFrame
for prediction in predictions_2041:
    predicted_data = pd.concat([predicted_data, pd.DataFrame([prediction])], ignore_index=True)

# Save the new dataset with predictions
predicted_data.to_csv('population_predictions_new.csv', index=False)



Training model for 'totalmale'...
Linear Regression R2 Score: -1890825537394970311262208.0000
Decision Tree R2 Score: 0.8105
Random Forest R2 Score: 0.8075
Gradient Boosting R2 Score: 0.8624
AdaBoost R2 Score: 0.7442
XGBoost R2 Score: 0.8552
LightGBM R2 Score: 0.7920
K-Nearest Neighbors R2 Score: 0.3639
Support Vector Regressor R2 Score: -0.1442
Best model for 'totalmale': GradientBoostingRegressor with R2 Score: 0.8624

Training model for 'totalfemale'...
Linear Regression R2 Score: -2488827276684964990550016.0000
Decision Tree R2 Score: 0.8029
Random Forest R2 Score: 0.8066
Gradient Boosting R2 Score: 0.8630
AdaBoost R2 Score: 0.7649
XGBoost R2 Score: 0.8588
LightGBM R2 Score: 0.7957
K-Nearest Neighbors R2 Score: 0.3559
Support Vector Regressor R2 Score: -0.1263
Best model for 'totalfemale': GradientBoostingRegressor with R2 Score: 0.8630


  predicted_data = pd.concat([predicted_data, pd.DataFrame([prediction])], ignore_index=True)


Total Caste

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates for each target column
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for total caste
best_model_total_caste = train_and_predict('totalcaste')

# New dataset to store predictions
predicted_data_caste = pd.DataFrame(columns=['wardno', 'year', 'predicted_totalcaste'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare for predictions for 2031 and 2041 for total caste
predictions_2031 = []  # List to hold 2031 predictions
predictions_2041 = []  # List to hold 2041 predictions

# Prepare the features for predictions in 2031 and 2041
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

    # Combine features for prediction for 2031
    X_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_2031 has the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_2031 = X_2031.shape[1]

    if n_features_current_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_2031 = np.zeros((1, n_features_expected - n_features_current_2031))
        X_2031 = np.hstack([X_2031, padding_2031])

    # Now predict total caste for this ward in 2031
    total_caste_2031 = best_model_total_caste.predict(X_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_totalcaste': total_caste_2031[0]
    })

    # Calculate the required population for 2041 (5% increase over 2031)
    required_population_2041 = 1.05 * total_caste_2031[0]

    # Adjust predictions to ensure the population for 2041 is greater than for 2031
    total_caste_2041 = required_population_2041

    # Store the predictions for 2041
    predictions_2041.append({
        'wardno': ward,
        'year': 2041,
        'predicted_totalcaste': total_caste_2041
    })

# Append all 2031 predictions to the predicted_data_caste DataFrame
for prediction in predictions_2031:
    predicted_data_caste = pd.concat([predicted_data_caste, pd.DataFrame([prediction])], ignore_index=True)

# Append all 2041 predictions to the predicted_data_caste DataFrame
for prediction in predictions_2041:
    predicted_data_caste = pd.concat([predicted_data_caste, pd.DataFrame([prediction])], ignore_index=True)

# Save the new dataset with total caste predictions
predicted_data_caste.to_csv('total_caste_predictions_new.csv', index=False)



Training model for 'totalcaste'...
Linear Regression R2 Score: -2301288285584196794056704.0000
Decision Tree R2 Score: 0.8976
Random Forest R2 Score: 0.8115
Gradient Boosting R2 Score: 0.8358
AdaBoost R2 Score: 0.6319
XGBoost R2 Score: 0.8341
LightGBM R2 Score: 0.8150
K-Nearest Neighbors R2 Score: 0.2176
Support Vector Regressor R2 Score: -0.1096
Best model for 'totalcaste': DecisionTreeRegressor with R2 Score: 0.8976


  predicted_data_caste = pd.concat([predicted_data_caste, pd.DataFrame([prediction])], ignore_index=True)


Male and Female Caste

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates for each target column
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for male caste
best_model_male_caste = train_and_predict('malecaste')

# Train models for female caste
best_model_female_caste = train_and_predict('femalecaste')

# New dataset to store predictions
predicted_data_caste = pd.DataFrame(columns=['wardno', 'year', 'predicted_malecaste', 'predicted_femalecaste'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare lists to hold predictions for each year
predictions_2031 = []  # List to hold predictions for 2031
predictions_2041 = []  # List to hold predictions for 2041

# Prepare the features for predictions in 2031 and 2041 for male and female caste
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

    # Combine features for prediction for 2031
    X_male_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])
    X_female_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_male_2031 and X_female_2031 have the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_male_2031 = X_male_2031.shape[1]
    n_features_current_female_2031 = X_female_2031.shape[1]

    if n_features_current_male_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_male_2031 = np.zeros((1, n_features_expected - n_features_current_male_2031))
        X_male_2031 = np.hstack([X_male_2031, padding_male_2031])

    if n_features_current_female_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_female_2031 = np.zeros((1, n_features_expected - n_features_current_female_2031))
        X_female_2031 = np.hstack([X_female_2031, padding_female_203ian])

    # Now predict male and female caste for this ward in 2031
    male_caste_2031 = best_model_male_caste.predict(X_male_2031)
    female_caste_2031 = best_model_female_caste.predict(X_female_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_malecaste': male_caste_2031[0],
        'predicted_femalecaste': female_caste_2031[0]
    })

# After collecting all predictions for 2031, now calculate for 2041
for ward in unique_wards:
    # Calculate the required male and female caste populations for 2041 (5% increase over 2031)
    male_caste_2031 = next((pred['predicted_malecaste'] for pred in predictions_2031 if pred['wardno'] == ward), None)
    female_caste_2031 = next((pred['predicted_femalecaste'] for pred in predictions_2031 if pred['wardno'] == ward), None)

    if male_caste_2031 is not None and female_caste_2031 is not None:
        required_male_caste_2041 = 1.05 * male_caste_2031
        required_female_caste_2041 = 1.05 * female_caste_2031

        # Store the predictions for 2041
        predictions_2041.append({
            'wardno': ward,
            'year': 2041,
            'predicted_malecaste': required_male_caste_2041,
            'predicted_femalecaste': required_female_caste_2041
        })

# Combine both predictions into the final DataFrame
predicted_data_caste = pd.DataFrame(predictions_2031 + predictions_2041)

# Save the predictions to a CSV file
predicted_data_caste.to_csv('predicted_female_male_caste_population.csv', index=False)




Training model for 'malecaste'...
Linear Regression R2 Score: -2275980458400530701484032.0000
Decision Tree R2 Score: 0.9032
Random Forest R2 Score: 0.8035
Gradient Boosting R2 Score: 0.8597
AdaBoost R2 Score: 0.6354
XGBoost R2 Score: 0.8187
LightGBM R2 Score: 0.8229
K-Nearest Neighbors R2 Score: 0.2163
Support Vector Regressor R2 Score: -0.1070
Best model for 'malecaste': DecisionTreeRegressor with R2 Score: 0.9032

Training model for 'femalecaste'...
Linear Regression R2 Score: -2323141853548898139766784.0000
Decision Tree R2 Score: 0.7786
Random Forest R2 Score: 0.8101
Gradient Boosting R2 Score: 0.8334
AdaBoost R2 Score: 0.6244
XGBoost R2 Score: 0.7767
LightGBM R2 Score: 0.8190
K-Nearest Neighbors R2 Score: 0.2183
Support Vector Regressor R2 Score: -0.1093
Best model for 'femalecaste': GradientBoostingRegressor with R2 Score: 0.8334

Predictions for male and female caste populations have been saved to 'predicted_caste_population.csv'.


Total Tribes

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns
target_columns = ['totalpop', 'totalmale', 'totalfemale', 'totalcaste',
                  'malecaste', 'femalecaste', 'totaltribes', 'maletribes',
                  'femaletribes', 'totalliterates', 'maleliterates',
                  'femaleliterates', 'totalilliterates', 'maleilliterates',
                  'femaleilliterates']

# Create lagged features and growth rates for each target column
for col in target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for total tribes
best_model_total_tribes = train_and_predict('totaltribes')

# New dataset to store predictions
predicted_data_tribes = pd.DataFrame(columns=['wardno', 'year', 'predicted_totaltribes'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare for predictions for 2031 and 2041 for total tribes
predictions_2031 = []  # List to hold 2031 predictions
predictions_2041 = []  # List to hold 2041 predictions

# Prepare the features for predictions in 2031 and 2041
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in target_columns]])

    # Combine features for prediction for 2031
    X_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_2031 has the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_2031 = X_2031.shape[1]

    if n_features_current_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_2031 = np.zeros((1, n_features_expected - n_features_current_2031))
        X_2031 = np.hstack([X_2031, padding_2031])

    # Now predict total tribes for this ward in 2031
    total_tribes_2031 = best_model_total_tribes.predict(X_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_totaltribes': total_tribes_2031[0]
    })

    # Calculate the required population for 2041 (5% increase over 2031)
    required_population_2041 = 1.05 * total_tribes_2031[0]

    # Adjust predictions to ensure the population for 2041 is greater than for 2031
    total_tribes_2041 = required_population_2041

    # Store the predictions for 2041
    predictions_2041.append({
        'wardno': ward,
        'year': 2041,
        'predicted_totaltribes': total_tribes_2041
    })

# Append all 2031 predictions to the predicted_data_tribes DataFrame
for prediction in predictions_2031:
    predicted_data_tribes = pd.concat([predicted_data_tribes, pd.DataFrame([prediction])], ignore_index=True)

# Append all 2041 predictions to the predicted_data_tribes DataFrame
for prediction in predictions_2041:
    predicted_data_tribes = pd.concat([predicted_data_tribes, pd.DataFrame([prediction])], ignore_index=True)

# Save the new dataset with total tribes predictions
predicted_data_tribes.to_csv('total_tribes_predictions_new.csv', index=False)

print("Predictions for total tribes for the years 2031 and 2041 have been saved successfully.")



Training model for 'totaltribes'...
Linear Regression R2 Score: -411936981419927455924224.0000
Decision Tree R2 Score: 0.1139
Random Forest R2 Score: 0.7377
Gradient Boosting R2 Score: 0.7527
AdaBoost R2 Score: 0.5996
XGBoost R2 Score: 0.8195
LightGBM R2 Score: 0.7873
K-Nearest Neighbors R2 Score: 0.1662
Support Vector Regressor R2 Score: -0.2315
Best model for 'totaltribes': XGBRegressor with R2 Score: 0.8195
Predictions for total tribes for the years 2031 and 2041 have been saved successfully.


  predicted_data_tribes = pd.concat([predicted_data_tribes, pd.DataFrame([prediction])], ignore_index=True)


Male Female Tribes

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns for tribes
tribe_target_columns = ['totaltribes', 'maletribes', 'femaletribes']

# Create lagged features and growth rates for each target column
for col in tribe_target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in tribe_target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in tribe_target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for male tribes
best_model_male_tribes = train_and_predict('maletribes')

# Train models for female tribes
best_model_female_tribes = train_and_predict('femaletribes')

# New dataset to store predictions
predicted_data_tribes = pd.DataFrame(columns=['wardno', 'year', 'predicted_maletribes', 'predicted_femaletribes'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare lists to hold predictions for each year
predictions_2031 = []  # List to hold predictions for 2031
predictions_2041 = []  # List to hold predictions for 2041

# Prepare the features for predictions in 2031 and 2041 for male and female tribes
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in tribe_target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in tribe_target_columns]])

    # Combine features for prediction for 2031
    X_male_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])
    X_female_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_male_2031 and X_female_2031 have the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_male_2031 = X_male_2031.shape[1]
    n_features_current_female_2031 = X_female_2031.shape[1]

    if n_features_current_male_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_male_2031 = np.zeros((1, n_features_expected - n_features_current_male_2031))
        X_male_2031 = np.hstack([X_male_2031, padding_male_2031])

    if n_features_current_female_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_female_2031 = np.zeros((1, n_features_expected - n_features_current_female_2031))
        X_female_2031 = np.hstack([X_female_2031, padding_female_2031])

    # Now predict male and female tribes for this ward in 2031
    male_tribes_2031 = best_model_male_tribes.predict(X_male_2031)
    female_tribes_2031 = best_model_female_tribes.predict(X_female_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_maletribes': male_tribes_2031[0],
        'predicted_femaletribes': female_tribes_2031[0]
    })

# After collecting all predictions for 2031, now calculate for 2041
for ward in unique_wards:
    # Calculate the required male and female tribes populations for 2041 (5% increase over 2031)
    male_tribes_2031 = next((pred['predicted_maletribes'] for pred in predictions_2031 if pred['wardno'] == ward), None)
    female_tribes_2031 = next((pred['predicted_femaletribes'] for pred in predictions_2031 if pred['wardno'] == ward), None)

    if male_tribes_2031 is not None and female_tribes_2031 is not None:
        required_male_tribes_2041 = 1.05 * male_tribes_2031
        required_female_tribes_2041 = 1.05 * female_tribes_2031

        # Store the predictions for 2041
        predictions_2041.append({
            'wardno': ward,
            'year': 2041,
            'predicted_maletribes': required_male_tribes_2041,
            'predicted_femaletribes': required_female_tribes_2041
        })

# Combine both predictions into the final DataFrame
predicted_data_tribes = pd.DataFrame(predictions_2031)
predicted_data_tribes_2041 = pd.DataFrame(predictions_2041)

# Concatenate the predictions for both years
predicted_data_tribes = pd.concat([predicted_data_tribes, predicted_data_tribes_2041], ignore_index=True)

# Save predictions to a CSV file
predicted_data_tribes.to_csv('predicted_tribal_population.csv', index=False)

print(predicted_data_tribes.head())  # Display the first few predictions



Training model for 'maletribes'...
Linear Regression R2 Score: -20167867732856906681679872.0000
Decision Tree R2 Score: 0.7466
Random Forest R2 Score: 0.7331
Gradient Boosting R2 Score: 0.8048
AdaBoost R2 Score: 0.7080
XGBoost R2 Score: 0.8244
LightGBM R2 Score: 0.7539
K-Nearest Neighbors R2 Score: 0.2898
Support Vector Regressor R2 Score: -0.2332
Best model for 'maletribes': XGBRegressor with R2 Score: 0.8244

Training model for 'femaletribes'...
Linear Regression R2 Score: -44376641761009661084958720.0000
Decision Tree R2 Score: 0.8114
Random Forest R2 Score: 0.7924
Gradient Boosting R2 Score: 0.7630
AdaBoost R2 Score: 0.5278
XGBoost R2 Score: 0.8278
LightGBM R2 Score: 0.8053
K-Nearest Neighbors R2 Score: 0.3175
Support Vector Regressor R2 Score: -0.2295
Best model for 'femaletribes': XGBRegressor with R2 Score: 0.8278
   wardno  year  predicted_maletribes  predicted_femaletribes
0       1  2031            821.283875              728.394836
1       2  2031            127.467636     

Total Literates

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# Create a target column for total literates
target_column = 'totalliterates'  # Adjust this if your column name is different

# Create lagged features and growth rates for total literates
data[f'lag_{target_column}'] = data[target_column].shift(1)
data[f'growth_rate_{target_column}'] = data[target_column].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Prepare feature matrix for training
lagged_features = [f'lag_{target_column}']
growth_rate_features = [f'growth_rate_{target_column}']
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Function to train and predict for total literates
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Train the model for total literates
best_model_totalliterates = train_and_predict(target_column)

# New dataset to store predictions
predicted_data_literates = pd.DataFrame(columns=['wardno', 'year', 'predicted_totalliterates'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare lists to hold predictions for each year
predictions_2031 = []  # List to hold predictions for 2031
predictions_2041 = []  # List to hold predictions for 2041

# Prepare the features for predictions in 2031 and 2041 for total literates
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{target_column}']]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{target_column}']]])

    # Combine features for prediction for 2031
    X_totalliterates_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_totalliterates_2031 has the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_totalliterates_2031 = X_totalliterates_2031.shape[1]

    if n_features_current_totalliterates_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_totalliterates_2031 = np.zeros((1, n_features_expected - n_features_current_totalliterates_2031))
        X_totalliterates_2031 = np.hstack([X_totalliterates_2031, padding_totalliterates_2031])

    # Now predict total literates for this ward in 2031
    totalliterates_2031 = best_model_totalliterates.predict(X_totalliterates_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_totalliterates': totalliterates_2031[0]
    })

# After collecting all predictions for 2031, now calculate for 2041
for ward in unique_wards:
    # Calculate the required total literates populations for 2041 (5% increase over 2031)
    totalliterates_2031 = next((pred['predicted_totalliterates'] for pred in predictions_2031 if pred['wardno'] == ward), None)

    if totalliterates_2031 is not None:
        required_totalliterates_2041 = 1.05 * totalliterates_2031

        # Store the predictions for 2041
        predictions_2041.append({
            'wardno': ward,
            'year': 2041,
            'predicted_totalliterates': required_totalliterates_2041
        })

# Combine both predictions into the final DataFrame
predicted_data_literates = pd.DataFrame(predictions_2031)
predicted_data_literates_2041 = pd.DataFrame(predictions_2041)

# Concatenate the predictions for both years
predicted_data_literates = pd.concat([predicted_data_literates, predicted_data_literates_2041], ignore_index=True)

# Save predictions to a CSV file
predicted_data_literates.to_csv('predicted_total_literates.csv', index=False)

print(predicted_data_literates.head())  # Display the first few predictions



Training model for 'totalliterates'...
Linear Regression R2 Score: -60318507297460231715422208.0000
Decision Tree R2 Score: 0.8121
Random Forest R2 Score: 0.7952
Gradient Boosting R2 Score: 0.8273
AdaBoost R2 Score: 0.7410
XGBoost R2 Score: 0.8322
LightGBM R2 Score: 0.7872
K-Nearest Neighbors R2 Score: 0.4529
Support Vector Regressor R2 Score: -0.1371
Best model for 'totalliterates': XGBRegressor with R2 Score: 0.8322
   wardno  year  predicted_totalliterates
0       1  2031              34844.382812
1       2  2031              20769.185547
2       3  2031              17797.339844
3       4  2031              20462.048828
4       5  2031              14283.001953


Male Female Literates

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns for literates
literacy_target_columns = ['totalliterates', 'maleliterates', 'femaleliterates']

# Create lagged features and growth rates for each target column
for col in literacy_target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in literacy_target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in literacy_target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for male literates
best_model_male_literates = train_and_predict('maleliterates')

# Train models for female literates
best_model_female_literates = train_and_predict('femaleliterates')

# New dataset to store predictions
predicted_data_literates = pd.DataFrame(columns=['wardno', 'year', 'predicted_maleliterates', 'predicted_femaleliterates'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare lists to hold predictions for each year
predictions_2031 = []  # List to hold predictions for 2031
predictions_2041 = []  # List to hold predictions for 2041

# Prepare the features for predictions in 2031 and 2041 for male and female literates
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in literacy_target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in literacy_target_columns]])

    # Combine features for prediction for 2031
    X_male_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])
    X_female_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_male_2031 and X_female_2031 have the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_male_2031 = X_male_2031.shape[1]
    n_features_current_female_2031 = X_female_2031.shape[1]

    if n_features_current_male_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_male_2031 = np.zeros((1, n_features_expected - n_features_current_male_2031))
        X_male_2031 = np.hstack([X_male_2031, padding_male_2031])

    if n_features_current_female_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_female_2031 = np.zeros((1, n_features_expected - n_features_current_female_2031))
        X_female_2031 = np.hstack([X_female_2031, padding_female_2031])

    # Now predict male and female literates for this ward in 2031
    male_literates_2031 = best_model_male_literates.predict(X_male_2031)
    female_literates_2031 = best_model_female_literates.predict(X_female_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_maleliterates': male_literates_2031[0],
        'predicted_femaleliterates': female_literates_2031[0]
    })

# After collecting all predictions for 2031, now calculate for 2041
for ward in unique_wards:
    # Calculate the required male and female literates populations for 2041 (5% increase over 2031)
    male_literates_2031 = next((pred['predicted_maleliterates'] for pred in predictions_2031 if pred['wardno'] == ward), None)
    female_literates_2031 = next((pred['predicted_femaleliterates'] for pred in predictions_2031 if pred['wardno'] == ward), None)

    if male_literates_2031 is not None and female_literates_2031 is not None:
        required_male_literates_2041 = 1.05 * male_literates_2031
        required_female_literates_2041 = 1.05 * female_literates_2031

        # Store the predictions for 2041
        predictions_2041.append({
            'wardno': ward,
            'year': 2041,
            'predicted_maleliterates': required_male_literates_2041,
            'predicted_femaleliterates': required_female_literates_2041
        })

# Convert predictions lists to DataFrames
predictions_2031_df = pd.DataFrame(predictions_2031)
predictions_2041_df = pd.DataFrame(predictions_2041)

# Concatenate predictions DataFrames
final_predictions = pd.concat([predictions_2031_df, predictions_2041_df], ignore_index=True)

# Save to CSV
final_predictions.to_csv('predicted_male_female_literates_2031_2041.csv', index=False)

print("Predictions for male and female literates saved to 'predicted_literates_2031_2041.csv'.")



Training model for 'maleliterates'...
Linear Regression R2 Score: -69241624542363208847982592.0000
Decision Tree R2 Score: 0.7632
Random Forest R2 Score: 0.7935
Gradient Boosting R2 Score: 0.8281
AdaBoost R2 Score: 0.7161
XGBoost R2 Score: 0.8156
LightGBM R2 Score: 0.7729
K-Nearest Neighbors R2 Score: 0.4837
Support Vector Regressor R2 Score: -0.1462
Best model for 'maleliterates': GradientBoostingRegressor with R2 Score: 0.8281

Training model for 'femaleliterates'...
Linear Regression R2 Score: -82820569146318825338175488.0000
Decision Tree R2 Score: 0.8227
Random Forest R2 Score: 0.8029
Gradient Boosting R2 Score: 0.8341
AdaBoost R2 Score: 0.7424
XGBoost R2 Score: 0.8446
LightGBM R2 Score: 0.7892
K-Nearest Neighbors R2 Score: 0.4723
Support Vector Regressor R2 Score: -0.1253
Best model for 'femaleliterates': XGBRegressor with R2 Score: 0.8446
Predictions for male and female literates saved to 'predicted_literates_2031_2041.csv'.


Total Illiterates

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# Create a target column for total illiterates
target_column = 'totalilliterates'  # Adjust this if your column name is different

# Create lagged features and growth rates for total illiterates
data[f'lag_{target_column}'] = data[target_column].shift(1)
data[f'growth_rate_{target_column}'] = data[target_column].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Prepare feature matrix for training
lagged_features = [f'lag_{target_column}']
growth_rate_features = [f'growth_rate_{target_column}']
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Function to train and predict for total illiterates
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Train the model for total illiterates
best_model_totalillerates = train_and_predict(target_column)

# New dataset to store predictions
predicted_data_illiterates = pd.DataFrame(columns=['wardno', 'year', 'predicted_totalillerates'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare lists to hold predictions for each year
predictions_2031 = []  # List to hold predictions for 2031
predictions_2041 = []  # List to hold predictions for 2041

# Prepare the features for predictions in 2031 and 2041 for total illiterates
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{target_column}']]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{target_column}']]])

    # Combine features for prediction for 2031
    X_totalillerates_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_totalillerates_2031 has the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_totalillerates_2031 = X_totalillerates_2031.shape[1]

    if n_features_current_totalillerates_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_totalillerates_2031 = np.zeros((1, n_features_expected - n_features_current_totalillerates_2031))
        X_totalillerates_2031 = np.hstack([X_totalillerates_2031, padding_totalillerates_2031])

    # Now predict total illiterates for this ward in 2031
    totalillerates_2031 = best_model_totalillerates.predict(X_totalillerates_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_totalillerates': totalillerates_2031[0]
    })

# After collecting all predictions for 2031, now calculate for 2041
for ward in unique_wards:
    # Calculate the required total illiterates populations for 2041 (5% increase over 2031)
    totalillerates_2031 = next((pred['predicted_totalillerates'] for pred in predictions_2031 if pred['wardno'] == ward), None)

    if totalillerates_2031 is not None:
        required_totalillerates_2041 = 1.05 * totalillerates_2031

        # Store the predictions for 2041
        predictions_2041.append({
            'wardno': ward,
            'year': 2041,
            'predicted_totalillerates': required_totalillerates_2041
        })

# Combine both predictions into the final DataFrame
predicted_data_illiterates = pd.DataFrame(predictions_2031)
predicted_data_illiterates_2041 = pd.DataFrame(predictions_2041)

# Concatenate the predictions for both years
predicted_data_illiterates = pd.concat([predicted_data_illiterates, predicted_data_illiterates_2041], ignore_index=True)

# Save predictions to a CSV file
predicted_data_illiterates.to_csv('predicted_total_illiterates.csv', index=False)

print(predicted_data_illiterates.head())  # Display the first few predictions



Training model for 'totalilliterates'...
Linear Regression R2 Score: -79177981839695226014269440.0000
Decision Tree R2 Score: 0.8458
Random Forest R2 Score: 0.8199
Gradient Boosting R2 Score: 0.8745
AdaBoost R2 Score: 0.7281
XGBoost R2 Score: 0.8457
LightGBM R2 Score: 0.8098
K-Nearest Neighbors R2 Score: 0.3777
Support Vector Regressor R2 Score: -0.0939
Best model for 'totalilliterates': GradientBoostingRegressor with R2 Score: 0.8745
   wardno  year  predicted_totalillerates
0       1  2031               9079.909330
1       2  2031               5880.028647
2       3  2031               5064.622170
3       4  2031               4179.939781
4       5  2031               4266.239259


Male Female Illiterates

In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR

# Load your data
data = pd.read_csv('population_data.csv')  # Uncomment and specify your path

# List of target columns for illiterates
illiteracy_target_columns = ['totalilliterates', 'maleilliterates', 'femaleilliterates']

# Create lagged features and growth rates for each target column
for col in illiteracy_target_columns:
    data[f'lag_{col}'] = data[col].shift(1)
    data[f'growth_rate_{col}'] = data[col].pct_change()

# Replace Inf and NaN values with 0 in growth rates (to handle division by zero)
data.replace([np.inf, -np.inf], np.nan, inplace=True)
data.fillna(0, inplace=True)

# Drop rows with missing values after lagging
data.dropna(inplace=True)

# Prepare features: OneHotEncode the ward numbers
encoder = OneHotEncoder(sparse_output=False)
ward_encoded = encoder.fit_transform(data['wardno'].values.reshape(-1, 1))

# Prepare the year feature (number of years since 1991)
years_since_census = data['year'] - 1991

# Function to train and predict for a given target column
def train_and_predict(target_column):
    print(f"\nTraining model for '{target_column}'...")
    y = data[target_column].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize a StandardScaler for feature scaling
    scaler = StandardScaler()

    # Define models to evaluate
    models = {
        'Linear Regression': LinearRegression(),
        'Decision Tree': DecisionTreeRegressor(),
        'Random Forest': RandomForestRegressor(),
        'Gradient Boosting': GradientBoostingRegressor(),
        'AdaBoost': AdaBoostRegressor(),
        'XGBoost': XGBRegressor(),
        'LightGBM': LGBMRegressor(verbose=-1),
        'K-Nearest Neighbors': KNeighborsRegressor(),
        'Support Vector Regressor': SVR()
    }

    # Best model initialization
    best_model = None
    best_r2 = -np.inf

    # Evaluate models
    for model_name, model in models.items():
        # Create a pipeline with feature scaling and the model
        pipeline = Pipeline([
            ('scaler', scaler),
            ('model', model)
        ])

        # Fit the pipeline to the training data
        pipeline.fit(X_train, y_train)

        # Predict on the test data
        y_pred = pipeline.predict(X_test)

        # Calculate R² score
        r2 = r2_score(y_test, y_pred)
        print(f"{model_name} R2 Score: {r2:.4f}")

        # Track the best model
        if r2 > best_r2:
            best_r2 = r2
            best_model = pipeline  # Store the entire pipeline

    print(f"Best model for '{target_column}': {best_model.named_steps['model'].__class__.__name__} with R2 Score: {best_r2:.4f}")

    return best_model

# Prepare feature matrix for training
lagged_features = [f'lag_{col}' for col in illiteracy_target_columns]
growth_rate_features = [f'growth_rate_{col}' for col in illiteracy_target_columns]
X = np.hstack([ward_encoded, years_since_census.values.reshape(-1, 1),
                data[lagged_features + growth_rate_features].values])

# Train models for male illiterates
best_model_male_illiterates = train_and_predict('maleilliterates')

# Train models for female illiterates
best_model_female_illiterates = train_and_predict('femaleilliterates')

# New dataset to store predictions
predicted_data_illiterates = pd.DataFrame(columns=['wardno', 'year', 'predicted_maleilliterates', 'predicted_femaleilliterates'])

# List of unique ward numbers
unique_wards = data['wardno'].unique()

# Prepare lists to hold predictions for each year
predictions_2031 = []  # List to hold predictions for 2031
predictions_2041 = []  # List to hold predictions for 2041

# Prepare the features for predictions in 2031 and 2041 for male and female illiterates
for ward in unique_wards:
    # For each ward, prepare the features for 2031
    ward_encoded = encoder.transform([[ward]])  # One-hot encode the ward number
    year_since_census_2031 = np.array([[2031 - 1991]])  # Feature representing 2031
    year_since_census_2041 = np.array([[2041 - 1991]])  # Feature representing 2041

    # Use the most recent data for lagged and growth rate features
    recent_data = data[data['wardno'] == ward].iloc[-1]

    # Creating lagged features and growth rate features for the latest available data
    lagged_features = np.array([[recent_data[f'lag_{col}'] for col in illiteracy_target_columns]])
    growth_rate_features = np.array([[recent_data[f'growth_rate_{col}'] for col in illiteracy_target_columns]])

    # Combine features for prediction for 2031
    X_male_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])
    X_female_2031 = np.hstack([ward_encoded, year_since_census_2031, lagged_features, growth_rate_features])

    # Ensure X_male_2031 and X_female_2031 have the same number of features as training data
    n_features_expected = X.shape[1]  # Number of features model was trained on
    n_features_current_male_2031 = X_male_2031.shape[1]
    n_features_current_female_2031 = X_female_2031.shape[1]

    if n_features_current_male_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_male_2031 = np.zeros((1, n_features_expected - n_features_current_male_2031))
        X_male_2031 = np.hstack([X_male_2031, padding_male_2031])

    if n_features_current_female_2031 < n_features_expected:
        # Pad with zeros if features are missing
        padding_female_2031 = np.zeros((1, n_features_expected - n_features_current_female_2031))
        X_female_2031 = np.hstack([X_female_2031, padding_female_2031])

    # Now predict male and female illiterates for this ward in 2031
    male_illiterates_2031 = best_model_male_illiterates.predict(X_male_2031)
    female_illiterates_2031 = best_model_female_illiterates.predict(X_female_2031)

    # Store the predictions for 2031
    predictions_2031.append({
        'wardno': ward,
        'year': 2031,
        'predicted_maleilliterates': male_illiterates_2031[0],
        'predicted_femaleilliterates': female_illiterates_2031[0]
    })

# After collecting all predictions for 2031, now calculate for 2041
for ward in unique_wards:
    # Calculate the required male and female illiterates populations for 2041 (5% increase over 2031)
    male_illiterates_2031 = next((pred['predicted_maleilliterates'] for pred in predictions_2031 if pred['wardno'] == ward), None)
    female_illiterates_2031 = next((pred['predicted_femaleilliterates'] for pred in predictions_2031 if pred['wardno'] == ward), None)

    if male_illiterates_2031 is not None and female_illiterates_2031 is not None:
        required_male_illiterates_2041 = 1.05 * male_illiterates_2031
        required_female_illiterates_2041 = 1.05 * female_illiterates_2031

        # Store the predictions for 2041
        predictions_2041.append({
            'wardno': ward,
            'year': 2041,
            'predicted_maleilliterates': required_male_illiterates_2041,
            'predicted_femaleilliterates': required_female_illiterates_2041
        })

# Convert predictions to DataFrame
predicted_data_illiterates = pd.DataFrame(predictions_2041)

# Save the predicted data
predicted_data_illiterates.to_csv('predicted_illiterates_data.csv', index=False)



Training model for 'maleilliterates'...
Linear Regression R2 Score: -52681384278966728130560.0000
Decision Tree R2 Score: 0.8613
Random Forest R2 Score: 0.8279
Gradient Boosting R2 Score: 0.8846
AdaBoost R2 Score: 0.7650
XGBoost R2 Score: 0.8652
LightGBM R2 Score: 0.8190
K-Nearest Neighbors R2 Score: 0.3810
Support Vector Regressor R2 Score: -0.1175
Best model for 'maleilliterates': GradientBoostingRegressor with R2 Score: 0.8846

Training model for 'femaleilliterates'...
Linear Regression R2 Score: -69813635307596295169900544.0000
Decision Tree R2 Score: 0.8180
Random Forest R2 Score: 0.8191
Gradient Boosting R2 Score: 0.8729
AdaBoost R2 Score: 0.7172
XGBoost R2 Score: 0.8357
LightGBM R2 Score: 0.8060
K-Nearest Neighbors R2 Score: 0.3896
Support Vector Regressor R2 Score: -0.0861
Best model for 'femaleilliterates': GradientBoostingRegressor with R2 Score: 0.8729
