In [5]:
import pandas as pd

# Load the provided CSV files
gddp_data = pd.read_csv('GDDP Corrected.csv')
ntl_data = pd.read_csv('Final Harmonised NTL data (1992-2022).csv')

# Display the first few rows of each dataset to understand their structure and columns
gddp_data.head()

Unnamed: 0,year,1999,2000,2001,2002,2003,2004,2005,2006,2007,2008,2009,2010,2011,2012,2013
0,24-Parganas(N),1556252,1565724,1565724,1575196,1584668,1594140,1603612,1613084,1622556,1632028,1641500,1650972,1660444,1669916,1679388
1,24-Parganas(S),1083200,1164244,1164244,1245288,1326332,1407376,1488420,1569464,1650508,1731552,1812596,1893640,1974684,2055728,2136772
2,Adilabad,346328,374269,420940,409929,478185,454280,529286,597431,665576,679695,655734,725918,773474,788584,805135
3,Agra,454501,437274,459793,505860,542166,591591,630936,733828,836721,927423,985205,1045621,1134794,1207972,1285244
4,Ahmadabad,781700,1399650,2017600,2635550,3253500,3871450,4489400,5107350,5725300,6343250,7159630,7820030,8573180,9422900,10451060


In [7]:
ntl_data.head()

Unnamed: 0,Index,Name of District,1992,1993,1994,1995,1996,1997,1998,1999,...,2013,2014,2015,2016,2017,2018,2019,2020,2021,2022
0,1,Nicobar Islands,220,420,464,680,605,553,647,601,...,694,2976,2886,2083,3635,2965,3027,1813,1941,1829
1,2,North and Middle Andaman,446,1006,841,975,995,566,1059,1225,...,1620,5944,8289,6810,12337,14006,11524,7968,7133,8124
2,3,South Andaman,1637,2537,2512,3657,3517,2699,3381,2998,...,5099,7769,7931,7053,8809,9587,9390,7739,8354,8208
3,4,Anantapur,27229,53475,47946,72158,74760,77474,93342,102308,...,132777,167549,171366,163666,180271,189118,200801,195409,203452,211152
4,5,Chittoor,47807,70186,60455,88114,83701,79190,97608,106205,...,142092,153143,158948,146014,167317,176316,196402,193414,196893,209788


In [8]:
# Checking for common district names between GDDP and NTL data for alignment
gddp_districts = set(gddp_data['year'])
ntl_districts = set(ntl_data['Name of District'])

# Finding intersection of district names
common_districts = gddp_districts.intersection(ntl_districts)
len(common_districts), list(common_districts)[:5]  # Displaying the count and a sample of common districts

(448, ['Begusarai', 'Dindori', 'Aizawl', 'Tiruvannamalai', 'Nawada'])

In [9]:
# Filter both datasets to include only common districts
gddp_filtered = gddp_data[gddp_data['year'].isin(common_districts)]
ntl_filtered = ntl_data[ntl_data['Name of District'].isin(common_districts)]

# Renaming columns for consistency in merging
gddp_filtered = gddp_filtered.rename(columns={'year': 'District'})
ntl_filtered = ntl_filtered.rename(columns={'Name of District': 'District'})

# Dropping any unnecessary columns for easier merging and model preparation
gddp_filtered = gddp_filtered.set_index('District').sort_index()
ntl_filtered = ntl_filtered.set_index('District').drop(columns=['Index']).sort_index()

# Merging datasets on the district index
merged_data = pd.merge(gddp_filtered, ntl_filtered, left_index=True, right_index=True, suffixes=('_GDDP', '_NTL'))

# Displaying the merged data to confirm structure and alignment
merged_data.head()

Unnamed: 0_level_0,1999_GDDP,2000_GDDP,2001_GDDP,2002_GDDP,2003_GDDP,2004_GDDP,2005_GDDP,2006_GDDP,2007_GDDP,2008_GDDP,...,2013_NTL,2014,2015,2016,2017,2018,2019,2020,2021,2022
District,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Adilabad,346328,374269,420940,409929,478185,454280,529286,597431,665576,679695,...,103784,144103,151152,139406,164168,172182,179234,173477,191355,202845
Agra,454501,437274,459793,505860,542166,591591,630936,733828,836721,927423,...,53319,70542,71906,73116,89128,87488,89273,81005,83908,88192
Ahmadabad,781700,1399650,2017600,2635550,3253500,3871450,4489400,5107350,5725300,6343250,...,97965,121508,126525,124964,140608,148015,159181,158310,167028,182061
Ahmadnagar,768038,734745,761889,809818,852104,919611,1084679,1123297,1161915,1170792,...,153097,170825,186866,188351,218341,229807,261885,264640,269423,308900
Aizawl,65801,69272,69272,72743,76214,79685,83156,86627,90098,93569,...,4168,20336,20698,13848,23862,24093,24212,17973,20047,19893


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import numpy as np

# Extracting the years for model training (1999-2013 for both GDDP and NTL)
gddp_years = [f"{year}_GDDP" for year in range(1999, 2014)]
ntl_years = [f"{year}_NTL" for year in range(1999, 2014)]

# Preparing data for modeling
X = merged_data[ntl_years].values  # NTL data as features
y = merged_data[gddp_years].values  # GDDP data as target

# Splitting data into training and testing sets for model evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initializing models
linear_model = LinearRegression()
rf_model = RandomForestRegressor(random_state=42, n_estimators=100)

# Training and evaluating Linear Regression model
linear_model.fit(X_train, y_train)
y_pred_linear = linear_model.predict(X_test)
linear_rmse = np.sqrt(mean_squared_error(y_test, y_pred_linear))
linear_r2 = r2_score(y_test, y_pred_linear)

# Training and evaluating Random Forest model
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
rf_rmse = np.sqrt(mean_squared_error(y_test, y_pred_rf))
rf_r2 = r2_score(y_test, y_pred_rf)

# Displaying results
print('linear_rmse: ', linear_rmse, 'linear_r2: ', linear_r2, '\n',  'rf_rmse', rf_rmse, 'rf_r2', rf_r2)

linear_rmse:  753532.4776215011 linear_r2:  0.23672404348855408 
 rf_rmse 827522.7443812886 rf_r2 0.09924422578020972
