In [18]:
from Povert_Cleaning import clean_data
import pandas as pd
import numpy as np 
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from keras.models import Model
from keras.layers import Dense,Embedding, Flatten,Concatenate,Input
from tensorflow.keras.callbacks import EarlyStopping

In [19]:
import warnings
warnings.filterwarnings('ignore')

In [20]:
# Load the dataset through the project's cleaning helper.
df = clean_data('pip_dataset.csv')

# Keep only the columns used by the model: two raw features (plus the country
# label, which is encoded later) and the regression target.
# reset_index(drop=True) renumbers rows 0..n-1 without first materialising the
# old index as a column, so it also works when the index carries a name.
Poverty_2 = df[
    [
        'country',
        'year',
        'headcount_international_povline',
        'total_shortfall_international_povline',
    ]
].reset_index(drop=True)
Poverty_2

<class 'pandas.core.frame.DataFrame'>
Index: 4411 entries, 0 to 4876
Columns: 108 entries, country to p50_p10_index
dtypes: float64(102), int64(3), object(3)
memory usage: 3.7+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 4411 entries, 0 to 4876
Columns: 108 entries, country to p50_p10_index
dtypes: float64(102), int64(3), object(3)
memory usage: 3.7+ MB


Unnamed: 0,country,year,headcount_international_povline,total_shortfall_international_povline
0,Albania,1996,29167.0,8.430020e+03
1,El Salvador,2009,456897.0,2.632790e+05
2,El Salvador,2010,392406.0,2.150010e+05
3,El Salvador,2011,331702.0,1.669509e+05
4,El Salvador,2012,289262.0,1.498491e+05
...,...,...,...,...
3638,Argentina,2007,618127.0,5.117552e+05
3639,Argentina,2006,750699.0,6.072329e+05
3640,Argentina,2005,921325.0,6.745190e+05
3641,Argentina,2003,1803369.0,1.428065e+06


In [21]:
# Map each country name to an integer id so it can feed an embedding layer.
country_encoder = LabelEncoder()

# LabelEncoder expects a 1-D array of labels; pass the column itself rather
# than a one-column DataFrame (which triggers a DataConversionWarning).
country_encoded = country_encoder.fit_transform(Poverty_2['country'])

# Reuse Poverty_2's index so the column-wise concat below aligns row-for-row
# even if that index is not a plain 0..n-1 range.
country_encoded_df = pd.DataFrame({'country': country_encoded}, index=Poverty_2.index)

# Feature frame: encoded country, year, and poverty headcount.
X = pd.concat(
    [country_encoded_df['country'], Poverty_2[['year', 'headcount_international_povline']]],
    axis=1,
)

In [22]:
# Display the assembled feature frame (encoded country, year, headcount) for inspection.
X

Unnamed: 0,country,year,headcount_international_povline
0,0,1996,29167.0
1,42,2009,456897.0
2,42,2010,392406.0
3,42,2011,331702.0
4,42,2012,289262.0
...,...,...,...
3638,3,2007,618127.0
3639,3,2006,750699.0
3640,3,2005,921325.0
3641,3,2003,1803369.0


In [23]:
# Regression target, kept as a single-column DataFrame (note the list selector).
y = Poverty_2.loc[:, ['total_shortfall_international_povline']]


In [24]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [25]:
# Number of distinct encoded country ids; used as the embedding vocabulary size.
num_countries = X['country'].unique().size

# Per-sample input shapes for the numeric features. Each is selected as a
# one-column frame, so dropping the row dimension leaves (1,).
year_shape = X[['year']].shape[1:]
headcount_shape = X[['headcount_international_povline']].shape[1:]

# The network takes one array per input, in this order.
feature_columns = ['country', 'year', 'headcount_international_povline']

train_country, train_year, train_headcount = (X_train[name] for name in feature_columns)
test_country, test_year, test_headcount = (X_test[name] for name in feature_columns)

In [26]:
def create_model(num_countries, num_years, headcount_shape,
                 embedding_dim=15, hidden_units=(128, 64, 32)):
    """Build and compile the three-input regression network.

    The country id goes through an embedding, is flattened, and is concatenated
    with the raw year and headcount inputs before a stack of ReLU dense layers
    and a single linear output unit.

    Parameters
    ----------
    num_countries : int
        Size of the embedding vocabulary (``input_dim`` of the Embedding layer).
    num_years
        Currently unused: the year input is always declared with shape ``(1,)``.
        Kept in the signature so existing positional calls keep working.
    headcount_shape : tuple
        Per-sample shape of the headcount input.
    embedding_dim : int, optional
        Width of the country embedding vector. Defaults to 15.
    hidden_units : sequence of int, optional
        Units of each hidden dense layer, in order. Defaults to ``(128, 64, 32)``.

    Returns
    -------
    keras.Model
        Model compiled with the Adam optimizer, mean-squared-error loss and an
        MAE metric. Inputs are ordered ``[country, year, headcount]``.
    """
    country_input = Input(shape=(1,), name='country')
    year_input = Input(shape=(1,), name='year')
    headcount_input = Input(shape=headcount_shape, name='headcount_international_povline')

    # Embedding output has an extra length-1 sequence axis; Flatten removes it
    # so it can be concatenated with the plain numeric inputs.
    country_embedding = Embedding(input_dim=num_countries, output_dim=embedding_dim)(country_input)
    country_embedding = Flatten()(country_embedding)

    concatenated_inputs = Concatenate()([country_embedding, year_input, headcount_input])

    # Stack the hidden dense layers.
    dense_layer = concatenated_inputs
    for units in hidden_units:
        dense_layer = Dense(units, activation='relu')(dense_layer)

    # Single linear unit: unbounded regression output.
    output = Dense(1, activation='linear')(dense_layer)

    model = Model(inputs=[country_input, year_input, headcount_input], outputs=output)
    model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])

    return model


In [27]:
# Instantiate the network from the sizes derived from the feature frame.
model = create_model(
    num_countries=num_countries,
    num_years=year_shape,
    headcount_shape=headcount_shape,
)

# Show the layer-by-layer architecture and parameter counts.
model.summary()

In [28]:
# Stop once validation loss has gone 10 epochs without improving, and roll the
# model back to the best weights seen, so the 1000-epoch cap is only an upper bound.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# `callbacks` is documented as a list of callbacks, so pass a list rather than a
# bare instance. 20% of the training rows are held out for validation.
model.fit(
    [train_country, train_year, train_headcount],
    y_train,
    epochs=1000,
    batch_size=32,
    validation_split=0.2,
    callbacks=[early_stopping],
)


Epoch 1/1000
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - loss: 2257794782724096.0000 - mae: 7899318.0000 - val_loss: 107358034329600.0000 - val_mae: 2806117.7500
Epoch 2/1000
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 180403742703616.0000 - mae: 2885893.2500 - val_loss: 86057630564352.0000 - val_mae: 2546859.2500
Epoch 3/1000
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 218568100675584.0000 - mae: 3101189.5000 - val_loss: 63526064357376.0000 - val_mae: 2093429.0000
Epoch 4/1000
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 969us/step - loss: 177231003385856.0000 - mae: 2867847.7500 - val_loss: 135968833142784.0000 - val_mae: 3091919.2500
Epoch 5/1000
[1m73/73[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - loss: 197191343800320.0000 - mae: 2914056.2500 - val_loss: 129417783279616.0000 - val_mae: 3031899.5000
Epoch 6/1000
[1m73/73[0m [32m━━━━━━━

<keras.src.callbacks.history.History at 0x7ff8a4456210>

In [29]:
# Score the held-out rows; inputs are given in country, year, headcount order.
test_inputs = [test_country, test_year, test_headcount]
predictions = model.predict(test_inputs)


[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [30]:
from sklearn.metrics import mean_squared_error, r2_score

# Held-out error metrics. RMSE is in the same units as the target; R-squared is
# the fraction of target variance explained by the predictions.
mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, predictions)

for label, value in (("RMSE", rmse), ("R-squared", r2)):
    print(f"{label}: {value}")

RMSE: 17006098.305724133
R-squared: 0.9063411951065063


# Preparing model for deployment


In [31]:
from tensorflow.keras.models import load_model
import pickle

In [32]:
# Persist the trained network to disk.
filename = 'shortfall_predictor.h5'
model.save(filename)

# Persist the fitted label encoder too: inference needs the same
# country-name -> id mapping that was used during training.
with open('country_encoder.pkl', 'wb') as encoder_file:
    pickle.dump(country_encoder, encoder_file)




In [33]:
# Reload both artifacts from disk, as a deployment process would.
model = load_model('shortfall_predictor.h5')

# NOTE: pickle.load can execute arbitrary code; only load encoder files that
# this project produced itself.
with open('country_encoder.pkl', 'rb') as encoder_file:
    country_encoder = pickle.load(encoder_file)

def predict_total_shortfall(country, year, headcount):
    """Predict the total shortfall for one (country, year, headcount) triple.

    Uses the module-level ``country_encoder`` to turn the country name into its
    integer id and the module-level ``model`` to produce the prediction.

    Parameters
    ----------
    country : str
        Country name; it must be a label the encoder was fitted on, otherwise
        the encoder's ``transform`` raises.
    year
        Year value fed to the model's year input.
    headcount
        Headcount value fed to the model's headcount input.

    Returns
    -------
    The scalar at position ``[0][0]`` of the model's prediction array.
    """
    encoded_country = country_encoder.transform([country])

    # Each model input must be a column vector of shape (n_samples, 1).
    model_inputs = [
        np.array([value]).reshape(-1, 1)
        for value in (encoded_country, year, headcount)
    ]

    return model.predict(model_inputs)[0][0]



In [34]:
# Example query. NOTE(review): 2029 may lie outside the years seen in training,
# which would make this an extrapolation — confirm against the dataset's year range.
predict_total_shortfall(country='India', year=2029, headcount=180_000_000)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step


123183016.0