In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [2]:
Countries = ['Afganistan','Bangladesh','Bhutan','China','India','Indonesia','Iran','Iraq','Israel','Japan','Malaysia','Mongolia','Myanmar','Nepal','North Korea','Pakistan','Singapore','South Korea','Sri Lanka','Syria','Thailand','Vietnam','Yemen']

# Load all the DataFrames into a dictionary
dataframes = {}
for i, file_path in enumerate([
    r'C:\Users\81684\Documents\AI&ML\hunger index\Stunting.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\coefficient-of-variation-in-caloric-consumption-vs-gdp-per-capita.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\share-of-children-underweight.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\undernourished.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\child-mortality-vs-wasting.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\malnutrition-death-rates.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\CV_per_capita_caloric_intake.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\malnutrition-death-rate-vs-gdp-per-capita.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\wasted.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\min_req_calories.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\Food_insecurity.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\Undernourished_vs_gdp.csv',
    r'C:\Users\81684\Documents\AI&ML\hunger index\Stunting_vs_GPD.csv'
]):
    df = pd.read_csv(file_path)
    df = df[(df['Year'] >= 2010) & (df['Year'] <= 2022)].sort_values(by='Year')
    df = df[df['Country'].isin(Countries)]
    dataframes[f'df{i+1}'] = df

# Merge all DataFrames
final_df = pd.DataFrame({'Country': [], 'Year': []})
for key, df in dataframes.items():
    if 'Country' in df.columns:
        df.columns = [f"{col}_{key}" if col not in ['Country', 'Year'] else col for col in df.columns]
        final_df = pd.merge(final_df, df, on=['Country', 'Year'], how='outer')

# Fill missing values with 0
final_df.fillna(pd.NA, inplace=True)

In [3]:
final_df.columns

Index(['Country', 'Code_df1', 'Year', 'Stunted_df1', 'Code_df2',
       'CV_of_Caloric_intake_df2', 'GDP_per_capita_df2', 'Population _df2',
       'Code_df3', 'Prevalence_of_underweight_df3', 'Code_df4',
       'Undernourished_People _df4', 'Code_df5', 'Mortality_Rate_df5',
       'Prevalence of wasting_df5', 'Code_df6', 'Death_Rate_df6', 'Code_df7',
       'CV_caloric_intake_df7', 'Code_df8', 'Death_Rate_df8',
       'GDP per capita_df8', 'Population_df8', 'Code_df9',
       'Number of wasted children_df9', 'Code_df10',
       'Minimum dietary energy requirement  (kcal/cap/day)_df10', 'Code_df11',
       'Number of moderately or severely food insecure people (million)_df11',
       'Code_df12', ' Prevalence of undernourishment_df12',
       'GDP per capita_df12', 'Code_df13', 'Stunted_df13',
       'GDP per capita_df13', 'Population _df13'],
      dtype='object')

In [4]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 36 columns):
 #   Column                                                                Non-Null Count  Dtype  
---  ------                                                                --------------  -----  
 0   Country                                                               286 non-null    object 
 1   Code_df1                                                              57 non-null     object 
 2   Year                                                                  286 non-null    int64  
 3   Stunted_df1                                                           57 non-null     float64
 4   Code_df2                                                              264 non-null    object 
 5   CV_of_Caloric_intake_df2                                              220 non-null    float64
 6   GDP_per_capita_df2                                                    228 non-null    float64
 7  

In [5]:
final_df.shape

(286, 36)

In [6]:
final_df.head()

Unnamed: 0,Country,Code_df1,Year,Stunted_df1,Code_df2,CV_of_Caloric_intake_df2,GDP_per_capita_df2,Population _df2,Code_df3,Prevalence_of_underweight_df3,...,Minimum dietary energy requirement (kcal/cap/day)_df10,Code_df11,Number of moderately or severely food insecure people (million)_df11,Code_df12,Prevalence of undernourishment_df12,GDP per capita_df12,Code_df13,Stunted_df13,GDP per capita_df13,Population _df13
0,China,CHN,2010,9.4,CHN,0.2,8884.588,1348191000.0,CHN,3.4,...,1888,,,CHN,2.5,8884.588,CHN,9.4,8884.588,1348191000.0
1,Iran,IRN,2010,6.8,IRN,0.27,15099.46,75373860.0,IRN,4.1,...,1861,,,IRN,7.1,15099.46,IRN,6.8,15099.46,75373860.0
2,Mongolia,MNG,2010,15.3,MNG,0.25,7518.831,2702527.0,MNG,3.3,...,1823,,,MNG,16.0,7518.831,MNG,15.3,7518.831,2702527.0
3,Vietnam,VNM,2010,22.7,VNM,0.28,6324.526,87411020.0,VNM,11.7,...,1782,,,VNM,9.9,6324.526,VNM,22.7,6324.526,87411020.0
4,Japan,JPN,2010,7.1,JPN,0.17,38069.957,128105400.0,JPN,3.4,...,1860,,,JPN,2.8,38069.957,JPN,7.1,38069.957,128105400.0


In [7]:
final_df = final_df.filter(items = ['Country','Year','Stunted_df1','CV_of_Caloric_intake_df2','GDP_per_capita_df2','Population _df2','Prevalence_of_underweight_df3','Undernourished_People _df4','Mortality_Rate_df5','Prevalence of wasting_df5','Death_Rate_df6','Number of wasted children_df9','Minimum dietary energy requirement (kcal/cap/day)_df10','Number of moderately or severely food insecure people (million)_df11','Prevalence of undernourishment_df12'])

In [8]:
new_columns = {'Stunted_df1': 'Stunted', 'CV_of_Caloric_intake_df2': 'Caloric_intake', 'GDP_per_capita_df2': 'GDP_per_capita', 'Population _df2': 'Population', 'Prevalence_of_underweight_df3': 'Underweight', 'Undernourished_People _df4': 'Undernourished_People','Mortality_Rate_df5': 'Mortality_Rate', 'Prevalence of wasting_df5': 'Prevalence of wasting','Death_Rate_df6': 'Death_Rate','Number of wasted children_df9': 'Number_of_wasted_children', 'Minimum dietary energy requirement (kcal/cap/day)_df10': 'Minimum_dietary_energy_requirement','Number of moderately or severely food insecure people (million)_df11': 'Number_of_food_insecure_people', 'Prevalence of undernourishment_df12': 'Undernourishment'}
final_df = final_df.rename(columns=new_columns)

In [9]:
final_df.head()

Unnamed: 0,Country,Year,Stunted,Caloric_intake,GDP_per_capita,Population,Underweight,Undernourished_People,Mortality_Rate,Prevalence of wasting,Death_Rate,Number_of_wasted_children,Number_of_food_insecure_people
0,China,2010,9.4,0.2,8884.588,1348191000.0,3.4,,1.575519,2.3,1.07,1974271.0,
1,Iran,2010,6.8,0.27,15099.46,75373860.0,4.1,5300000.0,1.931051,4.0,0.47,243797.0,
2,Mongolia,2010,15.3,0.25,7518.831,2702527.0,3.3,400000.0,2.628552,1.6,0.02,4637.0,
3,Vietnam,2010,22.7,0.28,6324.526,87411020.0,11.7,8700000.0,2.304898,4.1,0.23,292305.0,
4,Japan,2010,7.1,0.17,38069.957,128105400.0,3.4,3600000.0,0.319937,2.3,0.37,124400.0,


In [10]:
whole_number_columns = [
    'Population', 'Undernourished_People', 'Number_of_wasted_children', 'Number_of_food_insecure_people'
]
ratio_columns = [col for col in final_df.columns if col not in whole_number_columns + ['Country', 'Year']]

# Helper function to fill NaN values with country mean
def fill_na_with_country_mean(df, column):
    df[column] = df.groupby('Country')[column].transform(lambda x: x.fillna(x.mean()))
    return df

# Helper function to fill NaN values with country max
def fill_na_with_country_max(df, column):
    df[column] = df.groupby('Country')[column].transform(lambda x: x.fillna(x.max()))
    return df

# Apply mean filling for ratio columns
for col in ratio_columns:
    final_df = fill_na_with_country_mean(final_df, col)
    final_df[col].fillna(final_df[col].mean(), inplace=True)

# Apply max filling for whole number columns
for col in whole_number_columns:
    final_df = fill_na_with_country_max(final_df, col)
    final_df[col].fillna(final_df[col].max(), inplace=True)

final_df.head()


Unnamed: 0,Country,Year,Stunted,Caloric_intake,GDP_per_capita,Population,Underweight,Undernourished_People,Mortality_Rate,Prevalence of wasting,Death_Rate,Number_of_wasted_children,Number_of_food_insecure_people
0,China,2010,9.4,0.2,8884.588,1348191000.0,3.4,233900000.0,1.575519,2.3,1.07,1974271.0,97900000.0
1,Iran,2010,6.8,0.27,15099.46,75373860.0,4.1,5300000.0,1.931051,4.0,0.47,243797.0,39200000.0
2,Mongolia,2010,15.3,0.25,7518.831,2702527.0,3.3,400000.0,2.628552,1.6,0.02,4637.0,200000.0
3,Vietnam,2010,22.7,0.28,6324.526,87411020.0,11.7,8700000.0,2.304898,4.1,0.23,292305.0,8700000.0
4,Japan,2010,7.1,0.17,38069.957,128105400.0,3.4,3600000.0,0.319937,2.3,0.37,124400.0,5500000.0


In [11]:
#final_df.eq(0).sum()
final_df.isnull().sum()

Country                           0
Year                              0
Stunted                           0
Caloric_intake                    0
GDP_per_capita                    0
Population                        0
Underweight                       0
Undernourished_People             0
Mortality_Rate                    0
Prevalence of wasting             0
Death_Rate                        0
Number_of_wasted_children         0
Number_of_food_insecure_people    0
dtype: int64

.

In [12]:
final_df['Year'].unique()

array([2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018, 2019, 2020,
       2021, 2022], dtype=int64)

In [13]:
final_df.reset_index(inplace=True)

In [14]:
final_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 286 entries, 0 to 285
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   index                           286 non-null    int64  
 1   Country                         286 non-null    object 
 2   Year                            286 non-null    int64  
 3   Stunted                         286 non-null    float64
 4   Caloric_intake                  286 non-null    float64
 5   GDP_per_capita                  286 non-null    float64
 6   Population                      286 non-null    float64
 7   Underweight                     286 non-null    float64
 8   Undernourished_People           286 non-null    float64
 9   Mortality_Rate                  286 non-null    float64
 10  Prevalence of wasting           286 non-null    float64
 11  Death_Rate                      286 non-null    float64
 12  Number_of_wasted_children       286 

In [15]:
specified_columns = ['Undernourished_People', 'Prevalence of wasting', 'Stunted', 'Mortality_Rate']
scaler = MinMaxScaler()
final_df[specified_columns] = scaler.fit_transform(final_df[specified_columns])

In [16]:
final_df['GHI'] = (1/3 * final_df['Undernourished_People'] +
                    1/6 * final_df['Prevalence of wasting'] +
                    1/6 * final_df['Stunted'] +
                    1/3 * final_df['Mortality_Rate'])

# Scale the GHI to a range of 0 to 100
final_df['GHI'] *= 100


In [17]:
# Create a function to categorize GHI into severity levels
def categorize_ghi(GHI):
    if GHI < 10:
        return 'Low'
    elif GHI < 20:
        return 'Moderate'
    elif GHI < 35:
        return 'Serious'
    elif GHI < 50:
        return 'Alarming'
    else:
        return 'Extremely Alarming'

# Apply the function to create the GHI_Severity_lvl column
final_df['GHI_Severity_lvl'] = final_df['GHI'].apply(categorize_ghi)


In [18]:
final_df

Unnamed: 0,index,Country,Year,Stunted,Caloric_intake,GDP_per_capita,Population,Underweight,Undernourished_People,Mortality_Rate,Prevalence of wasting,Death_Rate,Number_of_wasted_children,Number_of_food_insecure_people,GHI,GHI_Severity_lvl
0,0,China,2010,0.110048,0.200000,8884.588000,1.348191e+09,3.400000,1.000000,0.160531,0.068627,1.070,1974271.0,97900000.0,41.662282,Alarming
1,1,Iran,2010,0.047847,0.270000,15099.460000,7.537386e+07,4.100000,0.021823,0.202292,0.151961,0.470,243797.0,39200000.0,10.800608,Moderate
2,2,Mongolia,2010,0.251196,0.250000,7518.831000,2.702527e+06,3.300000,0.000856,0.284220,0.034314,0.020,4637.0,200000.0,14.261021,Moderate
3,3,Vietnam,2010,0.428230,0.280000,6324.526000,8.741102e+07,11.700000,0.036371,0.246203,0.156863,0.230,292305.0,8700000.0,19.170703,Moderate
4,4,Japan,2010,0.055024,0.170000,38069.957000,1.281054e+08,3.400000,0.014549,0.013050,0.068627,0.370,124400.0,5500000.0,2.980817,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,281,Mongolia,2022,0.141148,0.245455,10739.575167,3.347782e+06,1.875000,0.000856,0.204216,0.013480,0.014,4637.0,200000.0,9.412863,Low
282,282,Pakistan,2022,0.891547,0.247273,4638.307192,2.314021e+08,28.766667,0.182285,0.858692,0.486928,4.082,4167102.0,97900000.0,57.673807,Extremely Alarming
283,283,North Korea,2022,0.447368,0.260000,18893.909880,2.597191e+07,12.250000,0.049636,0.226835,0.115196,1.983,63901.0,97900000.0,18.591772,Moderate
284,284,Nepal,2022,0.751196,0.261818,3308.548642,3.003499e+07,27.575000,0.011125,0.390936,0.496324,6.308,352224.0,11200000.0,34.194037,Serious


In [19]:
columns_to_revert = ['Undernourished_People', 'Prevalence of wasting', 'Stunted', 'Mortality_Rate']

#  revert scaled columns
final_df[columns_to_revert] = scaler.inverse_transform(final_df[columns_to_revert])


In [20]:
final_df

Unnamed: 0,index,Country,Year,Stunted,Caloric_intake,GDP_per_capita,Population,Underweight,Undernourished_People,Mortality_Rate,Prevalence of wasting,Death_Rate,Number_of_wasted_children,Number_of_food_insecure_people,GHI,GHI_Severity_lvl
0,0,China,2010,9.400000,0.200000,8884.588000,1.348191e+09,3.400000,233900000.0,1.575519,2.300000,1.070,1974271.0,97900000.0,41.662282,Alarming
1,1,Iran,2010,6.800000,0.270000,15099.460000,7.537386e+07,4.100000,5300000.0,1.931051,4.000000,0.470,243797.0,39200000.0,10.800608,Moderate
2,2,Mongolia,2010,15.300000,0.250000,7518.831000,2.702527e+06,3.300000,400000.0,2.628552,1.600000,0.020,4637.0,200000.0,14.261021,Moderate
3,3,Vietnam,2010,22.700000,0.280000,6324.526000,8.741102e+07,11.700000,8700000.0,2.304898,4.100000,0.230,292305.0,8700000.0,19.170703,Moderate
4,4,Japan,2010,7.100000,0.170000,38069.957000,1.281054e+08,3.400000,3600000.0,0.319937,2.300000,0.370,124400.0,5500000.0,2.980817,Low
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
281,281,Mongolia,2022,10.700000,0.245455,10739.575167,3.347782e+06,1.875000,400000.0,1.947433,1.175000,0.014,4637.0,200000.0,9.412863,Low
282,282,Pakistan,2022,42.066667,0.247273,4638.307192,2.314021e+08,28.766667,42800000.0,7.519345,10.833333,4.082,4167102.0,97900000.0,57.673807,Extremely Alarming
283,283,North Korea,2022,23.500000,0.260000,18893.909880,2.597191e+07,12.250000,11800000.0,2.140000,3.250000,1.983,63901.0,97900000.0,18.591772,Moderate
284,284,Nepal,2022,36.200000,0.261818,3308.548642,3.003499e+07,27.575000,2800000.0,3.537084,11.025000,6.308,352224.0,11200000.0,34.194037,Serious


In [21]:
final_df['GHI_Severity_lvl'].value_counts()

GHI_Severity_lvl
Alarming              80
Serious               66
Moderate              56
Extremely Alarming    50
Low                   34
Name: count, dtype: int64

 # Save the final table to a CSV file
final_df.to_csv(r'C:\Users\81684\Documents\AI&ML\final_table.csv', index=False)

In [22]:
data = final_df.filter(items = ['Undernourished_People', 'Prevalence of wasting', 'Stunted', 'Mortality_Rate','GHI'])

In [23]:
specified_columns = ['Prevalence of wasting', 'Stunted', 'Mortality_Rate']
scaler = MinMaxScaler()
data[specified_columns] = scaler.fit_transform(data[specified_columns])

In [24]:
data.head()

Unnamed: 0,Undernourished_People,Prevalence of wasting,Stunted,Mortality_Rate,GHI
0,233900000.0,0.068627,0.110048,0.160531,41.662282
1,5300000.0,0.151961,0.047847,0.202292,10.800608
2,400000.0,0.034314,0.251196,0.28422,14.261021
3,8700000.0,0.156863,0.42823,0.246203,19.170703
4,3600000.0,0.068627,0.055024,0.01305,2.980817


In [25]:
# Define features and target
X = final_df[['Undernourished_People', 'Prevalence of wasting', 'Stunted', 'Mortality_Rate']]
y = final_df['GHI']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Dictionary to store models
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree Regressor': DecisionTreeRegressor(random_state=42),
    'Random Forest Regressor': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting Regressor': GradientBoostingRegressor(n_estimators=100, random_state=42)
}

In [31]:
# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    
    print(f"{model_name}:")
    print(f"  Mean Squared Error (MSE): {mse:.3f}")
    print(f"  Accuracy: {r2:.3f}")
    print("\n")

Linear Regression:
  Mean Squared Error (MSE): 0.000
  Accuracy: 1.000


Decision Tree Regressor:
  Mean Squared Error (MSE): 10.675
  Accuracy: 0.969


Random Forest Regressor:
  Mean Squared Error (MSE): 4.762
  Accuracy: 0.986


Gradient Boosting Regressor:
  Mean Squared Error (MSE): 3.964
  Accuracy: 0.989


