In [11]:
##Linear Regression:**
#- Problem Type: Regression
#- Purpose/Benefits: Predict a continuous target variable based on the given features. In the context of your dataset, this could mean predicting a quantitative value (e.g., GDP, life expectancy) based on various input features.

## Recurrent Neural Network (RNN):
#- Problem Type: Regression or Time Series Prediction
#- Purpose/Benefits: RNNs are well-suited for sequential data, making them useful for time series prediction or sequence-to-sequence tasks. In your case, you might utilize an RNN to capture temporal dependencies in your data, predicting future values based on historical information.

In [12]:
import pandas as pd
import numpy as np
import seaborn as sn
import json
import geopandas
from sklearn.model_selection import train_test_split

In [13]:
df = pd.read_csv('./data/data_asia.csv')
print(f'Shape: {df.shape}')
df.head(5)

Shape: (1316, 26)


Unnamed: 0,Total Population,Female Population,Male Population,Birth Rate,Death Rate,Compulsory Education Dur.,Employment in Industry(%),Employment in Agriculture(%),Female Employment in Agriculture(%),Female Employment in Industry(%),...,Renewable Energy Consumption (%),Fossil Fuel Consumption (%),Male life expectancy,Female life expectancy,"School enrollment, primary","School enrollment, tertiary",Primary completion rate,Literacy rate,Year,Country
0,41128771.0,20362329.0,20766442.0,,,9.0,,,,,...,,,,,,,,,2022,Afghanistan
1,40099462.0,19844584.0,20254878.0,35.842,7.344,9.0,,,,,...,17.86,,58.915,65.279,,,,55.929401,2021,Afghanistan
2,38972230.0,19279930.0,19692301.0,36.051,7.113,9.0,18.48131,45.98341,53.91368,26.41834,...,17.58,,59.866,65.432,,10.85844,,,2020,Afghanistan
3,37769499.0,18679089.0,19090409.0,36.466,6.791,9.0,18.33941,44.51263,59.36147,24.07933,...,18.51,,60.619,66.677,110.007507,,86.175522,,2019,Afghanistan
4,36686784.0,18136922.0,18549862.0,36.927,6.981,9.0,18.12015,44.39712,63.66947,23.55313,...,17.96,,59.923,66.458,107.780533,9.96379,88.735519,,2018,Afghanistan


In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 26 columns):
 #   Column                                      Non-Null Count  Dtype  
---  ------                                      --------------  -----  
 0   Total Population                            1316 non-null   float64
 1   Female Population                           1316 non-null   float64
 2   Male Population                             1316 non-null   float64
 3   Birth Rate                                  1269 non-null   float64
 4   Death Rate                                  1269 non-null   float64
 5   Compulsory Education Dur.                   1041 non-null   float64
 6   Employment in Industry(%)                   1267 non-null   float64
 7   Employment in Agriculture(%)                1267 non-null   float64
 8   Female Employment in Agriculture(%)         1267 non-null   float64
 9   Female Employment in Industry(%)            1267 non-null   float64
 10  Unemployment

In [15]:
# Hiển thị tất cả các tên cột hiện có trong DataFrame
print(df.columns)

Index(['Total Population', 'Female Population', 'Male Population',
       'Birth Rate', 'Death Rate', 'Compulsory Education Dur.',
       'Employment in Industry(%)', 'Employment in Agriculture(%)',
       'Female Employment in Agriculture(%)',
       'Female Employment in Industry(%)', 'Unemployment(%)', 'GDP in USD',
       'National Income per Capita', 'Net income from Abroad',
       'Agriculture value added(in USD)',
       'Electric Power Consumption(kWH per capita)',
       'Renewable Energy Consumption (%)', 'Fossil Fuel Consumption (%)',
       'Male life expectancy', 'Female life expectancy ',
       'School enrollment, primary', 'School enrollment, tertiary',
       'Primary completion rate', 'Literacy rate', 'Year', 'Country'],
      dtype='object')


In [16]:
df.drop(['Electric Power Consumption(kWH per capita)','Fossil Fuel Consumption (%)','National Income per Capita','Net income from Abroad'], axis=1, inplace=True)

In [17]:
# check the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Total Population                     1316 non-null   float64
 1   Female Population                    1316 non-null   float64
 2   Male Population                      1316 non-null   float64
 3   Birth Rate                           1269 non-null   float64
 4   Death Rate                           1269 non-null   float64
 5   Compulsory Education Dur.            1041 non-null   float64
 6   Employment in Industry(%)            1267 non-null   float64
 7   Employment in Agriculture(%)         1267 non-null   float64
 8   Female Employment in Agriculture(%)  1267 non-null   float64
 9   Female Employment in Industry(%)     1267 non-null   float64
 10  Unemployment(%)                      1311 non-null   float64
 11  GDP in USD                    

In [18]:
print(df.columns)

Index(['Total Population', 'Female Population', 'Male Population',
       'Birth Rate', 'Death Rate', 'Compulsory Education Dur.',
       'Employment in Industry(%)', 'Employment in Agriculture(%)',
       'Female Employment in Agriculture(%)',
       'Female Employment in Industry(%)', 'Unemployment(%)', 'GDP in USD',
       'Agriculture value added(in USD)', 'Renewable Energy Consumption (%)',
       'Male life expectancy', 'Female life expectancy ',
       'School enrollment, primary', 'School enrollment, tertiary',
       'Primary completion rate', 'Literacy rate', 'Year', 'Country'],
      dtype='object')


In [19]:
columns_to_fill = ['Birth Rate', 'Compulsory Education Dur.', 'GDP in USD', 'Death Rate','Employment in Industry(%)','Employment in Agriculture(%)','Female Employment in Agriculture(%)','Female Employment in Industry(%)','Unemployment(%)','Agriculture value added(in USD)','Renewable Energy Consumption (%)','Male life expectancy','Female life expectancy ','School enrollment, primary','School enrollment, tertiary','Primary completion rate']
for column in columns_to_fill:
    median_value = df[column].median()
    df[column].fillna(median_value, inplace=True)
# check the result
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1316 entries, 0 to 1315
Data columns (total 22 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   Total Population                     1316 non-null   float64
 1   Female Population                    1316 non-null   float64
 2   Male Population                      1316 non-null   float64
 3   Birth Rate                           1316 non-null   float64
 4   Death Rate                           1316 non-null   float64
 5   Compulsory Education Dur.            1316 non-null   float64
 6   Employment in Industry(%)            1316 non-null   float64
 7   Employment in Agriculture(%)         1316 non-null   float64
 8   Female Employment in Agriculture(%)  1316 non-null   float64
 9   Female Employment in Industry(%)     1316 non-null   float64
 10  Unemployment(%)                      1316 non-null   float64
 11  GDP in USD                    

In [20]:
df.to_csv('./data/new_data_asia.csv', index=False)

In [21]:
# Separate features (X) and target variable (y)
X = df[['School enrollment, primary', 'School enrollment, tertiary','Primary completion rate', 'Year', 'Country']]
y = df['Primary completion rate']
# X = df.drop(['Country'], axis=1)  # Assuming 'Country' is the target variable
# y = df['Country']

# Split the data into training and temporary sets (80% training, 20% temporary)
X_train_temp, X_temp, y_train_temp, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Split the temporary set into validation and testing sets (50% each)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# Display the shapes of the resulting sets
print(f'Training set: {X_train_temp.shape}')
print(f'Validation set: {X_val.shape}')
print(f'Testing set: {X_test.shape}')

Training set: (1052, 5)
Validation set: (132, 5)
Testing set: (132, 5)
