In [12]:
import numpy as np
import pandas as pd

In [13]:
# 280 countries/territories total

In [14]:
# List of features completed
# 1. GDP
# 2. Internet Speed
# 3. Consumption of Pure Alcohol
# 4. Intentional Homicide Victims
# 5. Military Expenditures

# 6. Human Development Index
# 7. Democracy Index
# 8. Tertiary Education
# 9. Importance of Religion (in %)
# 10. % of Christians

# 11. % of Muslims
# 12. % of Buddhists
# 13. % of Jews
# 14. Under-Five Mortality
# 15. Age of Criminal Responsibility

# 16. Minimal Wage
# 17. External Debt (% of GDP)
# 18. Gini (in %)
# 19. Health Expenditure
# 20. Suicide Rate

In [107]:
# CREATE A MATRIX OF THE NAME OF COUNTRIES
def get_countries(filename):
    """Load the country-name CSV and return its contents as a 2-D array.

    Parameters
    ----------
    filename : str or path-like
        Path of the CSV file, read with ``pd.read_csv``.

    Returns
    -------
    numpy.ndarray
        The ``.values`` of the parsed DataFrame (one row per CSV data row,
        one column per CSV column).
    """
    return pd.read_csv(filename).values

In [108]:
# EXPAND THE TABLE FOR ACCUMULATING THE NEXT FEATURE
def expand_table(table):
    """Return a copy of ``table`` with one extra, all-NaN column appended.

    The new last column is the slot that the next feature will be written
    into; rows that never receive a value keep NaN.

    Parameters
    ----------
    table : numpy.ndarray
        1-D array (treated as a single column) or 2-D array with one row
        per country.

    Returns
    -------
    numpy.ndarray
        2-D array with the same rows as ``table`` plus a trailing NaN column.
    """
    # np.nan (lower case) is the canonical spelling; the np.NaN alias was
    # removed in NumPy 2.0 and raises AttributeError there.
    next_factor = np.full(table.shape[0], np.nan)

    return np.column_stack((table, next_factor))

In [109]:
# FIND THE ROW INDEX OF A COUNTRY NAME IN THE COUNTRY MATRIX
def find_index(country_matrix, name):
    """Return the row of ``country_matrix`` that contains ``name``.

    The matrix is scanned row by row, and left to right inside each row;
    the row index of the first cell equal to ``name`` is returned.

    Parameters
    ----------
    country_matrix : numpy.ndarray
        2-D array; every cell of a row is compared against ``name``.
    name : object
        Value to look for (compared with ``==``).

    Returns
    -------
    int
        Row index of the first match, or -1 when no cell matches.
    """
    # Lazy generator: evaluation stops at the first matching cell.
    matching_rows = (
        row
        for row in range(country_matrix.shape[0])
        for col in range(country_matrix.shape[1])
        if country_matrix[row][col] == name
    )
    return next(matching_rows, -1)

In [110]:
# FILL THE TABLE WITH THE SPECIFIED FEATURE
def fill_table(table, feature_table, country_matrix):
    """Write one feature into the last column of ``table``.

    For every row of ``feature_table`` the country name (column 0) is looked
    up in ``country_matrix`` with ``find_index``; when found, the feature
    value (column 1) is stored in the last column of the matching table row.
    Names that cannot be resolved are reported on stdout and skipped.

    Parameters
    ----------
    table : numpy.ndarray
        Table whose last column receives the values. Modified in place.
    feature_table : numpy.ndarray
        Rows of ``[country, value]``.
    country_matrix : numpy.ndarray
        Name matrix searched by ``find_index``.

    Returns
    -------
    numpy.ndarray
        The same ``table`` object, after filling.
    """
    # Collect matches in a plain list: appending is O(1) and keeps the row
    # indexes as ints, whereas np.hstack in a loop re-allocates on every hit.
    matched_rows = []

    for record in feature_table:
        row = find_index(country_matrix, record[0])

        if row == -1:
            print(f'COUNTRY/TERRITORY NOT FOUND: {record[0]}')
            continue

        table[row][-1] = record[1]
        matched_rows.append(row)

    # Verification: if the two counts differ, several feature rows resolved
    # to the same table row and the later value overwrote the earlier one.
    print(f'# Index found and # Unique index found: {len(matched_rows)} and {len(set(matched_rows))}')

    return table

In [111]:
# EXTRACT THE DATA WITH THE DESIRED FEATURE
# Returns a table: [Country, Values of the feature]
def extracted_data(filename, starting_row, column_country, column_feature):
    """Read a CSV file and return its country and feature columns side by side.

    Parameters
    ----------
    filename : str or path-like
        CSV file read with ``pd.read_csv``.
    starting_row : int
        Position (in the parsed DataFrame) of the first row to keep; earlier
        rows are dropped.
    column_country : int
        Position of the column holding the country names.
    column_feature : int
        Position of the column holding the feature values.

    Returns
    -------
    numpy.ndarray
        2-D array whose rows are ``[country, feature value]``.
    """
    selected = pd.read_csv(filename).iloc[starting_row:, [column_country, column_feature]]

    country_values = selected.iloc[:, 0].values
    feature_values = selected.iloc[:, 1].values

    return np.column_stack((country_values, feature_values))

In [112]:
# PRINCIPAL FUNCTION THAT ADDS A NEW FEATURE TO THE TABLE
# Expand the table, extract the data from csv file and fill the table
# array_indexes: [starting_row, column_country, column_feature] (Read description of function "extracted_data")
def update_table(filename, array_indexes, table, country_matrix):
    """Append one feature column to ``table`` from a CSV file.

    Runs the three steps in sequence: ``expand_table`` adds an empty column,
    ``extracted_data`` reads the ``[country, value]`` pairs from ``filename``
    and ``fill_table`` writes the values into the new column.

    Parameters
    ----------
    filename : str or path-like
        CSV file holding the feature.
    array_indexes : sequence of int
        ``[starting_row, column_country, column_feature]``, forwarded to
        ``extracted_data``.
    table : numpy.ndarray
        Current table; it is not modified, a widened copy is returned.
    country_matrix : numpy.ndarray
        Name matrix used to resolve country names to table rows.

    Returns
    -------
    numpy.ndarray
        The widened table with the new feature filled in.
    """
    widened = expand_table(table)

    starting_row = array_indexes[0]
    column_country = array_indexes[1]
    column_feature = array_indexes[2]
    feature_rows = extracted_data(filename, starting_row, column_country, column_feature)

    return fill_table(widened, feature_rows, country_matrix)

In [140]:
country_matrix = get_countries("countries.csv")
print(f'Number of countries/territories: {country_matrix.shape[0]}')
countries = country_matrix[:,0]
table = countries

# One entry per printed group: (header, [(csv file, [starting_row, column_country, column_feature]), ...]).
# Features are appended to the table in exactly this order.
feature_groups = [
    ('\n FEATURES 1 TO 5', [
        ("CSV Raw Data/1.csv", [1, 0, 5]),
        ("CSV Raw Data/2.csv", [0, 1, 2]),
        ("CSV Raw Data/3.csv", [0, 0, 2]),
        ("CSV Raw Data/4.csv", [1, 0, 3]),
        ("CSV Raw Data/5.csv", [1, 1, 3]),
    ]),
    ('\n FEATURES 6 TO 10', [
        ("CSV Raw Data/6.csv", [1, 2, 3]),
        ("CSV Raw Data/7.csv", [0, 2, 4]),
        ("CSV Raw Data/8.csv", [1, 0, 1]),
        ("CSV Raw Data/9.csv", [0, 1, 2]),
        ("CSV Raw Data/10.csv", [0, 0, 2]),
    ]),
    ('\n FEATURES 11 TO 15', [
        ("CSV Raw Data/11.csv", [0, 0, 3]),
        ("CSV Raw Data/12.csv", [1, 0, 2]),
        ("CSV Raw Data/13.csv", [1, 0, 2]),
        ("CSV Raw Data/14.csv", [0, 0, 1]),
        ("CSV Raw Data/15.csv", [0, 0, 1]),
    ]),
    ('\n FEATURES 16 TO 20', [
        ("CSV Raw Data/16.csv", [1, 0, 2]),
        ("CSV Raw Data/17.csv", [0, 1, 5]),
        ("CSV Raw Data/18.csv", [2, 0, 5]),
        ("CSV Raw Data/19.csv", [0, 0, 3]),
        ("CSV Raw Data/20.csv", [0, 0, 1]),
    ]),
]

for group_header, feature_sources in feature_groups:
    print(group_header)
    for csv_path, csv_indexes in feature_sources:
        table = update_table(csv_path, csv_indexes, table, country_matrix)

Number of countries/territories: 277

 FEATURES 1 TO 5
# Index found and # Unique index found: 215 and 215
# Index found and # Unique index found: 100 and 100
# Index found and # Unique index found: 191 and 191
# Index found and # Unique index found: 195 and 194
# Index found and # Unique index found: 15 and 15

 FEATURES 6 TO 10
# Index found and # Unique index found: 189 and 189
# Index found and # Unique index found: 167 and 167
# Index found and # Unique index found: 44 and 44
# Index found and # Unique index found: 145 and 145
# Index found and # Unique index found: 195 and 195

 FEATURES 11 TO 15
# Index found and # Unique index found: 232 and 232
# Index found and # Unique index found: 179 and 179
# Index found and # Unique index found: 111 and 111
# Index found and # Unique index found: 193 and 193
# Index found and # Unique index found: 141 and 141

 FEATURES 16 TO 20
# Index found and # Unique index found: 202 and 202
# Index found and # Unique index found: 210 and 210
# Inde

In [142]:
#################################################################################
# FINAL STEPS: CONVERTING TO CSV FILE

# Names of the 20 feature columns, in the order they follow COUNTRY in `table`.
feature_columns = [
    # Features 1 to 5
    'GDP', 'Internet Speed', 'Consumption of Pure Alcohol',
    'Intentional Homicide Victims', 'Military Expenditures',
    # Features 6 to 10
    'Human Development Index', 'Democracy Index', 'Tertiary Education',
    'Importance of Religion (in %)', '% of Christians',
    # Features 11 to 15
    '% of Muslims', '% of Buddhists', '% of Jews',
    'Under-Five Mortality', 'Age of Criminal Responsability',
    # Features 16 to 20
    'Minimal Wage', 'External Debt (% of GDP)', 'Gini (in %)',
    'Health Expenditure', 'Suicide Rate',
]

final_table = pd.DataFrame(table, columns=['COUNTRY'] + feature_columns)

# Convert every feature column to a numeric dtype; COUNTRY is left as is.
for feature_name in feature_columns:
    final_table[feature_name] = pd.to_numeric(final_table[feature_name])

# Count of non-missing values in the last feature column.
print(final_table.iloc[:,-1].count())
final_table.head()

184


Unnamed: 0,COUNTRY,GDP,Internet Speed,Consumption of Pure Alcohol,Intentional Homicide Victims,Military Expenditures,Human Development Index,Democracy Index,Tertiary Education,Importance of Religion (in %),...,% of Muslims,% of Buddhists,% of Jews,Under-Five Mortality,Age of Criminal Responsability,Minimal Wage,External Debt (% of GDP),Gini (in %),Health Expenditure,Suicide Rate
0,Afghanistan,470.0,,0.2,6.7,,0.511,2.85,,97.0,...,99.7,0.1,,60.3,7.0,866.0,7.0,,186.0,6.0
1,Albania,5303.0,45.25,7.5,2.3,,0.795,6.08,,50.0,...,58.8,0.1,,9.7,14.0,3277.0,66.0,33.2,697.0,3.7
2,Algeria,3976.0,,0.9,1.4,,0.748,3.77,,95.0,...,99.7,0.1,,23.3,13.0,2010.0,2.0,27.6,963.0,2.6
3,American Samoa,,,,,,,,,,...,0.1,0.3,,,,,,,,
4,Andorra,40887.0,191.23,11.3,0.0,,0.868,,,,...,2.6,0.1,,3.0,12.0,14397.0,,,3607.0,


In [143]:
# Write the assembled table to disk as UTF-8 CSV; index=False leaves the
# DataFrame's row index out of the file.
final_table.to_csv("final_table.csv", encoding='utf-8', index=False)

In [144]:
# VERIFY THE DATA TYPES (FLOAT64 FOR EVERY FEATURE)
# Reload the file that was just written, using its first column as the index.
data = pd.read_csv('final_table.csv', index_col=0)

# Make the check explicit: stop here if any feature column is not float64,
# rather than relying on a visual inspection of the dtypes listing below.
non_float_columns = data.select_dtypes(exclude='float64').columns.tolist()
assert not non_float_columns, f'Non-float64 feature columns: {non_float_columns}'

data.dtypes

GDP                               float64
Internet Speed                    float64
Consumption of Pure Alcohol       float64
Intentional Homicide Victims      float64
Military Expenditures             float64
Human Development Index           float64
Democracy Index                   float64
Tertiary Education                float64
Importance of Religion (in %)     float64
% of Christians                   float64
% of Muslims                      float64
% of Buddhists                    float64
% of Jews                         float64
Under-Five Mortality              float64
Age of Criminal Responsability    float64
Minimal Wage                      float64
External Debt (% of GDP)          float64
Gini (in %)                       float64
Health Expenditure                float64
Suicide Rate                      float64
dtype: object

In [145]:
# Preview the first rows of the table reloaded from final_table.csv.
data.head()

Unnamed: 0_level_0,GDP,Internet Speed,Consumption of Pure Alcohol,Intentional Homicide Victims,Military Expenditures,Human Development Index,Democracy Index,Tertiary Education,Importance of Religion (in %),% of Christians,% of Muslims,% of Buddhists,% of Jews,Under-Five Mortality,Age of Criminal Responsability,Minimal Wage,External Debt (% of GDP),Gini (in %),Health Expenditure,Suicide Rate
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
Afghanistan,470.0,,0.2,6.7,,0.511,2.85,,97.0,0.02,99.7,0.1,,60.3,7.0,866.0,7.0,,186.0,6.0
Albania,5303.0,45.25,7.5,2.3,,0.795,6.08,,50.0,17.0,58.8,0.1,,9.7,14.0,3277.0,66.0,33.2,697.0,3.7
Algeria,3976.0,,0.9,1.4,,0.748,3.77,,95.0,0.01,99.7,0.1,,23.3,13.0,2010.0,2.0,27.6,963.0,2.6
American Samoa,,,,,,,,,,98.3,0.1,0.3,,,,,,,,
Andorra,40887.0,191.23,11.3,0.0,,0.868,,,,89.5,2.6,0.1,,3.0,12.0,14397.0,,,3607.0,
