In [360]:
import numpy as np
import pandas as pd

In [361]:
# 280 countries/territories total

In [362]:
# List of features completed
# 1. GDP
# 2. Internet Speed
# 3. Consumption of Pure Alcohol
# 4. Intentional Homicide Victims
# 5. Military Expenditures

In [363]:
# CREATE A MATRIX OF THE NAME OF COUNTRIES
def get_countries(filename):
    df_country = pd.read_csv(filename)
    return df_country.values

In [364]:
# EXPAND THE TABLE FOR ACCUMULATING THE NEXT FEATURE
def expand_table(table):
    next_factor = np.empty(table.shape[0])
    next_factor[:] = np.NaN
    
    return np.column_stack((table, next_factor))

In [365]:
# FIND THE INDEX OF THE NAME OF THE COUNTRY AT THE TABLE
def find_index(country_matrix, name):
    for i in range(country_matrix.shape[0]):
        for j in range(country_matrix.shape[1]):
            if country_matrix[i][j] == name:
                return i
            
    return -1

In [366]:
# FILL THE TABLE WITH THE SPECIFIED FEATURE
def fill_table(table, feature_table, country_matrix):
    array_index = np.array([])
    
    for i in range(feature_table.shape[0]):
        index = find_index(country_matrix, feature_table[i][0])
        
        if index == -1:
            print(f'COUNTRY NOT FOUND: {feature_table[i][0]}')
        else:
            table[index][-1] = feature_table[i][1]
            array_index = np.hstack((array_index,index))
    
    #Verification state
    print(f'# Index found and # Unique index found: {array_index.shape[0]} and {np.unique(array_index).shape[0]}')
    #print(np.sort(array_index))
    #print(np.unique(array_index))
    
    return table

In [367]:
# EXTRACT THE DATA WITH THE DESIRED FEATURE
# Returns a table: [Country, Values of the feature]
def extracted_data(filename, starting_row, column_country, column_feature, is_comma):
    df = pd.read_csv(filename)
    df = df.iloc[starting_row:, [column_country, column_feature]]
    
    countries = df.iloc[:,0].values

    # Clean the feature data
    feature = df.iloc[:,1].values
    
    if is_comma:
        for i in range(feature.shape[0]):
            feature[i] = float(str(feature[i]).replace(',','.'))
    
    return np.column_stack((countries, feature))

In [368]:
# PRINCIPAL FUNCTION THAT ADDS A NEW FEATURE TO THE TABLE
# Expand the table, extract the data from csv file and fill the table
# array_indexes: [starting_row, column_country, column_feature] (Read description of function "extracted_data")
def update_table(filename, array_indexes, is_comma, table, country_matrix):
    table = expand_table(table)
    feature_table = extracted_data(filename, array_indexes[0], array_indexes[1], array_indexes[2], is_comma)
    table = fill_table(table, feature_table, country_matrix)
    
    return table

In [369]:
country_matrix = get_countries("countries.csv")
print(f'Number of countries/territories: {country_matrix.shape[0]}')
countries = country_matrix[:,0]
table = countries

Number of countries/territories: 280


In [370]:
# Features 1 to 5
table = update_table("CSV Raw Data/1.csv", [1, 0, 5], True, table, country_matrix)
table = update_table("CSV Raw Data/2.csv", [0, 1, 2], False, table, country_matrix)
table = update_table("CSV Raw Data/3.csv", [0, 0, 2], False, table, country_matrix)
table = update_table("CSV Raw Data/4.csv", [1, 0, 3], False, table, country_matrix)
table = update_table("CSV Raw Data/5.csv", [1, 1, 3], False, table, country_matrix)

COUNTRY NOT FOUND: World
# Index found and # Unique index found: 215 and 215
# Index found and # Unique index found: 100 and 100
# Index found and # Unique index found: 191 and 191
# Index found and # Unique index found: 195 and 194
# Index found and # Unique index found: 15 and 15


In [371]:
#################################################################################
# FINAL STEPS: CONVERTING TO CSV FILE

final_table = pd.DataFrame(table, columns=['COUNTRY', 'GDP', 'Internet Speed', 'Consumption of Pure Alcohol',
                                           'Intentional Homicide Victims', 'Military Expenditures'])
# Convert the features to numeric numbers (float64)

# Features 1 to 5
final_table["GDP"] = pd.to_numeric(final_table["GDP"])
final_table["Internet Speed"] = pd.to_numeric(final_table["Internet Speed"])
final_table["Consumption of Pure Alcohol"] = pd.to_numeric(final_table["Consumption of Pure Alcohol"])
final_table["Intentional Homicide Victims"] = pd.to_numeric(final_table["Intentional Homicide Victims"])
final_table["Military Expenditures"] = pd.to_numeric(final_table["Military Expenditures"])

print(final_table.iloc[:,-1].count())
final_table.head()

15


Unnamed: 0,COUNTRY,GDP,Internet Speed,Consumption of Pure Alcohol,Intentional Homicide Victims,Military Expenditures
0,Afghanistan,470.0,,0.2,6.7,
1,Africa,,,,,
2,Albania,5.303,45.25,7.5,2.3,
3,Algeria,3.976,,0.9,1.4,
4,American Samoa,,,,,


In [372]:
final_table.to_csv("final_table.csv", encoding='utf-8', index=False)

In [373]:
# VERIFY THE DATA TYPES (FLOAT64 FOR EVERY FEATURE)
data=pd.read_csv('final_table.csv', index_col=0)
data.dtypes

GDP                             float64
Internet Speed                  float64
Consumption of Pure Alcohol     float64
Intentional Homicide Victims    float64
Military Expenditures           float64
dtype: object

In [374]:
data.head()

Unnamed: 0_level_0,GDP,Internet Speed,Consumption of Pure Alcohol,Intentional Homicide Victims,Military Expenditures
COUNTRY,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Afghanistan,470.0,,0.2,6.7,
Africa,,,,,
Albania,5.303,45.25,7.5,2.3,
Algeria,3.976,,0.9,1.4,
American Samoa,,,,,
