## DSCI 510 Final Project

## I. Extract and Preprocess Data

### GDP Data by Country 
- Source: World Bank (API)

In [21]:
# GDP Data from api --> show 20000 per page and match with country name data below --> looks good w country
import requests
import json
import pandas as pd

form = 'json'
indicator = 'NY.GDP.PCAP.CD'

url = f'http://api.worldbank.org/v2/country/all/indicator/{indicator}?format={form}&per_page=20000'#&source=2'

# Send a GET request to the API endpoint and store the response
response = requests.get(url)

data = response.json() #Status Code = 200

country_dic = {}
for index in range(len(data[1])):
    if data[1][index]['date'] == '2019':
        country_dic[data[1][index]['country']['value']] = [data[1][index]['date'], data[1][index]['value']]

In [22]:
gdp_dic = {}
for index in range(len(data[1])):
    if data[1][index]['date'] == '2019':
        gdp_dic[data[1][index]['country']['value']] = [data[1][index]['date'], data[1][index]['value']]

In [23]:
gdp_df = pd.DataFrame(gdp_dic.values(), index=gdp_dic.keys(), columns=['Year', 'GDP'])
gdp_df = gdp_df.reset_index().rename(columns={'index': 'Country'})
gdp_df.tail(3)

Unnamed: 0,Country,Year,GDP
263,"Yemen, Rep.",2019,
264,Zambia,2019,1268.120941
265,Zimbabwe,2019,1421.868596


### Vehicle Per Capita Data
- Source: Wikipedia (Webscrape)

In [24]:
from bs4 import BeautifulSoup
import requests
response = requests.get('https://en.wikipedia.org/wiki/List_of_countries_by_vehicles_per_capita')
response.status_code

200

In [25]:
#Parse Wiki Data
soup = BeautifulSoup(response.content, 'html.parser')
table = soup.find_all('table')[0]
headers = [val.text.strip() for val in table.find_all('th')] #Headers

In [26]:
table_rows = table.find_all('tbody')[0].find_all('tr')  #Aggregate Rows
row_data = []
for val in table_rows[1:]:
    for index in range(len(val.find_all('td'))):
        row_data.append(val.find_all('td')[index].text.strip())

In [27]:
#Convert to DataFrame
import pandas as pd
auto_df = pd.DataFrame()
for col_num in range(len(headers)):
    auto_df[headers[col_num]] = row_data[col_num::4]
#sorted(auto_df['Country or region'].unique())
auto_df.head(3) 

Unnamed: 0,Country or region,"Motor vehiclesper 1,000 people",Total,Year
0,Gibraltar,1444,48641,2022[1]
1,Guernsey,1365,86000,2014[2]
2,San Marino,1300,44200,2022[3]


### Emission Data by World Nation
- Source: Kaggle (CSV Download)

In [28]:
emission_df = pd.read_csv('CO2_emission.csv')
emission_df.head(2) #Relational Column: 'Country Name'

Unnamed: 0,Country Name,country_code,Region,Indicator Name,1990,1991,1992,1993,1994,1995,...,2011,2012,2013,2014,2015,2016,2017,2018,2019,2019.1
0,Aruba,ABW,Latin America & Caribbean,CO2 emissions (metric tons per capita),,,,,,,...,,,,,,,,,,
1,Afghanistan,AFG,South Asia,CO2 emissions (metric tons per capita),0.191745,0.167682,0.095958,0.084721,0.075546,0.068468,...,0.296506,0.259295,0.185624,0.146236,0.172897,0.149789,0.131695,0.163295,0.159824,0.159824


## II. Data Cleansing
- After inspecting data its observed that the gdp and co2 data are retrieved from sources that style the country name similar, the aim is to manipulate the auto data (smallest dataset) to match the maximized number of country names in co2 and gdp data (optimizing the # of values in the final merged dataframe)

In [32]:
set_gdp, set_auto, set_co2 = set(gdp_df['Country']), set(auto_df['Country or region']),\
                    set(emission_df['Country Name'])

print(sorted(set_auto.difference(set_gdp, set_co2)))#Elements in auto dataset not in others

['Bahamas', 'Brunei', 'Cape Verde', 'Federated States of Micronesia', 'Gambia', 'Guernsey', 'Hong Kong', 'Ivory Coast', 'Jersey', 'Macau', 'North Korea', 'Palestine', 'Republic of the Congo', 'Saint Kitts and Nevis', 'Saint Lucia', 'Saint Vincent and the Grenadines', 'Scotland', 'Slovakia', 'South America', 'São Tomé and Príncipe', 'Taiwan', 'Wales']


In [33]:
print((set_co2.union(set_gdp))-set_auto) #Elements in gdp+c02 dataset not in auto dataset
#GDP and C02 dataset is singnificantly larger than auto dataset
# and contain regions and terrorities 

{'New Caledonia', 'Sub-Saharan Africa', 'OECD members', 'West Bank and Gaza', 'Congo, Dem. Rep.', 'Latin America & the Caribbean (IDA & IBRD countries)', 'Equatorial Guinea', 'East Asia & Pacific (IDA & IBRD countries)', 'Sub-Saharan Africa (IDA & IBRD countries)', 'Cabo Verde', 'South Asia (IDA & IBRD)', 'Aruba', 'IDA total', 'Not classified', 'St. Vincent and the Grenadines', 'Lao PDR', 'Arab World', 'South Asia', 'Brunei Darussalam', 'IDA & IBRD total', 'Euro area', 'Pre-demographic dividend', 'High income', 'Least developed countries: UN classification', 'Libya', 'Low & middle income', 'Channel Islands', 'Faroe Islands', 'American Samoa', 'Gambia, The', 'Upper middle income', 'Heavily indebted poor countries (HIPC)', 'Africa Western and Central', 'St. Kitts and Nevis', 'Lower middle income', 'Middle income', 'Europe & Central Asia (excluding high income)', 'Virgin Islands (U.S.)', 'East Asia & Pacific', 'Middle East & North Africa (IDA & IBRD countries)', 'St. Martin (French part)'

In [31]:
#Cleansing of primary key column to ease the multiple table relational join process
auto_df['Country or region'] = auto_df['Country or region'].map(lambda col: col.replace('Iran', 
                                'Iran, Islamic Rep.').replace('Venezuela',
                                'Venezuela, RB').replace('Yemen', 'Yemen, Rep.').replace('Russia',
                                'Russian Federation').replace('Egypt',
                                'Egypt, Arab Rep.').replace('Turkey', 'Turkiye').replace('Syria',
                                'Syrian Arab Republic').replace('South Korea', 
                                'Korea, Rep.').replace('Democratic Republic of the Congo',
                                'Congo, Rep.').replace('Kyrgyzstan','Kyrgyz Republic'))

#Execute above 2 cells again to visualize commited changes done here

## III. Relational Join
- 1st Join: `gdp` and `emissions` data on primary key `Country` and `Country Name` respectfully
- 2nd Join: `Merged Dataset` above^ with `automobile` dataset: primary key: `GDP(Country) = Automobile(Country or Region)`
- Data frames developed to perform EDA and analytic tasks for for 2019 data and linear regression modeling

In [34]:
#Initial Merge
merged_df = gdp_df.merge(emission_df, how='inner', left_on='Country', right_on='Country Name')
##Second Merge - main data frame 
df = merged_df.merge(auto_df, how='inner', left_on='Country', right_on='Country or region')
df = df.rename(columns={str(x): f"{str(x)}_emissions" for x in range(1990, 2020)}) #Identify gdp columns
df.head(2)

Unnamed: 0,Country,Year_x,GDP,Country Name,country_code,Region,Indicator Name,1990_emissions,1991_emissions,1992_emissions,...,2015_emissions,2016_emissions,2017_emissions,2018_emissions,2019_emissions,2019.1,Country or region,"Motor vehiclesper 1,000 people",Total,Year_y
0,Afghanistan,2019,500.522664,Afghanistan,AFG,South Asia,CO2 emissions (metric tons per capita),0.191745,0.167682,0.095958,...,0.172897,0.149789,0.131695,0.163295,0.159824,0.159824,Afghanistan,61,2306500,2019[140][141]
1,Albania,2019,5396.215864,Albania,ALB,Europe & Central Asia,CO2 emissions (metric tons per capita),1.819542,1.24281,0.6837,...,1.603775,1.557664,1.788786,1.782739,1.692248,1.692248,Albania,238,740000,2022[67]


#### 2019 Dataframe - to perform Analytic tasks and statistical tests

In [35]:
#Filter desired columns 
df_2019 = df[['Year_x', 'Country', 'GDP', 'Region', '2019_emissions', 'Motor vehiclesper 1,000 people']]
df_2019.head(2) #A cleaner method to view and handle required data for analysis

Unnamed: 0,Year_x,Country,GDP,Region,2019_emissions,"Motor vehiclesper 1,000 people"
0,2019,Afghanistan,500.522664,South Asia,0.159824,61
1,2019,Albania,5396.215864,Europe & Central Asia,1.692248,238


#### Modeling Dataframe - To perform `Linear Regression` on world & country-by-country emissions

In [36]:
#Data transposed through nested looping to display country-country emissions data for years (1990- 2019)
df_modeling = df[[f'{str(x)}_emissions' for x in range(1990,2020)]].copy()
df_modeling['Country'] = df_2019['Country']
df_modeling #Create Dataframe to transpose data for LR Modeling

emissions_ls = []
for row_index in range(len(df_modeling)):
    for col_index in range(len(df_modeling.columns)-1):
        emissions_ls.append([df_modeling.iloc[row_index, col_index], df_modeling.columns[col_index].split('_')[0], 
               df_modeling.Country.unique()[row_index]])
        
emissions_df = pd.DataFrame(emissions_ls, columns = ['Emissions_Capita', 'Year', 'Country'])
emissions_df.head(2)

Unnamed: 0,Emissions_Capita,Year,Country
0,0.191745,1990,Afghanistan
1,0.167682,1991,Afghanistan


## IV. Descriptive Analysis
- Correlation Matrix to measure Pearson correlation of Per capita Emissions & per capita GDP + Automobiles
- View the top and bottom 'n' countries with respect to emissions per capita via class: `world_emissions` along with visualizations
- Aggregate regional statistics for capita emissions (by continent)