In [232]:
import pandas as pd
import numpy as np

*Time variant indicators are downloaded from WDI for each period. I extracted the zip files with raw data in the 'raw' subfolder of the 'data' folder. Without changing the names of the raw data file, I just added the `year name` at the tail of the original name extracted from WDI, so that I can keep track. Example: `dc261827-d380-414a-b6ff-55a77579a878_Data.csv` was the name for the data for 2000. I changed the name like `dc261827-d380-414a-b6ff-55a77579a878_Data2000.csv`.*

**Preparing the time variant data takes the following steps:**
>1. Reading the raw data file for a single year.
>2. Pruning the unnecessary rows with formulas from the tail of the data
>3. Renaming the columns
>4. Replacing the '..' values with NaN
>5. Converting all columns to numeric from object type except the 'iso_o' and 'country'
>6. Finding the mean GDP of that year and imputing that to missing rows
>7. Creating a column with log transformed GDP value of that year
>8. Finding the common countries those are present in the trade data and also in this time variant data
>9. Only keeping the common countries for each year.
>10. Saving the cleaned data as a csv file named like `attributeYEAR.csv`
>11. Doing the above steps for every year from 2000 to 2023, having saved 24 attribute files in the cleaned folder
>12. Finally concatenating all the dataframes on columns and sorting those with country and year for gravity analysis
>13. Saving the concataned data as `attribute2000_2023.csv` in the cleaned folder

In [233]:
at1999 = pd.read_csv('../data/raw/416d66fa-850c-4c72-a6d8-52d9fecefe55_Data.csv')

In [234]:
at1999_pruned = at1999.iloc[:265, :]

In [235]:
#at1999.tail(8)

In [236]:
at1999_pruned = at1999_pruned.rename(columns = {'Country Code': 'iso_o'})
at1999_pruned = at1999_pruned.rename(columns = {'Country Name': 'Country'})
at1999_pruned = at1999_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at1999_pruned = at1999_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at1999_pruned = at1999_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at1999_pruned = at1999_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [237]:
at1999_ordered = at1999_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [238]:
column = ['GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']
at1999_ordered[column] = at1999_ordered[column].replace('..', np.nan)
at1999_ordered[column] = at1999_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at1999_ordered[column] = at1999_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at1999_ordered[column] = at1999_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)


In [239]:
#GDP
mean_gdp_1999 = at1999_ordered['GDP'].mean()
at1999_ordered['GDP'] = at1999_ordered['GDP'].fillna(mean_gdp_1999)
at1999_ordered['log_GDP'] = np.log(at1999_ordered['GDP'])
at1999_ordered['log_trade_gdp_ratio'] = np.log(at1999_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at1999_ordered['GDP'] = at1999_ordered['GDP'].fillna(mean_gdp_1999)


In [240]:
total_trade= pd.read_csv("../data/cleaned/total_edgelist.csv")

In [241]:
m = set(at1999_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(m))
print(set(total_trade['iso_o'].unique()) - set(at1999_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [242]:
at1999_ordered = at1999_ordered[at1999_ordered['iso_o'].isin(m)]
at1999_ordered = at1999_ordered.reset_index(drop = True)

In [243]:
at1999_ordered.shape

(190, 9)

### 2000

In [244]:
at2000 = pd.read_csv('../data/raw/b8d427fb-dca4-47c0-b026-c7016418b6ed_Data.csv')

In [245]:
#at2000.tail()

In [246]:
at2000.shape

(271, 8)

In [247]:
at2000_pruned = at2000.iloc[:265, :]

In [248]:
at2000_pruned.tail()

Unnamed: 0,Time,Time Code,Country Name,Country Code,GDP (current US$) [NY.GDP.MKTP.CD],GDP per capita (current US$) [NY.GDP.PCAP.CD],"Population, total [SP.POP.TOTL]",Trade (% of GDP) [NE.TRD.GNFS.ZS]
260,2000,YR2000,South Asia (IDA & IBRD),TSA,544675445924.648,440.253667577913,1237185482,29.4521345638678
261,2000,YR2000,Sub-Saharan Africa,SSF,428318893884.446,628.840266615581,681125107,..
262,2000,YR2000,Sub-Saharan Africa (excluding high income),SSA,419427320519.798,683.515512661206,613632482,..
263,2000,YR2000,Sub-Saharan Africa (IDA & IBRD countries),TSS,428318893884.446,628.840266615582,681125107,..
264,2000,YR2000,Upper middle income,UMC,4367159322540.17,1846.56159645163,2365022283,47.2296254050292


In [249]:
at2000_pruned = at2000_pruned.rename(columns = {'Country Code': 'iso_o'})
at2000_pruned = at2000_pruned.rename(columns = {'Country Name': 'Country'})
at2000_pruned = at2000_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2000_pruned = at2000_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2000_pruned = at2000_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2000_pruned = at2000_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [250]:
#at2000_pruned.rename(columns={"Educational attainment, at least Bachelor's or equivalent, population 25+, total (%) (cumulative) [SE.TER.CUAT.BA.ZS]": "tertiary_edu"
#}, inplace=True)

In [251]:
at2000_ordered = at2000_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [252]:
column = ['GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']
at2000_ordered[column] = at2000_ordered[column].replace('..', np.nan)
at2000_ordered[column] = at2000_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2000_ordered[column] = at2000_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2000_ordered[column] = at2000_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)


In [253]:
missing = at2000_ordered.isna().sum() # no missing in population 

In [254]:
missing

iso_o               0
Country             0
Time                0
GDP                14
GDP_pct            14
population          1
trade_gdp_ratio    60
dtype: int64

In [255]:
#GDP
mean_gdp_2000 = at2000_ordered['GDP'].mean()
at2000_ordered['GDP'] = at2000_ordered['GDP'].fillna(mean_gdp_2000)
at2000_ordered['log_GDP'] = np.log(at2000_ordered['GDP'])
at2000_ordered['log_trade_gdp_ratio'] = np.log(at2000_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2000_ordered['GDP'] = at2000_ordered['GDP'].fillna(mean_gdp_2000)


In [256]:
total_trade= pd.read_csv("../data/cleaned/total_edgelist.csv")

In [257]:
m = set(at2000_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(m))
print(set(total_trade['iso_o'].unique()) - set(at2000_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [258]:
at2000_ordered = at2000_ordered[at2000_ordered['iso_o'].isin(m)]
at2000_ordered = at2000_ordered.reset_index(drop = True)

In [259]:
at2000_ordered.shape

(190, 9)

### 2001

In [260]:
at2001 = pd.read_csv('../data/raw/04b35699-249a-4d66-97a8-bab79c305702_Data.csv')

In [261]:
at2001_pruned = at2001.iloc[:265, :]

In [262]:
at2001_pruned.tail()

Unnamed: 0,Time,Time Code,Country Name,Country Code,GDP (current US$) [NY.GDP.MKTP.CD],GDP per capita (current US$) [NY.GDP.PCAP.CD],Trade (% of GDP) [NE.TRD.GNFS.ZS],"Population, total [SP.POP.TOTL]"
260,2001,YR2001,South Asia (IDA & IBRD),TSA,562554379798.02,446.418919348871,28.6226894432911,1260149056
261,2001,YR2001,Sub-Saharan Africa,SSF,410360395732.846,586.573887183773,..,699588585
262,2001,YR2001,Sub-Saharan Africa (excluding high income),SSA,401467455254.855,637.230082799048,..,630019621
263,2001,YR2001,Sub-Saharan Africa (IDA & IBRD countries),TSS,410360395732.846,586.573887183773,..,699588585
264,2001,YR2001,Upper middle income,UMC,4372018919642.63,1830.976201913,46.5419864976265,2387807616


In [263]:
at2001_pruned = at2001_pruned.rename(columns = {'Country Code': 'iso_o'})
at2001_pruned = at2001_pruned.rename(columns = {'Country Name': 'Country'})
at2001_pruned = at2001_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2001_pruned = at2001_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2001_pruned = at2001_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2001_pruned = at2001_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [264]:
at2001_ordered = at2001_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [265]:
column = ['GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']
at2001_ordered[column] = at2001_ordered[column].replace('..', np.nan)
at2001_ordered[column] = at2001_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2001 = at2001_ordered['GDP'].mean()
at2001_ordered['GDP'] = at2001_ordered['GDP'].fillna(mean_gdp_2001)
at2001_ordered['log_GDP'] = np.log(at2001_ordered['GDP'])
at2001_ordered['log_trade_gdp_ratio'] = np.log(at2001_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2001_ordered[column] = at2001_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2001_ordered[column] = at2001_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2001_ordered['GDP'] = at2

In [266]:
missing1 = at2001_ordered.isna().sum()
missing1

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                13
population              1
trade_gdp_ratio        60
log_GDP                 0
log_trade_gdp_ratio    60
dtype: int64

In [267]:
n = set(at2001_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(n))
print(set(total_trade['iso_o'].unique()) - set(at2001_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [268]:
at2001_ordered = at2001_ordered[at2001_ordered['iso_o'].isin(n)]
at2001_ordered = at2001_ordered.reset_index(drop = True)

In [269]:
at2001_ordered.shape

(190, 9)

### 2002

In [270]:
at2002 = pd.read_csv('../data/raw/12df4673-6147-46d1-99f3-e419b3f30633_Data.csv')

In [271]:
at2002_pruned = at2002.iloc[:265, :]

In [272]:
#at2002_pruned.tail()

In [273]:
at2002_pruned = at2002_pruned.rename(columns = {'Country Code': 'iso_o'})
at2002_pruned = at2002_pruned.rename(columns = {'Country Name': 'Country'})
at2002_pruned = at2002_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2002_pruned = at2002_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2002_pruned = at2002_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2002_pruned = at2002_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [274]:
at2002_ordered = at2002_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [275]:
at2002_ordered[column] = at2002_ordered[column].replace('..', np.nan)
at2002_ordered[column] = at2002_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2002 = at2002_ordered['GDP'].mean()
at2002_ordered['GDP'] = at2002_ordered['GDP'].fillna(mean_gdp_2002)
at2002_ordered['log_GDP'] = np.log(at2002_ordered['GDP'])
at2002_ordered['log_trade_gdp_ratio'] = np.log(at2002_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2002_ordered[column] = at2002_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2002_ordered[column] = at2002_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2002_ordered['GDP'] = at2

In [276]:
missing2 = at2002_ordered.isna().sum()
missing2

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 9
population              1
trade_gdp_ratio        54
log_GDP                 0
log_trade_gdp_ratio    54
dtype: int64

In [277]:
o = set(at2002_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(o))
print(set(total_trade['iso_o'].unique()) - set(at2002_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [278]:
at2002_ordered = at2002_ordered[at2002_ordered['iso_o'].isin(o)]
at2002_ordered = at2002_ordered.reset_index(drop = True)

In [279]:
at2002_ordered.shape

(190, 9)

In [280]:
at2002_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2002,3825701000.0,178.95,21378117.0,,22.065008,
1,ALB,Albania,2002,4515003000.0,1479.84,3051010.0,65.99,22.230672,4.189503
2,DZA,Algeria,2002,61516100000.0,1937.46,31750835.0,56.59,24.842565,4.035832
3,AND,Andorra,2002,1764280000.0,26528.13,66506.0,,21.291008,
4,AGO,Angola,2002,15285590000.0,882.15,17327699.0,105.3,23.450177,4.656813


### 2003

In [281]:
at2003 = pd.read_csv('../data/raw/e9bcb05a-7626-40fb-ba91-0679770c2561_Data.csv')
at2003_pruned = at2003.iloc[:265, :]

In [282]:
at2003_pruned = at2003_pruned.rename(columns = {'Country Code': 'iso_o'})
at2003_pruned = at2003_pruned.rename(columns = {'Country Name': 'Country'})
at2003_pruned = at2003_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2003_pruned = at2003_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2003_pruned = at2003_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2003_pruned = at2003_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [283]:
#at2003_pruned.tail()

In [284]:
at2003_ordered = at2003_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [285]:
at2003_ordered[column] = at2003_ordered[column].replace('..', np.nan)
at2003_ordered[column] = at2003_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2003 = at2003_ordered['GDP'].mean()
at2003_ordered['GDP'] = at2003_ordered['GDP'].fillna(mean_gdp_2003)
at2003_ordered['log_GDP'] = np.log(at2003_ordered['GDP'])
at2003_ordered['log_trade_gdp_ratio'] = np.log(at2003_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2003_ordered[column] = at2003_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2003_ordered[column] = at2003_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2003_ordered['GDP'] = at2

In [286]:
missing3 = at2003_ordered.isna().sum()
missing3

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 9
population              1
trade_gdp_ratio        53
log_GDP                 0
log_trade_gdp_ratio    53
dtype: int64

In [287]:
p = set(at2003_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(p))
print(set(total_trade['iso_o'].unique()) - set(at2003_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [288]:
at2003_ordered = at2003_ordered[at2003_ordered['iso_o'].isin(p)]
at2003_ordered = at2003_ordered.reset_index(drop = True)

In [289]:
at2003_ordered.shape

(190, 9)

In [290]:
at2003_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2003,4520947000.0,198.87,22733049.0,,22.231987,
1,ALB,Albania,2003,5801712000.0,1908.7,3039616.0,64.82,22.481419,4.171614
2,DZA,Algeria,2003,73482260000.0,2283.77,32175818.0,57.77,25.02031,4.05647
3,AND,Andorra,2003,2366942000.0,34063.58,69486.0,,21.584865,
4,AGO,Angola,2003,17812700000.0,992.7,17943712.0,103.9,23.603178,4.643429


### 2004

In [291]:
at2004 = pd.read_csv('../data/raw/4ba7dd03-beef-4117-bcf5-2741582ae3f0_Data.csv')
at2004_pruned = at2004.iloc[:265, :]

In [292]:
at2004_pruned = at2004_pruned.rename(columns = {'Country Code': 'iso_o'})
at2004_pruned = at2004_pruned.rename(columns = {'Country Name': 'Country'})
at2004_pruned = at2004_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2004_pruned = at2004_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2004_pruned = at2004_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2004_pruned = at2004_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [293]:
at2004_ordered = at2004_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [294]:
at2004_ordered[column] = at2004_ordered[column].replace('..', np.nan)
at2004_ordered[column] = at2004_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2004 = at2004_ordered['GDP'].mean()
at2004_ordered['GDP'] = at2004_ordered['GDP'].fillna(mean_gdp_2004)
at2004_ordered['log_GDP'] = np.log(at2004_ordered['GDP'])
at2004_ordered['log_trade_gdp_ratio'] = np.log(at2004_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2004_ordered[column] = at2004_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2004_ordered[column] = at2004_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2004_ordered['GDP'] = at2

In [295]:
missing4 = at2004_ordered.isna().sum()
missing4

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 9
population              1
trade_gdp_ratio        53
log_GDP                 0
log_trade_gdp_ratio    53
dtype: int64

In [296]:
q = set(at2004_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(q))
print(set(total_trade['iso_o'].unique()) - set(at2004_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [297]:
at2004_ordered = at2004_ordered[at2004_ordered['iso_o'].isin(q)]
at2004_ordered = at2004_ordered.reset_index(drop = True)

In [298]:
at2004_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2004,5224897000.0,221.76,23560654.0,,22.376701,
1,ALB,Albania,2004,7406646000.0,2446.91,3026939.0,65.04,22.725644,4.175002
2,DZA,Algeria,2004,91913680000.0,2816.99,32628286.0,61.36,25.244116,4.116758
3,AND,Andorra,2004,2900245000.0,39021.13,74325.0,,21.788061,
4,AGO,Angola,2004,23552060000.0,1266.21,18600423.0,103.58,23.882479,4.640344


### 2005

In [299]:
at2005 = pd.read_csv('../data/raw/8c0c707a-94a0-4070-83d9-a9890aa84763_Data.csv')
at2005_pruned = at2005.iloc[:265, :]

In [300]:
at2005_pruned = at2005_pruned.rename(columns = {'Country Code': 'iso_o'})
at2005_pruned = at2005_pruned.rename(columns = {'Country Name': 'Country'})
at2005_pruned = at2005_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2005_pruned = at2005_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2005_pruned = at2005_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2005_pruned = at2005_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [301]:
at2005_ordered = at2005_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [302]:
at2005_ordered[column] = at2005_ordered[column].replace('..', np.nan)
at2005_ordered[column] = at2005_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2005 = at2005_ordered['GDP'].mean()
at2005_ordered['GDP'] = at2005_ordered['GDP'].fillna(mean_gdp_2005)
at2005_ordered['log_GDP'] = np.log(at2005_ordered['GDP'])
at2005_ordered['log_trade_gdp_ratio'] = np.log(at2005_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2005_ordered[column] = at2005_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2005_ordered[column] = at2005_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2005_ordered['GDP'] = at2

In [303]:
missing5 = at2005_ordered.isna().sum()
missing5

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 9
population              1
trade_gdp_ratio        49
log_GDP                 0
log_trade_gdp_ratio    49
dtype: int64

In [304]:
r = set(at2005_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(r))
print(set(total_trade['iso_o'].unique()) - set(at2005_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [305]:
at2005_ordered = at2005_ordered[at2005_ordered['iso_o'].isin(r)]
at2005_ordered = at2005_ordered.reset_index(drop = True)

In [306]:
at2005_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2005,6203257000.0,254.18,24404567.0,,22.54834,
1,ALB,Albania,2005,8256658000.0,2741.72,3011487.0,69.12,22.834286,4.235844
2,DZA,Algeria,2005,107046600000.0,3233.13,33109249.0,66.84,25.39653,4.202302


### 2006

In [307]:
at2006 = pd.read_csv('../data/raw/56f4554f-bddc-45c4-9bd7-da4b172d4d3c_Data.csv')
at2006_pruned = at2006.iloc[:265, :]

In [308]:
at2006_pruned = at2006_pruned.rename(columns = {'Country Code': 'iso_o'})
at2006_pruned = at2006_pruned.rename(columns = {'Country Name': 'Country'})
at2006_pruned = at2006_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2006_pruned = at2006_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2006_pruned = at2006_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2006_pruned = at2006_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [309]:
at2006_ordered = at2006_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [310]:
at2006_ordered[column] = at2006_ordered[column].replace('..', np.nan)
at2006_ordered[column] = at2006_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2006 = at2006_ordered['GDP'].mean()
at2006_ordered['GDP'] = at2006_ordered['GDP'].fillna(mean_gdp_2006)
at2006_ordered['log_GDP'] = np.log(at2006_ordered['GDP'])
at2006_ordered['log_trade_gdp_ratio'] = np.log(at2006_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2006_ordered[column] = at2006_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2006_ordered[column] = at2006_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2006_ordered['GDP'] = at2

In [311]:
missing6 = at2006_ordered.isna().sum()
missing6

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 8
population              1
trade_gdp_ratio        49
log_GDP                 0
log_trade_gdp_ratio    49
dtype: int64

In [312]:
s = set(at2006_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(s))
print(set(total_trade['iso_o'].unique()) - set(at2006_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [313]:
at2006_ordered = at2006_ordered[at2006_ordered['iso_o'].isin(s)]
at2006_ordered = at2006_ordered.reset_index(drop = True)

In [314]:
at2006_ordered.head(2)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2006,6971758000.0,274.22,25424094.0,,22.665133,
1,ALB,Albania,2006,9150528000.0,3057.77,2992547.0,72.2,22.937077,4.27944


### 2007

In [315]:
at2007 = pd.read_csv('../data/raw/616da076-f862-4f86-a7b3-7373f5ae84c1_Data.csv')
at2007_pruned = at2007.iloc[:265, :]

In [316]:
at2007_pruned = at2007_pruned.rename(columns = {'Country Code': 'iso_o'})
at2007_pruned = at2007_pruned.rename(columns = {'Country Name': 'Country'})
at2007_pruned = at2007_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2007_pruned = at2007_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2007_pruned = at2007_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2007_pruned = at2007_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [317]:
at2007_ordered = at2007_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [318]:
at2007_ordered[column] = at2007_ordered[column].replace('..', np.nan)
at2007_ordered[column] = at2007_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2007 = at2007_ordered['GDP'].mean()
at2007_ordered['GDP'] = at2007_ordered['GDP'].fillna(mean_gdp_2007)
at2007_ordered['log_GDP'] = np.log(at2007_ordered['GDP'])
at2007_ordered['log_trade_gdp_ratio'] = np.log(at2007_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2007_ordered[column] = at2007_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2007_ordered[column] = at2007_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2007_ordered['GDP'] = at2

In [319]:
missing7 = at2007_ordered.isna().sum()
missing7

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 8
population              1
trade_gdp_ratio        48
log_GDP                 0
log_trade_gdp_ratio    48
dtype: int64

In [320]:
t = set(at2007_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(t))
print(set(total_trade['iso_o'].unique()) - set(at2007_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [321]:
at2007_ordered = at2007_ordered[at2007_ordered['iso_o'].isin(s)]
at2007_ordered = at2007_ordered.reset_index(drop = True)

In [322]:
at2007_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2007,9747886000.0,376.22,25909852.0,,23.000316,
1,ALB,Albania,2007,11116940000.0,3743.06,2970017.0,79.91,23.131736,4.380901
2,DZA,Algeria,2007,142482700000.0,4167.45,34189416.0,67.55,25.682487,4.212868


### 2008

In [323]:
at2008 = pd.read_csv('../data/raw/2fc7e1df-39de-4796-9eed-6b22fc006594_Data.csv')
at2008_pruned = at2008.iloc[:265, :]

In [324]:
at2008_pruned = at2008_pruned.rename(columns = {'Country Code': 'iso_o'})
at2008_pruned = at2008_pruned.rename(columns = {'Country Name': 'Country'})
at2008_pruned = at2008_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2008_pruned = at2008_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2008_pruned = at2008_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2008_pruned = at2008_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [325]:
at2008_ordered = at2008_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [326]:
at2008_ordered[column] = at2008_ordered[column].replace('..', np.nan)
at2008_ordered[column] = at2008_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2008 = at2008_ordered['GDP'].mean()
at2008_ordered['GDP'] = at2008_ordered['GDP'].fillna(mean_gdp_2008)
at2008_ordered['log_GDP'] = np.log(at2008_ordered['GDP'])
at2008_ordered['log_trade_gdp_ratio'] = np.log(at2008_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2008_ordered[column] = at2008_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2008_ordered[column] = at2008_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2008_ordered['GDP'] = at2

In [327]:
missing8 = at2008_ordered.isna().sum()
missing8

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 7
population              1
trade_gdp_ratio        45
log_GDP                 0
log_trade_gdp_ratio    45
dtype: int64

In [328]:
u = set(at2008_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(u))
print(set(total_trade['iso_o'].unique()) - set(at2008_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [329]:
at2008_ordered = at2008_ordered[at2008_ordered['iso_o'].isin(u)]
at2008_ordered = at2008_ordered.reset_index(drop = True)

In [330]:
at2008_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2008,10109300000.0,381.73,26482622.0,,23.036721,
1,ALB,Albania,2008,13258510000.0,4498.5,2947314.0,75.25,23.307905,4.320816
2,DZA,Algeria,2008,180383800000.0,5180.92,34816961.0,71.03,25.918353,4.263102
3,AND,Andorra,2008,4102319000.0,49132.52,83495.0,,22.134818,
4,AGO,Angola,2008,88538670000.0,4103.07,21578655.0,121.36,25.206705,4.798761


### 2009

In [331]:
at2009 = pd.read_csv('../data/raw/67e77636-af13-402a-9705-edf48957b7e7_Data.csv')
at2009_pruned = at2009.iloc[:265, :]

In [332]:
at2009_pruned = at2009_pruned.rename(columns = {'Country Code': 'iso_o'})
at2009_pruned = at2009_pruned.rename(columns = {'Country Name': 'Country'})
at2009_pruned = at2009_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2009_pruned = at2009_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2009_pruned = at2009_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2009_pruned = at2009_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [333]:
at2009_ordered = at2009_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [334]:
at2009_ordered[column] = at2009_ordered[column].replace('..', np.nan)
at2009_ordered[column] = at2009_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2009 = at2009_ordered['GDP'].mean()
at2009_ordered['GDP'] = at2009_ordered['GDP'].fillna(mean_gdp_2009)
at2009_ordered['log_GDP'] = np.log(at2009_ordered['GDP'])
at2009_ordered['log_trade_gdp_ratio'] = np.log(at2009_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2009_ordered[column] = at2009_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2009_ordered[column] = at2009_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2009_ordered['GDP'] = at2

In [335]:
missing9 = at2009_ordered.isna().sum()
missing9

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 5
population              1
trade_gdp_ratio        45
log_GDP                 0
log_trade_gdp_ratio    45
dtype: int64

In [336]:
v = set(at2009_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(v))
print(set(total_trade['iso_o'].unique()) - set(at2009_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [337]:
at2009_ordered = at2009_ordered[at2009_ordered['iso_o'].isin(v)]
at2009_ordered = at2009_ordered.reset_index(drop = True)

In [338]:
at2009_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2009,12416150000.0,452.05,27466101.0,,23.242264,
1,ALB,Albania,2009,12335540000.0,4213.65,2927519.0,73.32,23.23575,4.294833
2,DZA,Algeria,2009,150317300000.0,4235.43,35490445.0,64.28,25.736014,4.163249
3,AND,Andorra,2009,3688976000.0,43975.02,83888.0,,22.028615,
4,AGO,Angola,2009,70307200000.0,3136.65,22414773.0,122.45,24.97614,4.807703


### 2010

In [339]:
at2010 = pd.read_csv('../data/raw/21dfe8ea-d81a-491b-b5af-5cc76407ebce_Data.csv')
at2010_pruned = at2010.iloc[:265, :]

In [340]:
at2010_pruned = at2010_pruned.rename(columns = {'Country Code': 'iso_o'})
at2010_pruned = at2010_pruned.rename(columns = {'Country Name': 'Country'})
at2010_pruned = at2010_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2010_pruned = at2010_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2010_pruned = at2010_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2010_pruned = at2010_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [341]:
at2010_ordered = at2010_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [342]:
at2010_ordered[column] = at2010_ordered[column].replace('..', np.nan)
at2010_ordered[column] = at2010_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2010 = at2010_ordered['GDP'].mean()
at2010_ordered['GDP'] = at2010_ordered['GDP'].fillna(mean_gdp_2010)
at2010_ordered['log_GDP'] = np.log(at2010_ordered['GDP'])
at2010_ordered['log_trade_gdp_ratio'] = np.log(at2010_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2010_ordered[column] = at2010_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2010_ordered[column] = at2010_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2010_ordered['GDP'] = at2

In [343]:
missing10 = at2010_ordered.isna().sum()
missing10

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 5
population              1
trade_gdp_ratio        45
log_GDP                 0
log_trade_gdp_ratio    45
dtype: int64

In [344]:
w = set(at2010_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(w))
print(set(total_trade['iso_o'].unique()) - set(at2010_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [345]:
at2010_ordered = at2010_ordered[at2010_ordered['iso_o'].isin(v)]
at2010_ordered = at2010_ordered.reset_index(drop = True)

In [346]:
at2010_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2010,15856670000.0,560.62,28284089.0,,23.486856,
1,ALB,Albania,2010,12086550000.0,4149.14,2913021.0,75.53,23.215359,4.32453
2,DZA,Algeria,2010,177785100000.0,4912.79,36188236.0,63.49,25.903841,4.150882
3,AND,Andorra,2010,3449926000.0,42746.83,80706.0,,21.961619,
4,AGO,Angola,2010,83799470000.0,3597.34,23294825.0,104.12,25.151693,4.645544


### 2011

In [347]:
at2011 = pd.read_csv('../data/raw/afedc3eb-c374-45cd-989c-6c3559a87669_Data.csv')
at2011_pruned = at2011.iloc[:265, :]

In [348]:
at2011_pruned = at2011_pruned.rename(columns = {'Country Code': 'iso_o'})
at2011_pruned = at2011_pruned.rename(columns = {'Country Name': 'Country'})
at2011_pruned = at2011_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2011_pruned = at2011_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2011_pruned = at2011_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2011_pruned = at2011_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [349]:
at2011_ordered = at2011_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [350]:
at2011_ordered[column] = at2011_ordered[column].replace('..', np.nan)
at2011_ordered[column] = at2011_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2011 = at2011_ordered['GDP'].mean()
at2011_ordered['GDP'] = at2011_ordered['GDP'].fillna(mean_gdp_2011)
at2011_ordered['log_GDP'] = np.log(at2011_ordered['GDP'])
at2011_ordered['log_trade_gdp_ratio'] = np.log(at2011_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2011_ordered[column] = at2011_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2011_ordered[column] = at2011_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2011_ordered['GDP'] = at2

In [351]:
x = set(at2011_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(x))
print(set(total_trade['iso_o'].unique()) - set(at2011_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [352]:
at2011_ordered = at2011_ordered[at2011_ordered['iso_o'].isin(x)]
at2011_ordered = at2011_ordered.reset_index(drop = True)

In [353]:
at2011_ordered.head()

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2011,17805100000.0,606.69,29347708.0,,23.602751,
1,ALB,Albania,2011,12973760000.0,4465.71,2905195.0,80.7,23.286194,4.390739
2,DZA,Algeria,2011,218331900000.0,5916.31,36903376.0,62.22,26.109282,4.130676
3,AND,Andorra,2011,3629134000.0,46657.16,77783.0,,22.01226,
4,AGO,Angola,2011,111789700000.0,4615.91,24218352.0,99.98,25.439886,4.60497


### 2012

In [354]:
at2012 = pd.read_csv('../data/raw/211c33af-d726-4d73-8184-b4c3a1eb98a8_Data.csv')
at2012_pruned = at2012.iloc[:265, :]

In [355]:
at2012_pruned = at2012_pruned.rename(columns = {'Country Code': 'iso_o'})
at2012_pruned = at2012_pruned.rename(columns = {'Country Name': 'Country'})
at2012_pruned = at2012_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2012_pruned = at2012_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2012_pruned = at2012_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2012_pruned = at2012_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [356]:
at2012_ordered = at2012_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [357]:
at2012_ordered[column] = at2012_ordered[column].replace('..', np.nan)
at2012_ordered[column] = at2012_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2012 = at2012_ordered['GDP'].mean()
at2012_ordered['GDP'] = at2012_ordered['GDP'].fillna(mean_gdp_2012)
at2012_ordered['log_GDP'] = np.log(at2012_ordered['GDP'])
at2012_ordered['log_trade_gdp_ratio'] = np.log(at2012_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2012_ordered[column] = at2012_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2012_ordered[column] = at2012_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2012_ordered['GDP'] = at2

In [358]:
at2012_ordered.head(2)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2012,19907330000.0,651.42,30560034.0,,23.714354,
1,ALB,Albania,2012,12246500000.0,4222.35,2900401.0,76.97,23.228506,4.343416


In [359]:
y = set(at2012_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(y))
print(set(total_trade['iso_o'].unique()) - set(at2012_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [360]:
at2012_ordered = at2012_ordered[at2012_ordered['iso_o'].isin(x)]
at2012_ordered = at2012_ordered.reset_index(drop = True)

### 2013

In [361]:
at2013 = pd.read_csv('../data/raw/dcfb9366-1163-4249-82b1-10ee87141062_Data.csv')
at2013_pruned = at2013.iloc[:265, :]

In [362]:
at2013_pruned = at2013_pruned.rename(columns = {'Country Code': 'iso_o'})
at2013_pruned = at2013_pruned.rename(columns = {'Country Name': 'Country'})
at2013_pruned = at2013_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2013_pruned = at2013_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2013_pruned = at2013_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2013_pruned = at2013_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [363]:
at2013_ordered = at2013_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [364]:
at2013_ordered[column] = at2013_ordered[column].replace('..', np.nan)
at2013_ordered[column] = at2013_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2013 = at2013_ordered['GDP'].mean()
at2013_ordered['GDP'] = at2013_ordered['GDP'].fillna(mean_gdp_2013)
at2013_ordered['log_GDP'] = np.log(at2013_ordered['GDP'])
at2013_ordered['log_trade_gdp_ratio'] = np.log(at2013_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2013_ordered[column] = at2013_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2013_ordered[column] = at2013_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2013_ordered['GDP'] = at2

In [365]:
z = set(at2013_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(z))
print(set(total_trade['iso_o'].unique()) - set(at2013_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [366]:
at2013_ordered = at2013_ordered[at2013_ordered['iso_o'].isin(x)]
at2013_ordered = at2013_ordered.reset_index(drop = True)

In [367]:
at2013_ordered.head(2)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2013,20146420000.0,637.09,31622704.0,,23.726292,
1,ALB,Albania,2013,12796990000.0,4420.23,2895092.0,75.75,23.272476,4.327438


### 2014

In [368]:
at2014 = pd.read_csv('../data/raw/208001ea-808a-41e1-a6f5-b26df80309cd_Data.csv')
at2014_pruned = at2014.iloc[:265, :]

In [369]:
at2014_pruned = at2014_pruned.rename(columns = {'Country Code': 'iso_o'})
at2014_pruned = at2014_pruned.rename(columns = {'Country Name': 'Country'})
at2014_pruned = at2014_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2014_pruned = at2014_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2014_pruned = at2014_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2014_pruned = at2014_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [370]:
at2014_ordered = at2014_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [371]:
at2014_ordered[column] = at2014_ordered[column].replace('..', np.nan)
at2014_ordered[column] = at2014_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2014 = at2014_ordered['GDP'].mean()
at2014_ordered['GDP'] = at2014_ordered['GDP'].fillna(mean_gdp_2014)
at2014_ordered['log_GDP'] = np.log(at2014_ordered['GDP'])
at2014_ordered['log_trade_gdp_ratio'] = np.log(at2014_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2014_ordered[column] = at2014_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2014_ordered[column] = at2014_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2014_ordered['GDP'] = at2

In [372]:
a = set(at2014_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(a))
print(set(total_trade['iso_o'].unique()) - set(at2014_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [373]:
at2014_ordered = at2014_ordered[at2014_ordered['iso_o'].isin(a)]
at2014_ordered = at2014_ordered.reset_index(drop = True)

In [374]:
at2014_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2014,20497130000.0,625.05,32792523.0,,23.743551,
1,ALB,Albania,2014,13296320000.0,4602.23,2889104.0,75.02,23.310753,4.317755
2,DZA,Algeria,2014,238942700000.0,6094.69,39205031.0,56.92,26.199489,4.041647


### 2015

In [375]:
at2015 = pd.read_csv('../data/raw/edbff4cc-3f9d-4141-81d0-e8217df7ac6e_Data.csv')
at2015_pruned = at2015.iloc[:265, :]

In [376]:
at2015_pruned = at2015_pruned.rename(columns = {'Country Code': 'iso_o'})
at2015_pruned = at2015_pruned.rename(columns = {'Country Name': 'Country'})
at2015_pruned = at2015_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2015_pruned = at2015_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2015_pruned = at2015_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2015_pruned = at2015_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [377]:
at2015_ordered = at2015_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [378]:
at2015_ordered[column] = at2015_ordered[column].replace('..', np.nan)
at2015_ordered[column] = at2015_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2015 = at2015_ordered['GDP'].mean()
at2015_ordered['GDP'] = at2015_ordered['GDP'].fillna(mean_gdp_2015)
at2015_ordered['log_GDP'] = np.log(at2015_ordered['GDP'])
at2015_ordered['log_trade_gdp_ratio'] = np.log(at2015_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2015_ordered[column] = at2015_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2015_ordered[column] = at2015_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2015_ordered['GDP'] = at2

In [379]:
b = set(at2015_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(b))
print(set(total_trade['iso_o'].unique()) - set(at2015_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [380]:
at2015_ordered = at2015_ordered[at2015_ordered['iso_o'].isin(b)]
at2015_ordered = at2015_ordered.reset_index(drop = True)

In [381]:
at2015_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2015,19134220000.0,565.57,33831764.0,,23.674744,
1,ALB,Albania,2015,11470170000.0,3981.73,2880703.0,71.28,23.163016,4.266616
2,DZA,Algeria,2015,187493900000.0,4685.06,40019529.0,53.2,25.957012,3.974058


### 2016

In [382]:
at2016 = pd.read_csv('../data/raw/db97317f-3a63-42d8-ab78-5156a215931d_Data.csv')
at2016_pruned = at2016.iloc[:265, :]

In [383]:
at2016_pruned = at2016_pruned.rename(columns = {'Country Code': 'iso_o'})
at2016_pruned = at2016_pruned.rename(columns = {'Country Name': 'Country'})
at2016_pruned = at2016_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2016_pruned = at2016_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2016_pruned = at2016_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2016_pruned = at2016_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [384]:
at2016_ordered = at2016_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [385]:
at2016_ordered[column] = at2016_ordered[column].replace('..', np.nan)
at2016_ordered[column] = at2016_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2016 = at2016_ordered['GDP'].mean()
at2016_ordered['GDP'] = at2016_ordered['GDP'].fillna(mean_gdp_2016)
at2016_ordered['log_GDP'] = np.log(at2016_ordered['GDP'])
at2016_ordered['log_trade_gdp_ratio'] = np.log(at2016_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2016_ordered[column] = at2016_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2016_ordered[column] = at2016_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2016_ordered['GDP'] = at2

In [386]:
at2016_ordered.shape

(265, 9)

In [387]:
c = set(at2016_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(c))
print(set(total_trade['iso_o'].unique()) - set(at2016_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [388]:
at2016_ordered = at2016_ordered[at2016_ordered['iso_o'].isin(c)]
at2016_ordered = at2016_ordered.reset_index(drop = True)

In [389]:
at2016_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2016,18116570000.0,522.08,34700612.0,,23.620093,
1,ALB,Albania,2016,11988670000.0,4168.38,2876101.0,74.01,23.207228,4.3042
2,DZA,Algeria,2016,180763800000.0,4424.99,40850721.0,49.76,25.920457,3.907211


### 2017

In [390]:
at2017 = pd.read_csv('../data/raw/358bd016-ff18-4e14-99cf-fff07cb91245_Data.csv')
at2017_pruned = at2017.iloc[:265, :]

In [391]:
at2017_pruned = at2017_pruned.rename(columns = {'Country Code': 'iso_o'})
at2017_pruned = at2017_pruned.rename(columns = {'Country Name': 'Country'})
at2017_pruned = at2017_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2017_pruned = at2017_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2017_pruned = at2017_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2017_pruned = at2017_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [392]:
at2017_ordered = at2017_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [393]:
at2017_ordered[column] = at2017_ordered[column].replace('..', np.nan)
at2017_ordered[column] = at2017_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2017 = at2017_ordered['GDP'].mean()
at2017_ordered['GDP'] = at2017_ordered['GDP'].fillna(mean_gdp_2017)
at2017_ordered['log_GDP'] = np.log(at2017_ordered['GDP'])
at2017_ordered['log_trade_gdp_ratio'] = np.log(at2017_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2017_ordered[column] = at2017_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2017_ordered[column] = at2017_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2017_ordered['GDP'] = at2

In [394]:
d = set(at2017_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(d))
print(set(total_trade['iso_o'].unique()) - set(at2017_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [395]:
at2017_ordered = at2017_ordered[at2017_ordered['iso_o'].isin(d)]
at2017_ordered = at2017_ordered.reset_index(drop = True)

In [396]:
at2017_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2017,18753460000.0,525.47,35688935.0,,23.654644,
1,ALB,Albania,2017,13258270000.0,4614.05,2873457.0,76.79,23.307887,4.341074
2,DZA,Algeria,2017,189880900000.0,4554.67,41689299.0,49.82,25.969663,3.908417


### 2018

In [397]:
at2018 = pd.read_csv('../data/raw/67fe077d-c534-487a-9ae1-06e1e4c2cdf7_Data.csv')
at2018_pruned = at2018.iloc[:265, :]

In [398]:
at2018_pruned = at2018_pruned.rename(columns = {'Country Code': 'iso_o'})
at2018_pruned = at2018_pruned.rename(columns = {'Country Name': 'Country'})
at2018_pruned = at2018_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2018_pruned = at2018_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2018_pruned = at2018_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2018_pruned = at2018_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [399]:
at2018_ordered = at2018_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [400]:
at2018_ordered[column] = at2018_ordered[column].replace('..', np.nan)
at2018_ordered[column] = at2018_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2018 = at2018_ordered['GDP'].mean()
at2018_ordered['GDP'] = at2018_ordered['GDP'].fillna(mean_gdp_2018)
at2018_ordered['log_GDP'] = np.log(at2018_ordered['GDP'])
at2018_ordered['log_trade_gdp_ratio'] = np.log(at2018_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2018_ordered[column] = at2018_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2018_ordered[column] = at2018_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2018_ordered['GDP'] = at2

In [401]:
e = set(at2018_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(e))
print(set(total_trade['iso_o'].unique()) - set(at2018_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [402]:
at2018_ordered = at2018_ordered[at2018_ordered['iso_o'].isin(e)]
at2018_ordered = at2018_ordered.reset_index(drop = True)

In [403]:
at2018_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2018,18053220000.0,491.34,36743039.0,,23.61659,
1,ALB,Albania,2018,15379510000.0,5365.49,2866376.0,75.69,23.456302,4.326646
2,DZA,Algeria,2018,194554500000.0,4577.21,42505035.0,52.44,25.993978,3.95967


### 2019

In [404]:
at2019 = pd.read_csv('../data/raw/d9e55111-e6c1-4649-9899-5f5de8d3a55e_Data.csv')
at2019_pruned = at2019.iloc[:265, :]

In [405]:
at2019_pruned = at2019_pruned.rename(columns = {'Country Code': 'iso_o'})
at2019_pruned = at2019_pruned.rename(columns = {'Country Name': 'Country'})
at2019_pruned = at2019_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2019_pruned = at2019_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2019_pruned = at2019_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2019_pruned = at2019_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [406]:
at2019_ordered = at2019_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [407]:
at2019_ordered[column] = at2019_ordered[column].replace('..', np.nan)
at2019_ordered[column] = at2019_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2019 = at2019_ordered['GDP'].mean()
at2019_ordered['GDP'] = at2019_ordered['GDP'].fillna(mean_gdp_2019)
at2019_ordered['log_GDP'] = np.log(at2019_ordered['GDP'])
at2019_ordered['log_trade_gdp_ratio'] = np.log(at2019_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2019_ordered[column] = at2019_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2019_ordered[column] = at2019_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2019_ordered['GDP'] = at2

In [408]:
f = set(at2019_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(f))
print(set(total_trade['iso_o'].unique()) - set(at2019_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [409]:
at2019_ordered = at2019_ordered[at2019_ordered['iso_o'].isin(f)]
at2019_ordered = at2019_ordered.reset_index(drop = True)

In [410]:
at2019_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2019,18799440000.0,496.6,37856121.0,,23.657093,
1,ALB,Albania,2019,15585110000.0,5460.43,2854191.0,75.38,23.469582,4.322542
2,DZA,Algeria,2019,193459700000.0,4468.45,43294546.0,46.51,25.988335,3.839667


### 2020

In [411]:
at2020 = pd.read_csv('../data/raw/4b321dfd-f890-48d9-92af-5de0091f24ff_Data.csv')
at2020_pruned = at2020.iloc[:265, :]

In [412]:
at2020_pruned = at2020_pruned.rename(columns = {'Country Code': 'iso_o'})
at2020_pruned = at2020_pruned.rename(columns = {'Country Name': 'Country'})
at2020_pruned = at2020_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2020_pruned = at2020_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2020_pruned = at2020_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2020_pruned = at2020_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [413]:
at2020_ordered = at2020_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [414]:
at2020_ordered[column] = at2020_ordered[column].replace('..', np.nan)
at2020_ordered[column] = at2020_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2020 = at2020_ordered['GDP'].mean()
at2020_ordered['GDP'] = at2020_ordered['GDP'].fillna(mean_gdp_2020)
at2020_ordered['log_GDP'] = np.log(at2020_ordered['GDP'])
at2020_ordered['log_trade_gdp_ratio'] = np.log(at2020_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2020_ordered[column] = at2020_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2020_ordered[column] = at2020_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2020_ordered['GDP'] = at2

In [415]:
g = set(at2020_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(g))
print(set(total_trade['iso_o'].unique()) - set(at2020_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [416]:
at2020_ordered = at2020_ordered[at2020_ordered['iso_o'].isin(g)]
at2020_ordered = at2020_ordered.reset_index(drop = True)

In [417]:
at2020_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2020,19955930000.0,510.79,39068979.0,46.71,23.716792,3.843958
1,ALB,Albania,2020,15241460000.0,5370.78,2837849.0,59.52,23.447285,4.086312
2,DZA,Algeria,2020,164873400000.0,3743.54,44042091.0,40.39,25.828444,3.698582


### 2021

In [418]:
at2021 = pd.read_csv('../data/raw/32827991-5e23-4783-8b4a-dfed21113a1d_Data.csv')
at2021_pruned = at2021.iloc[:265, :]

In [419]:
at2021_pruned = at2021_pruned.rename(columns = {'Country Code': 'iso_o'})
at2021_pruned = at2021_pruned.rename(columns = {'Country Name': 'Country'})
at2021_pruned = at2021_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2021_pruned = at2021_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2021_pruned = at2021_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2021_pruned = at2021_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [420]:
at2021_ordered = at2021_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [421]:
at2021_ordered[column] = at2021_ordered[column].replace('..', np.nan)
at2021_ordered[column] = at2021_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2021 = at2021_ordered['GDP'].mean()
at2021_ordered['GDP'] = at2021_ordered['GDP'].fillna(mean_gdp_2021)
at2021_ordered['log_GDP'] = np.log(at2021_ordered['GDP'])
at2021_ordered['log_trade_gdp_ratio'] = np.log(at2021_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2021_ordered[column] = at2021_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2021_ordered[column] = at2021_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2021_ordered['GDP'] = at2

In [422]:
h = set(at2021_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(h))
print(set(total_trade['iso_o'].unique()) - set(at2021_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [423]:
at2021_ordered = at2021_ordered[at2021_ordered['iso_o'].isin(h)]
at2021_ordered = at2021_ordered.reset_index(drop = True)

In [424]:
at2021_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2021,14260000000.0,356.5,40000412.0,51.41,23.380724,3.939833
1,ALB,Albania,2021,18032010000.0,6413.28,2811666.0,75.59,23.615414,4.325324
2,DZA,Algeria,2021,186231200000.0,4160.56,44761099.0,46.84,25.950255,3.846738


### 2022

In [425]:
at2022 = pd.read_csv('../data/raw/eaf573f6-9b89-4ad5-bd2a-05fcbd790eaf_Data.csv')
at2022_pruned = at2022.iloc[:265, :]

In [426]:
at2022_pruned = at2022_pruned.rename(columns = {'Country Code': 'iso_o'})
at2022_pruned = at2022_pruned.rename(columns = {'Country Name': 'Country'})
at2022_pruned = at2022_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2022_pruned = at2022_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2022_pruned = at2022_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2022_pruned = at2022_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [427]:
at2022_ordered = at2022_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [428]:
at2022_ordered[column] = at2022_ordered[column].replace('..', np.nan)
at2022_ordered[column] = at2022_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2022 = at2022_ordered['GDP'].mean()
at2022_ordered['GDP'] = at2022_ordered['GDP'].fillna(mean_gdp_2022)
at2022_ordered['log_GDP'] = np.log(at2022_ordered['GDP'])
at2022_ordered['log_trade_gdp_ratio'] = np.log(at2022_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2022_ordered[column] = at2022_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2022_ordered[column] = at2022_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2022_ordered['GDP'] = at2

In [429]:
i = set(at2022_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(i))
print(set(total_trade['iso_o'].unique()) - set(at2022_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [430]:
at2022_ordered = at2022_ordered[at2022_ordered['iso_o'].isin(i)]
at2022_ordered = at2022_ordered.reset_index(drop = True)

In [431]:
at2022_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2022,14497240000.0,357.26,40578842.0,72.89,23.397224,4.288951
1,ALB,Albania,2022,19017240000.0,6846.43,2777689.0,84.7,23.668612,4.439116
2,DZA,Algeria,2022,225638500000.0,4961.55,45477389.0,51.2,26.1422,3.93574


### 2023

In [432]:
at2023 = pd.read_csv('../data/raw/446d4be1-fb6f-4d27-8275-9045049ee4c6_Data.csv')
at2023_pruned = at2023.iloc[:265, :]

In [433]:
at2023_pruned = at2023_pruned.rename(columns = {'Country Code': 'iso_o'})
at2023_pruned = at2023_pruned.rename(columns = {'Country Name': 'Country'})
at2023_pruned = at2023_pruned.rename(columns = {'GDP (current US$) [NY.GDP.MKTP.CD]': 'GDP'})
at2023_pruned = at2023_pruned.rename(columns = {'GDP per capita (current US$) [NY.GDP.PCAP.CD]': 'GDP_pct'})
at2023_pruned = at2023_pruned.rename(columns = {'Population, total [SP.POP.TOTL]': 'population'})
at2023_pruned = at2023_pruned.rename(columns = {'Trade (% of GDP) [NE.TRD.GNFS.ZS]': 'trade_gdp_ratio'})

In [434]:
at2023_ordered = at2023_pruned[['iso_o', 'Country', 'Time', 'GDP', 'GDP_pct', 'population', 'trade_gdp_ratio']]

In [435]:
at2023_ordered[column] = at2023_ordered[column].replace('..', np.nan)
at2023_ordered[column] = at2023_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)

mean_gdp_2023 = at2023_ordered['GDP'].mean()
at2023_ordered['GDP'] = at2023_ordered['GDP'].fillna(mean_gdp_2023)
at2023_ordered['log_GDP'] = np.log(at2023_ordered['GDP'])
at2023_ordered['log_trade_gdp_ratio'] = np.log(at2023_ordered['trade_gdp_ratio'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2023_ordered[column] = at2023_ordered[column].replace('..', np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2023_ordered[column] = at2023_ordered[column].apply(lambda x: pd.to_numeric(x, errors='coerce')).round(2)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  at2023_ordered['GDP'] = at2

In [436]:
j = set(at2023_ordered['iso_o'].unique()) & set(total_trade['iso_o'].unique())
print(len(j))
print(set(total_trade['iso_o'].unique()) - set(at2023_ordered['iso_o'].unique()))

190
{'MSR', 'TWN'}


In [437]:
at2023_ordered = at2023_ordered[at2023_ordered['iso_o'].isin(j)]
at2023_ordered = at2023_ordered.reset_index(drop = True)

In [438]:
at2023_ordered.head(3)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,2023,17152230000.0,413.76,41454761.0,67.58,23.565394,4.213312
1,ALB,Albania,2023,23547180000.0,8575.17,2745972.0,82.49,23.882272,4.412677
2,DZA,Algeria,2023,247626200000.0,5364.03,46164219.0,43.68,26.235186,3.77689


In [439]:
at1999_ordered.to_csv('../data/cleaned/attribute1999.csv', encoding='utf-8', index=False)
at2000_ordered.to_csv('../data/cleaned/attribute2000.csv', encoding='utf-8', index=False)
at2001_ordered.to_csv('../data/cleaned/attribute2001.csv', encoding='utf-8', index=False)
at2002_ordered.to_csv('../data/cleaned/attribute2002.csv', encoding='utf-8', index=False)
at2003_ordered.to_csv('../data/cleaned/attribute2003.csv', encoding='utf-8', index=False)
at2004_ordered.to_csv('../data/cleaned/attribute2004.csv', encoding='utf-8', index=False)
at2005_ordered.to_csv('../data/cleaned/attribute2005.csv', encoding='utf-8', index=False)
at2006_ordered.to_csv('../data/cleaned/attribute2006.csv', encoding='utf-8', index=False)
at2007_ordered.to_csv('../data/cleaned/attribute2007.csv', encoding='utf-8', index=False)
at2008_ordered.to_csv('../data/cleaned/attribute2008.csv', encoding='utf-8', index=False)
at2009_ordered.to_csv('../data/cleaned/attribute2009.csv', encoding='utf-8', index=False)
at2010_ordered.to_csv('../data/cleaned/attribute2010.csv', encoding='utf-8', index=False)
at2011_ordered.to_csv('../data/cleaned/attribute2011.csv', encoding='utf-8', index=False)
at2012_ordered.to_csv('../data/cleaned/attribute2012.csv', encoding='utf-8', index=False)
at2013_ordered.to_csv('../data/cleaned/attribute2013.csv', encoding='utf-8', index=False)
at2014_ordered.to_csv('../data/cleaned/attribute2014.csv', encoding='utf-8', index=False)
at2015_ordered.to_csv('../data/cleaned/attribute2015.csv', encoding='utf-8', index=False)
at2016_ordered.to_csv('../data/cleaned/attribute2016.csv', encoding='utf-8', index=False)
at2017_ordered.to_csv('../data/cleaned/attribute2017.csv', encoding='utf-8', index=False)
at2018_ordered.to_csv('../data/cleaned/attribute2018.csv', encoding='utf-8', index=False)
at2019_ordered.to_csv('../data/cleaned/attribute2019.csv', encoding='utf-8', index=False)
at2020_ordered.to_csv('../data/cleaned/attribute2020.csv', encoding='utf-8', index=False)
at2021_ordered.to_csv('../data/cleaned/attribute2021.csv', encoding='utf-8', index=False)
at2022_ordered.to_csv('../data/cleaned/attribute2022.csv', encoding='utf-8', index=False)
at2023_ordered.to_csv('../data/cleaned/attribute2023.csv', encoding='utf-8', index=False)

### Preparing the RTA data

This dataset is collected from `https://www.ewf.uni-bayreuth.de/en/research/RTA-data/index.html`
### Steps:
1. keeping only the 'rta column and the years after 1999
2. checking for duplicates
3. removing self-loops
4. saving in the cleaned subfolder of the data folder

In [440]:
rta = pd.read_csv("../data/raw/rta_20241028.csv")

In [441]:
rta  = rta.rename(columns = {'exporter': 'iso_o'})
rta  = rta.rename(columns = {'importer': 'iso_d'})

In [442]:
rta['iso_o'].nunique()

280

In [443]:
rta = rta[rta['iso_o'] != rta['iso_d']]

In [444]:
print(rta.groupby(['iso_o', 'iso_d']).ngroups)

78120


In [445]:
rta.shape[0]

5780880

In [446]:
rta = rta[rta['year'] >= 2000]

In [447]:
rta.shape

(1874880, 11)

In [448]:
rta = rta[['iso_o', 'iso_d', 'year', 'rta']].reset_index(drop = True)

In [449]:
rta.head(2)

Unnamed: 0,iso_o,iso_d,year,rta
0,ABW,AFG,2000,0
1,ABW,AFG,2001,0


In [450]:
rta.to_csv("../data/cleaned/rta_long.csv", encoding = 'utf-8', index = False)

In [451]:
rta_edgelist = rta.pivot_table(values = 'rta', index = ['iso_o', 'iso_d'], columns = 'year').reset_index()

In [452]:
rta_edgelist[(rta_edgelist['iso_o'] == 'BGD') & (rta_edgelist['iso_d'] == 'IND')] #sanity check 

year,iso_o,iso_d,2000,2001,2002,2003,2004,2005,2006,2007,...,2014,2015,2016,2017,2018,2019,2020,2021,2022,2023
7370,BGD,IND,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [453]:
rta_edgelist.to_csv("../data/cleaned/rta_edgelist.csv", encoding = 'utf-8', index = False)

### Concatenating time variant indicators for all countries in a single data frame

In [454]:
long = pd.concat([at1999_ordered, at2000_ordered, at2001_ordered, at2002_ordered, at2003_ordered, at2004_ordered, at2005_ordered, at2006_ordered, at2007_ordered, at2008_ordered,at2009_ordered, at2010_ordered, at2011_ordered, at2012_ordered, at2013_ordered, at2014_ordered, at2015_ordered, at2016_ordered, at2017_ordered, at2018_ordered, at2019_ordered, at2020_ordered, at2021_ordered, at2022_ordered, at2023_ordered], axis = 0) 

In [455]:
print(long['iso_o'].nunique())
print(long['Time'].nunique())
print(long['Time'].unique())

190
25
['1999' '2000' '2001' '2002' '2003' '2004' '2005' '2006' '2007' '2008'
 '2009' '2010' '2011' '2012' '2013' '2014' '2015' '2016' '2017' '2018'
 '2019' '2020' '2021' '2022' '2023']


In [456]:
long.head(2)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,AFG,Afghanistan,1999,834785900000.0,,19887785.0,,27.450441,
1,ALB,Albania,1999,3283942000.0,1056.34,3108778.0,49.9,21.91231,3.910021


In [457]:
missing10 = at2010_ordered.isna().sum()
missing10

iso_o                   0
Country                 0
Time                    0
GDP                     0
GDP_pct                 1
population              0
trade_gdp_ratio        24
log_GDP                 0
log_trade_gdp_ratio    24
dtype: int64

In [458]:
at2010_ordered[at2010_ordered['GDP_pct'].isna()]

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
93,PRK,"Korea, Dem. People's Rep.",2010,1798902000000.0,,24987258.0,,28.218197,


In [459]:
long_sorted = long.sort_values(['iso_o', 'Country', 'Time']).reset_index(drop= True)

In [460]:
long_sorted.head(30)

Unnamed: 0,iso_o,Country,Time,GDP,GDP_pct,population,trade_gdp_ratio,log_GDP,log_trade_gdp_ratio
0,ABW,Aruba,1999,1722905000.0,19216.2,89659.0,164.56,21.267278,5.103275
1,ABW,Aruba,2000,1873453000.0,20681.02,90588.0,145.07,21.351049,4.977216
2,ABW,Aruba,2001,1896457000.0,20740.13,91439.0,140.39,21.363253,4.944424
3,ABW,Aruba,2002,1961844000.0,21307.25,92074.0,133.23,21.39715,4.892077
4,ABW,Aruba,2003,2044112000.0,21949.49,93128.0,132.79,21.438229,4.888769
5,ABW,Aruba,2004,2254831000.0,23700.63,95138.0,132.43,21.536341,4.886054
6,ABW,Aruba,2005,2360017000.0,24171.84,97635.0,145.05,21.581935,4.977079
7,ABW,Aruba,2006,2469783000.0,24845.66,99405.0,141.04,21.627396,4.949044
8,ABW,Aruba,2007,2677641000.0,26736.31,100150.0,139.97,21.708202,4.941428
9,ABW,Aruba,2008,2843025000.0,28171.91,100917.0,139.11,21.768134,4.935265


In [461]:
#qq_sorted['first_diff_lnGDP'] = (qq_sorted.groupby(['iso_o', 'Time'])['log_GDP'].transform(first_difference))

In [462]:
long_sorted.to_csv('../data/cleaned/attributes2000_2023.csv', encoding='utf-8', index=False)