# GDP Analysis using Currency Indicators (in Current US$)

This notebook analyses currency indicators (in current US$) such as market capitalization, GNI (gross national income), and exports of goods and services.

In [None]:
import numpy as np #linear algebra
import pandas as pd #data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib import pyplot as plt

In [None]:
# Load the indicator table. `index_col=False` tells pandas not to use the
# first CSV column as the row index.
# NOTE(review): this is an absolute, machine-specific Windows path — move it
# behind a configurable data directory before sharing the notebook.
num_df = pd.read_csv(r"C:\Users\acer\GDP FINAL PRO\Data in CSV\Number Indiactors.csv", index_col=False)

# Every later cell reads `cur_df`, but no cell defines it, so a fresh
# "Restart & Run All" stops with a NameError. Bind it to the loaded frame.
# TODO(review): confirm that this CSV ("Number Indiactors.csv") really is the
# currency-indicator table the later cells expect; if not, load that file here.
cur_df = num_df

In [None]:
# Preview the first ten rows of the loaded table.
num_df.head(n=10)

In [None]:
# Preview the last five rows (five is the pandas default for tail()).
cur_df.tail(n=5)

In [None]:
# (number of rows, number of columns) of the table.
cur_df.shape

In [None]:
# Column labels of the table; later cells select columns by these exact names.
cur_df.columns

In [None]:
# Distinct country names present in the table. `unique` is a method, so it
# must be called; without the parentheses the cell only shows the method's repr.
cur_df['Country Name'].unique()

In [None]:
# Print the per-column count of missing values, then display summary
# statistics for every column (numeric and non-numeric alike).
print('number of missing data:', cur_df.isna().sum(), sep='\n')
cur_df.describe(include='all')

### Data exploration

The correlation heatmap shows the pairwise correlation between all numerical columns.

In [None]:
# Pairwise correlation of the numeric indicator columns (the first two columns
# are skipped by the positional slice). Selecting numeric dtypes explicitly
# keeps `.corr()` working on pandas >= 2.0, which raises on non-numeric
# columns instead of silently dropping them.
numeric_cols = cur_df.iloc[:, 2:].select_dtypes(include='number')

fig, ax = plt.subplots(figsize=(16, 12))
sns.heatmap(data=numeric_cols.corr(), annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
plt.show()

#### Top 20 countries with the highest GDP (current US$)

In [None]:
# Bar chart of the 20 rows with the largest GDP, plus one extra bar holding
# the mean GDP over the whole table as a reference.
gdp_col = 'GDP (current US$) [NY.GDP.MKTP.CD]'

top_gdp_countries = cur_df.sort_values(gdp_col, ascending=False).head(20)
mean = pd.DataFrame({'Country Name': ['World mean'], gdp_col: [cur_df[gdp_col].mean()]})
gdps = pd.concat(
    [top_gdp_countries[['Country Name', gdp_col]], mean],
    ignore_index=True,
)

fig, ax = plt.subplots(figsize=(16, 6))
sns.barplot(data=gdps, x='Country Name', y=gdp_col, palette='Set1')

# Re-apply the automatically chosen axis labels with extra padding, then
# enlarge both of them.
ax.set_xlabel(ax.get_xlabel(), labelpad=15)
ax.set_ylabel(ax.get_ylabel(), labelpad=30)
for axis_label in (ax.xaxis.label, ax.yaxis.label):
    axis_label.set_fontsize(16)

plt.xticks(rotation=90)
plt.show()

In [None]:
# Disabled exploratory cell (everything below is commented out): a 3x3 grid of
# scatter plots of GDP against other indicator columns, taken in order of
# absolute correlation with GDP. The GDP and Exports columns are excluded.
# fig, axes = plt.subplots(nrows=3, ncols=3, figsize=(20,20))
# plt.subplots_adjust(hspace=0.4)

# corr_to_gdp = pd.Series(dtype=str)
# for col in cur_df.columns.values[2:]:
#     if ((col!='GDP (current US$) [NY.GDP.MKTP.CD]')&(col!='Exports of goods and services (current US$) [NE.EXP.GNFS.CD]')):
#         corr_to_gdp[col] = cur_df['GDP (current US$) [NY.GDP.MKTP.CD]'].corr(cur_df[col])
# abs_corr_to_gdp = corr_to_gdp.abs().sort_values(ascending=False)
# corr_to_gdp = corr_to_gdp.loc[abs_corr_to_gdp.index]

# for i in range(3):
#     for j in range(3):
#         sns.regplot(x=corr_to_gdp.index.values[i*3+j], y='GDP (current US$) [NY.GDP.MKTP.CD]', data=cur_df,
#                    ax=axes[i,j], fit_reg=False, marker='.')
#         title = 'correlation='+str(corr_to_gdp[i*3+j])
#         axes[i,j].set_title(title)
# axes[1,2].set_xlim(0,102)
# plt.show()

### Countries with low market capitalization of listed domestic companies and low GDP per capita

Some features, such as exports of goods and services, are related to GDP more or less linearly, while others are not. For example, high market capitalization usually means low GDP in US$, but GDP among countries with low market capitalization of listed domestic companies can vary a lot.

In [None]:
# Rows whose market capitalization is below 14 and whose GDP per capita is
# below 10,000 (both columns are labelled as current US$).
# NOTE(review): a market-capitalization cutoff of 14 US$ looks implausibly
# small — confirm the intended threshold/units.
low_market_cap = cur_df['Market capitalization of listed domestic companies (current US$) [CM.MKT.LCAP.CD]'] < 14
low_gdp_per_capita = cur_df['GDP per capita (current US$) [NY.GDP.PCAP.CD]'] < 10000
cur_df.loc[low_market_cap & low_gdp_per_capita]

### Data Modeling

In [None]:
# Preview the first ten rows again before modelling.
cur_df.head(n=10)

Label-encode the categorical feature 'Country Name', and otherwise use all the features given in the dataset without further engineering.

In [None]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_squared_log_error

In [None]:
# Fit a label encoder on the country names, store the resulting integer codes
# in a new 'Country Label' column, and preview the result.
LE = LabelEncoder().fit(cur_df['Country Name'])
cur_df['Country Label'] = LE.transform(cur_df['Country Name'])
cur_df.head(n=10)

In [None]:
# Distinct integer codes produced by the label encoder. `unique` is a method,
# so it must be called; without the parentheses the cell only shows its repr.
cur_df['Country Label'].unique()

In [None]:
# Feature columns fed to the regression model. Each column appears exactly
# once: the Exports column used to be listed twice, which put a duplicated
# (perfectly collinear) column into the design matrix.
# NOTE(review): if the second entry was meant to be a different indicator,
# add that column here.
training_features = ['Exports of goods and services (current US$) [NE.EXP.GNFS.CD]',
                     'GDP per capita (current US$) [NY.GDP.PCAP.CD]',
                     'GNI (current US$) [NY.GNP.MKTP.CD]',
                     'Gross domestic savings (current US$) [NY.GDS.TOTL.CD]',
                     'Market capitalization of listed domestic companies (current US$) [CM.MKT.LCAP.CD]',
                     'Total reserves (includes gold, current US$) [FI.RES.TOTL.CD]',
                     'Present value of external debt (current US$) [DT.DOD.PVLX.CD]',
                     'Country Label']

target = 'GDP (current US$) [NY.GDP.MKTP.CD]'

# Shuffled 70/30 split. The fixed random_state makes the split, and every
# metric computed from it, reproducible across kernel restarts.
train, test = train_test_split(cur_df, test_size=0.3, shuffle=True, random_state=42)

train_X = train[training_features]
train_Y = train[target]
test_X = test[training_features]
test_Y = test[target]

In [None]:
# Report the shape of the training feature matrix, then display it.
print(f"X train shape: {train_X.shape}")
train_X

In [None]:
# Report the shape of the training target, then display it.
# (The printed label previously read "Y tarin shape:".)
print("Y train shape:", train_Y.shape)
train_Y

In [None]:
# Report the shape of the test feature matrix, then display it.
print(f"X test shape: {test_X.shape}")
test_X

In [None]:
# Report the shape of the test target, then display it.
print(f"Y test shape: {test_Y.shape}")
test_Y

In [None]:
# Ordinary least-squares regression fitted on the training split only.
# fit() returns the estimator itself, so `model` is the fitted object; the
# bare expression keeps the estimator repr as the cell output.
model = LinearRegression().fit(train_X, train_Y)
model

In [None]:
# pip install statsmodels

In [None]:
# Predict on both splits. Negative predictions are clipped to 0 because the
# log-based metric below is meant for non-negative values. Only a lower bound
# is needed; the old upper bound was each array's own maximum, i.e. a no-op.
train_pred_Y = pd.Series(model.predict(train_X).clip(min=0), index=train_Y.index)
test_pred_Y = pd.Series(model.predict(test_X).clip(min=0), index=test_Y.index)

# scikit-learn metrics take (y_true, y_pred) in that order.
rmse_train = np.sqrt(mean_squared_error(train_Y, train_pred_Y))
msle_train = mean_squared_log_error(train_Y, train_pred_Y)
rmse_test = np.sqrt(mean_squared_error(test_Y, test_pred_Y))
msle_test = mean_squared_log_error(test_Y, test_pred_Y)

print('rmse_train:',rmse_train,'msle_train:',msle_train)
print('rmse_test:',rmse_test,'msle_test:',msle_test)

In [None]:
# `plot_confusion_matrix` is no longer imported: it was removed in
# scikit-learn 1.2, so importing it fails there, and this cell never used it.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, roc_curve
from sklearn.metrics import explained_variance_score, r2_score
from sklearn import metrics

# R squared (coefficient of determination) on the held-out test split.
# r2_score is used because the value is reported as "R Square";
# explained_variance_score is a different metric that ignores any constant
# offset in the residuals.
r_square = r2_score(test_Y, test_pred_Y)

# Print the metric as a percentage, rounded to 4 decimal places.
print('Testing Metrics for Linear Regression')
print(f'R Square: {r_square * 100:.4f} %')

### Conclusion

The model achieves a very high R-squared of about 0.9998 (99.98%), which means that our regressors explain about 99.98% of the overall variability of the dependent variable.

### Checking with statsmodels and a line of best fit

We plot the line of best fit between the GDP predicted by our model and the actual GDP. We see a strong alignment.

In [None]:
# Names of the regressor columns for the full-data regression. Each column
# appears exactly once: the Exports column used to be listed twice, which
# duplicated it in the regressor matrix.
X = ['Exports of goods and services (current US$) [NE.EXP.GNFS.CD]',
     'GDP per capita (current US$) [NY.GDP.PCAP.CD]',
     'GNI (current US$) [NY.GNP.MKTP.CD]',
     'Gross domestic savings (current US$) [NY.GDS.TOTL.CD]',
     'Market capitalization of listed domestic companies (current US$) [CM.MKT.LCAP.CD]',
     'Total reserves (includes gold, current US$) [FI.RES.TOTL.CD]',
     'Present value of external debt (current US$) [DT.DOD.PVLX.CD]',
     'Country Label']

In [None]:
# We create our matrix of regressors (independent variables)
X=cur_df[X]

# We create our dependant variable
y=cur_df['GDP (current US$) [NY.GDP.MKTP.CD]']

In [None]:
# We create a linear regression object
lm = LinearRegression()

#fit model
lm.fit(X,y)

In [None]:
import statsmodels.api as sm

# Ordinary least squares on the same regressors and target as `lm`, using the
# array interface so the summary table is labelled with the real column names.
# The formula "y ~ X" only worked by capturing notebook variables and labelled
# the terms X[0], X[1], ...
# `lm` is not refitted here and `model` is not rebound, so `model` keeps
# referring to the estimator fitted on the training split.
design = sm.add_constant(X)  # explicit intercept column, as the formula API added implicitly
result = sm.OLS(y, design, missing='drop').fit()  # drop rows with missing values, as the formula API did
print(result.summary())

In [None]:
# In-sample predictions: `lm` was fitted on this same X.
p=lm.predict(X)

In [None]:
# Scatter of actual GDP (x-axis) against predicted GDP (y-axis) with a fitted
# regression line.
SIZE2 = 20  # tick-label font size; the original assignment had no value (a SyntaxError) — adjust to taste

plt.figure(num=3, figsize=(20, 10), dpi=90, facecolor='w', edgecolor='aqua')

# x and y are passed by keyword: current seaborn releases reject positional
# data vectors. `data=` is not needed because both are passed as vectors.
ax = sns.regplot(x=y, y=p, marker='*', scatter_kws={"s": 350})

# Size the tick labels on this Axes directly. plt.rc(...) would only affect
# axes created afterwards and would change global state.
ax.tick_params(axis='both', labelsize=SIZE2)

ax.set_title('Predicted GDP vs Actual GDP', size=30)
ax.set_xlabel('Actual value', size=26)
ax.set_ylabel('Predicted value', size=26)
plt.show()

In [None]:
# Residuals: actual GDP minus the prediction of `lm`, row by row.
Errors = y.sub(p)
print(Errors)

### Conclusion

There is strong multicollinearity between the regressors, so the individual coefficient estimates should be interpreted with caution.