# Pandas, Matplotlib and Seaborn

In [2]:
import pandas as pd

[Dataset Soruce](https://www.kaggle.com/datasets/rajkumarpandey02/gdp-in-usd-per-capita-income-by-country)

CONTENT
<p>The figures presented here do not take into account differences in the cost of living in different countries, and the results vary greatly from one year to another based on fluctuations in the exchange rates of the country's currency. Such fluctuations change a country's ranking from one year to the next, even though they often make little or no difference to the standard of living of its population.

GDP per capita is often considered an indicator of a country's standard of living; however, this is inaccurate because GDP per capita is not a measure of personal income.

Comparisons of national income are also frequently made on the basis of purchasing power parity (PPP), to adjust for differences in the cost of living in different countries. (See List of countries by GDP (PPP) per capita.) PPP largely removes the exchange rate problem but not others; it does not reflect the value of economic output in international trade, and it also requires more estimation than GDP per capita. On the whole, PPP per capita figures are more narrowly spread than nominal GDP per capita figures.</p>

Here are some resources to learn about GDP:
#### [World Bank](https://data.worldbank.org/indicator/ny.gdp.pcap.cd?most_recent_value_desc=false)
#### [Our World in Data](https://ourworldindata.org/grapher/gdp-per-capita-worldbank)
#### [IMF](https://www.imf.org/external/datamapper/NGDPD@WEO/OEMDC/ADVEC/WEOWORLD)
#### [UN Data](https://data.un.org/Data.aspx?d=SNAAMA&f=grID%3A101%3BcurrID%3AUSD%3BpcFlag%3A1)



In [3]:
df = pd.read_csv("GDP (nominal) per Capita.csv",encoding= 'unicode_escape',  index_col=0)

## EDA (Exploratory Data Analysis)

### Use this section to explore and inspect dataset.

In [4]:
# number of countries per region
region_counts = df['UN_Region'].value_counts().reset_index()
region_counts.columns = ['Region', 'Number of Countries']
print(region_counts)



     Region  Number of Countries
0    Africa                   55
1      Asia                   51
2  Americas                   48
3    Europe                   48
4   Oceania                   20
5     World                    1


In [6]:
#What is European Union[n 1]?
eu_row = df[df['Country/Territory'].str.contains('European Union')]
print(eu_row.to_string(index=False))

  Country/Territory UN_Region  IMF_Estimate  IMF_Year  WorldBank_Estimate  WorldBank_Year  UN_Estimate UN_Year
European Union[n 1]    Europe         39940      2023               38411            2021        31875    2021


In [8]:
# Countries in Europe below average
europe_countries = df[(df['UN_Region'] == 'Europe') &
                      (~df['Country/Territory'].str.contains('European Union', na=False))]
europe_avg_gdp = europe_countries['IMF_Estimate'].mean()
below_avg = europe_countries[europe_countries['IMF_Estimate'] < europe_avg_gdp]
below_avg_sorted = below_avg.sort_values('IMF_Estimate')
print(f"European average GDP per capita (IMF 2023): ${europe_avg_gdp:,.2f}\n")
print("European countries below average:")
print(below_avg_sorted[['Country/Territory', 'IMF_Estimate']].to_string(index=False))


European average GDP per capita (IMF 2023): $34,329.87

European countries below average:
     Country/Territory  IMF_Estimate
                Monaco             0
         Liechtenstein             0
           Isle of Man             0
       Channel Islands             0
         Faroe Islands             0
               Ukraine          4654
                Kosovo          5641
               Moldova          6342
               Albania          7058
       North Macedonia          7384
               Belarus          7944
Bosnia and Herzegovina          8223
                Serbia         10849
            Montenegro         11289
                Russia         14403
              Bulgaria         14893
               Romania         18530
               Hungary         19385
                Poland         19912
               Croatia         20537
                Greece         22595
              Slovakia         23457
                Latvia         25136
              Portugal

In [9]:
# Which countries in Europe has higher GDP than UK?
uk_gdp = df[df['Country/Territory'] == 'United Kingdom']['IMF_Estimate'].values[0]
europe_countries = df[(df['UN_Region'] == 'Europe') &
                      (~df['Country/Territory'].str.contains('European Union', na=False))]
higher_than_uk = europe_countries[europe_countries['IMF_Estimate'] > uk_gdp]
higher_than_uk_sorted = higher_than_uk.sort_values('IMF_Estimate', ascending=False)

print(f"UK GDP per capita (IMF 2023): ${uk_gdp:,.2f}\n")
print("European countries with higher GDP per capita than the UK:")
print(higher_than_uk_sorted[['Country/Territory', 'IMF_Estimate']].to_string(index=False))

UK GDP per capita (IMF 2023): $46,371.00

European countries with higher GDP per capita than the UK:
Country/Territory  IMF_Estimate
       Luxembourg        132372
          Ireland        114581
           Norway        101103
      Switzerland         98767
          Iceland         75180
          Denmark         68827
      Netherlands         61098
          Austria         56802
           Sweden         55395
          Finland         54351
          Belgium         53377
       San Marino         52949
          Germany         51383


## groupby()

[Learn more about groupby](https://www.geeksforgeeks.org/pandas-groupby/)

In [15]:
grouped = df.groupby('UN_Region')['IMF_Estimate'].mean().reset_index()
rounded_grouped =grouped.round(2)
print(rounded_grouped)

  UN_Region  IMF_Estimate
0    Africa       2802.35
1  Americas      11871.04
2      Asia      16665.25
3    Europe      34446.75
4   Oceania       9133.15
5     World      13440.00


## Which countries below average by IMF world estimate?

In [21]:
#Step 1: Get world average GDP (IMF 2023 estimate)
world_avg = df[df['Country/Territory'] == 'World']['IMF_Estimate'].values[0]

# Step 2: Filter countries (exclude regions and world aggregate)
countries = df[(df['Country/Territory'] != 'World') &
               (~df['UN_Region'].isin(['World'])) &
               (~df['Country/Territory'].str.contains('European Union', na=False))]
 # Step 3: Find countries below world average
below_avg = countries[countries['IMF_Estimate'] < world_avg]

# Sort results by GDP (ascending)
below_avg_sorted = below_avg.sort_values('IMF_Estimate')

# Display results
print(f"World average GDP per capita (IMF 2023): ${world_avg:,.2f}\n")
print(f"Number of countries below average: {len(below_avg_sorted)}\n")
print("Countries below world average GDP:")
print(below_avg_sorted[['Country/Territory', 'UN_Region', 'IMF_Estimate']].to_string(index=True))

World average GDP per capita (IMF 2023): $13,440.00

Number of countries below average: 152

Countries below world average GDP:
                    Country/Territory UN_Region  IMF_Estimate
1                              Monaco    Europe             0
2                       Liechtenstein    Europe             0
5                             Bermuda  Americas             0
9                         Isle of Man    Europe             0
10                     Cayman Islands  Americas             0
14                    Channel Islands    Europe             0
15                      Faroe Islands    Europe             0
19                          Greenland  Americas             0
31             British Virgin Islands  Americas             0
37                  US Virgin Islands  Americas             0
39                      New Caledonia   Oceania             0
42                               Guam   Oceania             0
58          Sint Maarten (Dutch part)  Americas             0
61  

### IMF estimate 0 values

In [22]:
# Find rows where IMF_Estimate is 0 or NaN
zero_or_missing = df[(df['IMF_Estimate'] == 0) | (df['IMF_Estimate'].isna())]

# Display results
print(f"Countries with IMF Estimate = 0 or missing ({len(zero_or_missing)} found):")
print(zero_or_missing[['Country/Territory', 'UN_Region', 'IMF_Estimate']].to_string(index=False))

Countries with IMF Estimate = 0 or missing (26 found):
         Country/Territory UN_Region  IMF_Estimate
                    Monaco    Europe             0
             Liechtenstein    Europe             0
                   Bermuda  Americas             0
               Isle of Man    Europe             0
            Cayman Islands  Americas             0
           Channel Islands    Europe             0
             Faroe Islands    Europe             0
                 Greenland  Americas             0
    British Virgin Islands  Americas             0
         US Virgin Islands  Americas             0
             New Caledonia   Oceania             0
                      Guam   Oceania             0
 Sint Maarten (Dutch part)  Americas             0
  Northern Mariana Islands   Oceania             0
Saint Martin (French part)  Americas             0
  Turks and Caicos Islands  Americas             0
          French Polynesia   Oceania             0
              Cook Islands 

## Which country has highest UN Estimate?

In [24]:
# Find country with the highest UN Estimate
highest_un_estimate = df[df['UN_Estimate'] == df['UN_Estimate'].max()]

# Display the results
print(highest_un_estimate[['Country/Territory', 'UN_Region', 'UN_Estimate']].to_string(index=False))

Country/Territory UN_Region  UN_Estimate
           Monaco    Europe       234317


## Which country has highest Worlbank Estimate?

In [26]:
# Which conntry has the highest Workbank Estimate?
highest_wb_estimate = df[df['WorldBank_Estimate'] == df['WorldBank_Estimate'].max()]

# Display the results
print(highest_wb_estimate[['Country/Territory', 'UN_Region', 'WorldBank_Estimate']].to_string(index=False))

Country/Territory UN_Region  WorldBank_Estimate
           Monaco    Europe              234316


## Which country has highest IMF Estimate?

In [27]:
# Which country has highest IMF Estimate?
highest_imf_estimate = df[df['IMF_Estimate'] == df['IMF_Estimate'].max()]

# Display the results
print(highest_imf_estimate[['Country/Territory', 'UN_Region', 'IMF_Estimate']].to_string(index=False))

Country/Territory UN_Region  IMF_Estimate
       Luxembourg    Europe        132372


## Filling 0 Values by average

In [29]:
import numpy as np

In [32]:
# replace 0 with null values
df['IMF_Estimate'] = df['IMF_Estimate'].replace(0, np.nan)

# Display the results
print(df[['Country/Territory', 'UN_Region', 'IMF_Estimate']].to_string(index=False))

               Country/Territory UN_Region  IMF_Estimate
                          Monaco    Europe           NaN
                   Liechtenstein    Europe           NaN
                      Luxembourg    Europe      132372.0
                         Ireland    Europe      114581.0
                         Bermuda  Americas           NaN
                          Norway    Europe      101103.0
                     Switzerland    Europe       98767.0
                       Singapore      Asia       91100.0
                     Isle of Man    Europe           NaN
                  Cayman Islands  Americas           NaN
                           Qatar      Asia       83891.0
                   United States  Americas       80034.0
                         Iceland    Europe       75180.0
                 Channel Islands    Europe           NaN
                   Faroe Islands    Europe           NaN
                         Denmark    Europe       68827.0
                       Australi

In [34]:
# Calculate the average of 'Worldbank_Estimate' and 'UN_Estimate' columns
avg_worldbank_un = df[['WorldBank_Estimate', 'UN_Estimate']].mean(axis=1)

# Display the results
print(avg_worldbank_un)


1      234316.5
2      163507.5
3      133667.5
4      100640.5
5      113371.5
         ...   
219       624.0
220       736.0
221       492.5
222       371.0
223       266.5
Length: 223, dtype: float64


In [None]:
# Fill the null values in 'imf' column with the calculated average
df['IMF_Estimate'] = df['IMF_Estimate'].fillna(avg_worldbank_un)

# Display the results
print(df[['Country/Territory', 'UN_Region', 'IMF_Estimate']].to_string(index=False))


In [None]:
# Drop the temporary 'avg_worldbank_un' column if not needed


[Visit this link to learn more about ffill](https://www.w3schools.com/python/pandas/ref_df_ffill.asp)

[Visit this link to learn more about bfill](https://www.w3schools.com/python/pandas/ref_df_bfill.asp)

## Checking Missing Values

## Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

### Histogram

In [None]:
df.hist(figsize=(10,8))
plt.show()

In [None]:
df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].hist(figsize=(12,9))

plt.show()

In [None]:
df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].hist(bins=5, figsize=(12,9))

plt.show()

In [None]:
df["WorldBank_Estimate"].agg(["min","max"])

In [None]:
234316/5
#1 bin size if bins=5

In [None]:
df[df["WorldBank_Estimate"]<=46863.2]["WorldBank_Estimate"].count()

In [None]:
234316/10
#1 bin size if bins not given any number

In [None]:
df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].hist(bins=3, figsize=(12,9))

plt.show()

In [None]:
df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].hist(bins=15, figsize=(15,12))

#23400/15 = 15300
plt.show()

### Correlation Heatmap

In [None]:
df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].corr()

In [None]:
corr = df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].corr()

plt.figure(figsize=(9,6))
sns.heatmap(corr)

plt.show()

In [None]:
corr = df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].corr()

plt.figure(figsize=(9,6))

sns.heatmap(corr, annot=True)

plt.show()

In [None]:
corr = df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].corr()

plt.figure(figsize=(9,6))

sns.heatmap(corr, annot=True, fmt=".2f", cmap = 'GnBu', annot_kws={"size": 12})

plt.show()

In [None]:
corr = df[["IMF_Estimate", "UN_Estimate", "WorldBank_Estimate"]].corr()

plt.figure(figsize=(9,6))

sns.heatmap(corr, annot=True, cmap = 'Purples')

plt.title("Correlation Map")


plt.show()

In [None]:
corr = df.select_dtypes(include=[int, float]).corr()

plt.figure(figsize=(9,6))

sns.heatmap(corr, annot=True, cmap = 'Purples')

plt.show()

### Bar plot

In [None]:
df.head()

In [None]:
sns.barplot(x="UN_Region", y="WorldBank_Estimate", data=df, errorbar=None)

plt.show()

In [None]:
sns.barplot(x="WorldBank_Estimate", y="UN_Region", data=df, errorbar=None)

plt.show()

In [None]:
fig = plt.figure(figsize = (8,5))

ax = sns.barplot(x = "IMF_Estimate",  y = "UN_Region",
data = df, errorbar = None)

ax.bar_label(ax.containers[0])

plt.show()

In [None]:
fig = plt.figure(figsize = (8,5))
ax = sns.barplot(x = "UN_Region",  y = "IMF_Estimate",
                 data = df, errorbar = None)

ax.bar_label(ax.containers[0])


ax.set_title("Regions by IMF Estimate")
plt.show()

### Scatter Plot

In [None]:
df.plot(x='UN_Region', y='UN_Estimate', kind='scatter',
        figsize=(10,6),
        title="Scatter Plot")

plt.show()

### Boxplot and Outliers

![image.png](attachment:da6a7715-3b1c-4165-a2a9-4dc8ca096bd7.png)

In [35]:
sns.boxplot(x=df["UN_Estimate"])

plt.show()

NameError: name 'sns' is not defined

In [None]:
df[df["UN_Estimate"]>50000].head()

In [None]:
sns.boxplot(x=df["WorldBank_Estimate"])

plt.show()

In [None]:
sns.boxplot(x=df["IMF_Estimate"])

plt.show()

In [None]:
df[df["UN_Estimate"]>100000]

In [None]:
df.UN_Estimate.mean()

In [None]:
df.shape

## Create another dataframe called data excluding  5 countries with highest UN estimate

In [None]:
data = df[-(df["UN_Estimate"]>100000)]

In [None]:
data.head()

In [None]:
data.shape

In [None]:
data.UN_Estimate.mean()

In [None]:
df.UN_Estimate.mean()

![image.png](attachment:image.png)

In [None]:
sns.boxplot(x=data["UN_Estimate"])
plt.show()

## Removing outliers

![image.png](attachment:da6a7715-3b1c-4165-a2a9-4dc8ca096bd7.png)

![image.png](attachment:image.png)

In [None]:
lower_q = df["UN_Estimate"].quantile(0.25)
lower_q

In [None]:
higher_q = df["UN_Estimate"].quantile(0.75)
higher_q

In [None]:
iqr = higher_q - lower_q
iqr

In [None]:
upper_boundary = higher_q + 1.5 * iqr
upper_boundary

In [None]:
lower_boundary = lower_q - 1.5 * iqr
lower_boundary

In [None]:
df_filtered = df[(df["UN_Estimate"] < upper_boundary) & (df["UN_Estimate"] > lower_boundary)]

In [None]:
df_filtered.head()

In [None]:
df_filtered.shape
# there were 223 rows - 196 = 27 outliers dropped

In [None]:
df_filtered.UN_Estimate.mean()

In [None]:
df.UN_Estimate.mean()

In [None]:
#how can we create a table with following
df_filtered.WorldBank_Estimate.mean()

In [None]:
df.WorldBank_Estimate.mean()

In [None]:
df_filtered.IMF_Estimate.mean()

In [None]:
df.IMF_Estimate.mean()