In [4]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory recursively and print the full path of every
# file found, one path per line, in os.walk's traversal order.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

 # 1. Reading the data

In [5]:
# Load gdp_1960_2020.csv from the Kaggle input directory into a DataFrame,
# then preview its first ten rows as the cell output.
all_gdp_df = pd.read_csv("/kaggle/input/gdp-of-all-countries19602020/gdp_1960_2020.csv")
all_gdp_df.head(10)

 # 2. **Checking the consistency of countries across years.**

In [75]:
# Number of non-null 'gdp' entries per year; the result is a Series indexed by year.
# (Presumably one row per country per year, so this is the country count - TODO confirm.)
gdp_grouped_by_year = all_gdp_df.groupby('year')['gdp']
count_in_each_year = gdp_grouped_by_year.count()
print(count_in_each_year)
# Author's observation from the printed counts: the number of countries is not
# the same in every year. The smallest yearly count is printed next.
print(count_in_each_year.min())


# Years with the minimum number of countries.

In [78]:
# Show the years whose country count equals the smallest yearly count.
# Boolean indexing keeps only the matching entries directly, so where() + dropna()
# is not needed (where() fills non-matching entries with NaN, which also turns the
# integer counts into floats). The minimum is computed instead of hard-coding the
# value 103 that it had in the author's run.
minimum_count = count_in_each_year.min()
print(count_in_each_year[count_in_each_year == minimum_count])

**Checking whether these two years have the same countries.**

In [20]:
# Row masks for the two years being compared.
filter1 = all_gdp_df['year'].eq(1960)
filter2 = all_gdp_df['year'].eq(1961)
# Keep the matching rows, drop any row that has a missing value, and take the
# country names; each result keeps the row labels of all_gdp_df.
nations_1960 = all_gdp_df[filter1].dropna()['country']
nations_1961 = all_gdp_df[filter2].dropna()['country']
print(nations_1961)
# Because the original row labels are kept, the 1961 series does not start at 0
# (the author saw it start at 103), so its index has to be reset before comparing.

**Check for similarity**

In [21]:
# Give the 1961 series a fresh 0..n-1 index (the old labels are discarded).
nations_1961 = nations_1961.reset_index(drop=True)
# Element-wise comparison in original row order. Its result is not displayed,
# because a notebook cell only shows its last expression.
nations_1960.compare(nations_1961)
# The country names need to be sorted first, so sort both years and renumber
# them from 0 to make the comparison independent of row order.
nations_1960_sorted = nations_1960.sort_values().reset_index(drop=True)
nations_1961_sorted = nations_1961.sort_values().reset_index(drop=True)
# compare() lists only the positions that differ; the author observed no
# differences, i.e. 1960 and 1961 contain the same countries.
nations_1960_sorted.compare(nations_1961_sorted)

In [22]:
# Sanity check: is Nepal among the 1960 countries?
# A non-empty result means it is present.
is_nepal = nations_1960_sorted.eq("Nepal")
nations_1960_sorted.loc[is_nepal]

**For data consistency, I'll keep only these 103 countries in all the other years too.**

In [72]:

# Scratch check of the join logic on two years (1960 and 1962) before applying
# it to every year.
# Boolean indexing keeps each column's original dtype, so the earlier
# where() + astype(object) workaround for the float upcast is not needed.
# NOTE(review): this rebinds nations_1960, which an earlier cell defined as a
# Series of country names, to a DataFrame of full 1960 rows.
nations_1962 = all_gdp_df[all_gdp_df.year == 1962].dropna()
nations_1960 = all_gdp_df[all_gdp_df.year == 1960].dropna()

# Variant 1: merge on 'country'; overlapping columns get the default _x/_y suffixes.
merged = pd.merge(nations_1962, nations_1960, on=['country'], how='inner')

# Variant 2: join the 1960 rows to the 1962 rows indexed by country. The 1960
# columns receive the "_x" suffix, so the unsuffixed columns are the 1962 values.
test_joined = nations_1960.join(nations_1962.set_index('country'), how='inner', lsuffix="_x", on='country')
# Select from test_joined: the original read `joined`, which is not defined
# until a later cell and fails with a NameError on a fresh kernel.
test_filtered = test_joined[['year', 'rank', 'country', 'state', 'gdp', 'gdp_percent']].copy()
# DataFrame.append was removed in pandas 2.0; pd.concat is its replacement.
test_summed = pd.concat([nations_1960, test_filtered])
test_summed.count()


In [85]:
# Build filtered_all_gdp_df: for every year, keep only the rows whose country
# also appears in the 1960 data.
# count_in_each_year (computed in an earlier cell) is indexed by year; only its
# index is used here.
gdp_columns = ['year', 'rank', 'country', 'state', 'gdp', 'gdp_percent']

# Reference rows: the 1960 records without missing values. Boolean indexing
# keeps the original dtypes (where() would upcast the numeric columns to float).
df_for_1960 = all_gdp_df[all_gdp_df.year == 1960].dropna()
# Index the reference by country once instead of on every loop iteration.
reference_by_country = df_for_1960.set_index('country')
# Number of reference rows (103 in the author's run); used to express progress
# in units of "complete years".
reference_country_count = len(df_for_1960)

# Collect the per-year frames in a list and concatenate once at the end:
# DataFrame.append was removed in pandas 2.0, and growing a frame inside a loop
# copies all accumulated rows on every iteration.
yearly_frames = []
rows_collected = 0
for year in count_in_each_year.index:
    # Rows of this year without missing values.
    df_for_year = all_gdp_df[all_gdp_df.year == year].dropna()
    # Inner join on 'country' keeps only the countries present in 1960.
    joined = df_for_year.join(reference_by_country, rsuffix='_1960', on='country', how='inner')
    # The join returns the columns of both sides; keep only this year's (left) columns.
    filtered_df_for_year = joined[gdp_columns].copy()
    yearly_frames.append(filtered_df_for_year)
    rows_collected += len(filtered_df_for_year)
    # Cumulative rows divided by the reference count. A non-integer value means
    # some year contributed fewer rows than the reference, which is what the
    # author saw for the middle years.
    print(rows_collected / reference_country_count)

filtered_all_gdp_df = pd.concat(yearly_frames)
# Prints the frame of the last processed year only.
print(filtered_df_for_year)

In [84]:
# Minimal example of stacking two frames that share columns and index labels.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0; like
# append, it keeps the original (here duplicated) index labels.
df = pd.DataFrame([[1, 2], [3, 4]], columns=list('AB'), index=['x', 'y'])
df2 = pd.DataFrame([[1, 2], [7, 8]], columns=list('AB'), index=['x', 'y'])
new_df = pd.concat([df, df2])
new_df

****