# A story of change:

This notebook investigates whether, and how, the Kaggle community has changed with regard to geographical location, gender diversity, and type of 'employment'. 

I compared the results for the 2018 survey to the 2021 survey.

[Appendix](#Appendix)
: please see the [Appendix](#Appendix) for the code used to create the additional dataset and the list of data sources


In [None]:
# imports 
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

import warnings
warnings.filterwarnings("ignore")

Choosing among the past surveys for a suitable candidate.

In [None]:
# Load the per-country survey totals and collapse them into one grand total per survey column.
tot_all = pd.read_csv('../input/survey-totals-merged-data/totals_by_country_18_20.csv')

# Every column except 'Countries' is summed down the rows.
totals = tot_all.drop(columns=['Countries']).sum().to_dict()

# Two-column frame: the survey column label ('Year') and its grand total ('Total').
kagglers_tot = (
    pd.Series(totals)
    .rename_axis('Year')
    .reset_index(name='Total')
)
kagglers_tot

Visualize the data

In [None]:
# Bar chart of the total number of respondents per survey column.
fig, ax = plt.subplots(figsize=(4, 6))
kagglers_tot.plot.bar(ax=ax, width=.8, color='mediumseagreen')

# pandas labels the bars with the row index; show the survey labels instead.
ax.set_xticklabels(kagglers_tot['Year'], rotation=45)
ax.set_title('Year on Year Totals')
ax.set_ylabel('Count')

We can see that the number of respondents dropped in the years following the 2018 survey and then increased markedly in 2021; this suggests that the community changes over time.

# Where were and where are the Kagglers

A bird's-eye view of the geographical location of the Kagglers: 

Changes in the geographical distribution of Kagglers (absolute numbers) between the 2018 and the 2021 surveys.

In [None]:
# Load only the country name and the 2018 / 2021 respondent totals.
tots_cols = ['Countries', 'Totals_2018', 'Totals_2021']
tot = pd.read_csv(
    '../input/additional-data-wiki-22nov/additional_data22Nov_wiki.csv',
    usecols=tots_cols,
)
tot.head()

In [None]:
# Remove the 'Other' and 'I do not wish to disclose...' rows (index labels 3 and 12 of the
# freshly loaded frame). Dropping by reassignment with errors='ignore' keeps the cell
# re-runnable: the previous in-place drops raised KeyError when the cell was executed twice.
tot = tot.drop(index=[3, 12], errors='ignore')

In [None]:
#visualise the Kagglers

import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot,plot

# Choropleth recipe sourced at https://www.kdnuggets.com/2020/09/geographical-plots-python.html

# One world map per survey, coloured by the absolute number of respondents per country.
# The two maps differ only in the survey year, so they are drawn in a loop.
for survey_year in ('2018', '2021'):
    trace = dict(
        type='choropleth',
        colorscale='rainbow',
        locations=tot['Countries'],
        locationmode='country names',
        z=tot['Totals_' + survey_year],
        text=tot['Countries'],
        colorbar={'title': 'Kagglers ' + survey_year},
    )
    layout = dict(
        title=survey_year + ' Kaggle Survey',
        geo=dict(projection={'type': 'natural earth'}),
    )
    choromap = go.Figure(data=[trace], layout=layout)
    choromap.update_geos(resolution=50)
    iplot(choromap, validate=False)

In [None]:
# Five countries with the most respondents in each survey (absolute counts).
TOP_ABS = 5
most18abs = tot.sort_values(by='Totals_2018', ascending=False).iloc[:TOP_ABS]
mostabs = tot.sort_values(by='Totals_2021', ascending=False).iloc[:TOP_ABS]

print(
    'Most Kagglers in absolute numbers: \n\n 2018 \n',
    most18abs[['Countries', 'Totals_2018']],
    '\n\n 2021 \n',
    mostabs[['Countries', 'Totals_2021']],
)


In absolute numbers we have seen which countries have the highest number of Kagglers and how these numbers have changed. However, a number of Kagglers are in countries with populations that are a fraction of nations like India or the US, and in order to better appreciate where Kaggling is getting more or less popular, I will relate number of Kagglers to their country population. 

In [None]:
# Load respondent totals together with the country populations for both survey years.
pops_cols = ['Countries','Totals_2018', 'Totals_2021', 'Population_2018', 'Population_2021']
tot_by_pop = (
    pd.read_csv('../input/additional-data-wiki-22nov/additional_data22Nov_wiki.csv', usecols=pops_cols)
    # Remove 'Other' and 'I do not wish to disclose...' (index labels 3 and 12).
    .drop(index=[3, 12])
)
tot_by_pop.head()

In [None]:
# Express the number of respondents as a percentage of each country's population.
# The columns and the colour bars downstream are labelled "%", so the ratio is scaled
# by 100 (previously the raw fraction was stored under a "%" label).
tot_by_pop['kag_%_21'] = tot_by_pop['Totals_2021'] / tot_by_pop['Population_2021'] * 100
tot_by_pop['kag_%_18'] = tot_by_pop['Totals_2018'] / tot_by_pop['Population_2018'] * 100

In [None]:
#visualise the Kagglers as % of population

import plotly.graph_objs as go 
from plotly.offline import init_notebook_mode,iplot,plot

# Choropleth recipe sourced at https://www.kdnuggets.com/2020/09/geographical-plots-python.html

# One world map per survey, coloured by respondents relative to the country's population.
# Both maps share the same recipe and now use consistently worded colour-bar titles.
for survey_year, share_column in (('2018', 'kag_%_18'), ('2021', 'kag_%_21')):
    trace = dict(
        type='choropleth',
        colorscale='rainbow',
        locations=tot_by_pop['Countries'],
        locationmode='country names',
        z=tot_by_pop[share_column],
        text=tot_by_pop['Countries'],
        colorbar={'title': 'Kagglers ' + survey_year + ' % of population'},
    )
    layout = dict(
        title=survey_year + ' Kaggle Survey',
        geo=dict(projection={'type': 'natural earth'}),
    )
    choromap = go.Figure(data=[trace], layout=layout)
    choromap.update_geos(resolution=50)
    iplot(choromap, validate=False)

In [None]:
# Five countries with the highest respondents-to-population ratio in each survey.
TOP_REL = 5
most18 = tot_by_pop.sort_values(by='kag_%_18', ascending=False).iloc[:TOP_REL]
most = tot_by_pop.sort_values(by='kag_%_21', ascending=False).iloc[:TOP_REL]

print(
    'Most Kagglers as proportion of the population: \n\n 2018 \n',
    most18[['Countries', 'kag_%_18']],
    '\n\n 2021 \n',
    most[['Countries', 'kag_%_21']],
)

Visualize which countries lost or gained Kagglers

In [None]:
# Normalised change between the two surveys: (2021 - 2018) / (2021 + 2018), which lies in [-1, 1].
count_21, count_18 = tot['Totals_2021'], tot['Totals_2018']
tot['diff_18_21'] = (count_21 - count_18) / (count_21 + count_18)

# Alphabetical order so the x axis reads like a country index.
tot.sort_values(by='Countries', inplace=True)

# Blue bars for gains, red bars for losses (or no change).
tot['positive'] = tot['diff_18_21'].gt(0)
bar_colours = tot['positive'].map({True: 'b', False: 'r'})

fig, ax = plt.subplots(figsize=(24, 6))
tot['diff_18_21'].plot.bar(ax=ax, width=.6, color=bar_colours)

ax.set_xticklabels(tot['Countries'])

ax.set_title('Numbers of Kagglers: gainers and losers.')

# Gender

So far we have seen that there has been a degree of change in the number of Kagglers and their location. Relating these changes in numbers and location to gender will provide the next insight into how the community might be changing.

In [None]:
# Gender (Q1) and country (Q3) answers from the 2018 survey.
gender18 = ['Q1', 'Q3']
gen_18 = (
    pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv', usecols=gender18)
    .rename(columns={'Q1': 'Gender', 'Q3': 'Country'})
    .drop(index=0)  # row 0 carries the question text, not an answer
)

In [None]:
# Gender (Q2) and country (Q3) answers from the 2021 survey.
gender21 = ['Q2' ,'Q3']
gen_21 = (
    pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', usecols=gender21)
    .rename(columns={'Q2': 'Gender', 'Q3': 'Country'})
    .drop(index=0)  # row 0 carries the question text, not an answer
)

In [None]:
# Respondents per gender in 2018, most frequent answer first.
gender_2018 = (
    gen_18['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='2018')
)
# Align the 2018 wording with the 2021 answer options.
gender_2018['Gender'] = (
    gender_2018['Gender']
    .str.replace('Male', 'Man')
    .str.replace('Female', 'Woman')
)
print('Totals by gender for 2018 survey:')
gender_2018

In [None]:
# Respondents per gender in 2021, most frequent answer first.
gender_2021 = (
    gen_21['Gender']
    .value_counts()
    .rename_axis('Gender')
    .reset_index(name='2021')
)
print('Totals by gender for 2021 survey:')
gender_2021

In [None]:
def _row_share(counts, row, total):
    """Return the count(s) in row `row` of `counts` (all columns after the first) as a percentage of `total`."""
    return ((counts.iloc[row, 1:].to_list()) / total) * 100


# Rows are addressed by position, i.e. by the frequency rank produced by value_counts().
gender_2018_tot = gender_2018['2018'].sum()
man_18 = _row_share(gender_2018, 0, gender_2018_tot)
woman_18 = _row_share(gender_2018, 1, gender_2018_tot)
prefer_not18 = _row_share(gender_2018, 2, gender_2018_tot)
prefer_self18 = _row_share(gender_2018, 3, gender_2018_tot)

gender_2021_tot = gender_2021['2021'].sum()
man_21 = _row_share(gender_2021, 0, gender_2021_tot)
woman_21 = _row_share(gender_2021, 1, gender_2021_tot)
prefer_not = _row_share(gender_2021, 2, gender_2021_tot)
non_bi = _row_share(gender_2021, 3, gender_2021_tot)
prefer_self = _row_share(gender_2021, 4, gender_2021_tot)

print(
    'Kagglers in percentages 2018: \n man: ', man_18,
    '\n woman: ', woman_18,
    '\n prefer not to say:', prefer_not18,
    '\n prefer to self describe:', prefer_self18,
)

print(
    '\nKagglers in percentages 2021: \n man: ', man_21,
    '\n woman: ', woman_21,
    '\n prefer not to say:', prefer_not,
    '\n prefer to self describe:', prefer_self,
    '\n nonbinary', non_bi,
)

In [None]:
#Prepping data for visualization

def _as_pct(value):
    """Collapse a one-element percentage array (as computed in the previous cell) to a plain float."""
    return float(np.ravel(value)[0])


# Use the percentages computed above instead of numbers copied by hand from the printed
# output, so the pie charts cannot drift out of sync with the data.
pic18d = {
    'man': _as_pct(man_18),
    'woman': _as_pct(woman_18),
    'prefer not to say': _as_pct(prefer_not18),
    'prefer to self describe': _as_pct(prefer_self18),
}

pic18 = pd.DataFrame.from_dict(pic18d, orient='index').reset_index()
pic18.columns = ['Gender', 'Percent']

pic21d = {
    'man': _as_pct(man_21),
    'woman': _as_pct(woman_21),
    'prefer not to say': _as_pct(prefer_not),
    # clubbing nonbinary with prefer to self describe for viz. comparison ONLY
    'prefer to self describe': _as_pct(prefer_self) + _as_pct(non_bi),
}
pic21 = pd.DataFrame.from_dict(pic21d, orient='index').reset_index()
pic21.columns = ['Gender', 'Percent']

Overall changes in gender distribution <a id='18.8%'></a>

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

labels = pic18['Gender']
labels1 = pic21['Gender']

# Two pies side by side; a shared scalegroup keeps their sizes comparable.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'} for _ in range(2)]],
    subplot_titles=['2018', '2021'],
)
pie_inputs = [
    (labels, pic18['Percent'], "Gender % in  2018"),
    (labels1, pic21['Percent'], "Gender % in  2021"),
]
for column, (pie_labels, pie_values, trace_name) in enumerate(pie_inputs, start=1):
    fig.add_trace(
        go.Pie(labels=pie_labels, values=pie_values, scalegroup='one', name=trace_name),
        1,
        column,
    )

fig.update_layout(title_text='Kagglers %')
fig.show()

In [None]:
# Respondents per country and gender: one row per country, one column per gender answer.
gen_country18 = gen_18.groupby(['Country', 'Gender'])['Country'].count().unstack('Gender').fillna(0).reset_index()
gen_country18 = gen_country18.sort_values(by='Female', ascending=False)
# numeric_only=True keeps the text 'Country' column out of the row sum; without it recent
# pandas versions raise instead of silently skipping the non-numeric column.
gen_country18['Total'] = gen_country18.sum(axis=1, numeric_only=True)

gen_country = gen_21.groupby(['Country', 'Gender'])['Country'].count().unstack('Gender').fillna(0).reset_index()
gen_country = gen_country.sort_values(by='Woman', ascending=False)
gen_country['Total'] = gen_country.sum(axis=1, numeric_only=True)

print( 'Top 5 countries with highest numbers of female Kagglers in 2018: \n ')
print(gen_country18[['Country', 'Female']].head(5))

print( '\nTop 5 countries with highest numbers of female Kagglers in 2021: \n ')
print(gen_country[['Country', 'Woman']].head(5))

In line with the overall numbers, the data shows that the number of female respondents has nearly halved in the US, while it has increased in India, in line with the overall increase of Kagglers in that country. 

However, to better understand female participation,  I will look at the number of women as a percentage of the total number of Kagglers in each country. Although this strategy is more vulnerable to distortion especially in countries with very few Kagglers,  it offers a more informative view of gender distributions.


In [None]:
# 2018: gender shares as a percentage of each country's respondents.
total_18 = gen_country18['Total']
gen_country18['%Women'] = gen_country18['Female'].div(total_18).mul(100)

# 'Prefer not to say' and 'Prefer to self-describe' are small and would get lost in the viz,
# so for viz purposes ONLY they are clubbed under 'NonB_PrNtS_PrSlDs' (from the headers' initials).
small_groups_18 = gen_country18[['Prefer not to say', 'Prefer to self-describe']].sum(axis=1)
gen_country18['%NonB_PrNtS_PrSlDs'] = small_groups_18.div(total_18).mul(100)

gen_country18['%Men'] = gen_country18['Male'].div(total_18).mul(100)

# Ascending: lowest female share first.
sort18_w = gen_country18.sort_values(by='%Women')
sort18_w.head()

In [None]:
# 2021: gender shares as a percentage of each country's respondents.
total_21 = gen_country['Total']
gen_country['%Women'] = gen_country['Woman'].div(total_21).mul(100)

# 'Nonbinary', 'Prefer not to say' and 'Prefer to self-describe' are small and would get lost in the viz,
# so for viz purposes ONLY they are clubbed under 'NonB_PrNtS_PrSlDs' (from the headers' initials).
small_groups_21 = gen_country[['Nonbinary', 'Prefer not to say', 'Prefer to self-describe']].sum(axis=1)
gen_country['%NonB_PrNtS_PrSlDs'] = small_groups_21.div(total_21).mul(100)

gen_country['%Men'] = gen_country['Man'].div(total_21).mul(100)

# Ascending: lowest female share first.
sort_w = gen_country.sort_values(by='%Women')
sort_w.head()

In [None]:
# sort18_w / sort_w are sorted by ascending '%Women': head() holds the lowest shares,
# tail() the highest (with the very highest listed last).
tail_5_18 = sort18_w.head(5)
head_5_18 = sort18_w.tail(5)
tail_5 = sort_w.head(5)
head_5 = sort_w.tail(5)


def _country_list(frame):
    """Return the 'Country' column of `frame` as a one-column frame renumbered from 0, for printing."""
    return frame['Country'].reset_index().drop(['index'], axis=1)


# The share is relative to each country's respondents (the 'Total' column), not to the
# country's population, so the header says so.
print("                         Female Kagglers, as a percentage of each country's Kagglers\n")
print(' Most Female participation in 2018:\n', _country_list(head_5_18), '\n', '\n Lowest Female participation in 2018:\n', _country_list(tail_5_18))
print('\n Most Female participation in 2021:\n', _country_list(head_5), '\n', '\n Lowest Female participation in 2021:\n', _country_list(tail_5))

In [None]:
# Combine the per-country female shares of the 2021 and 2018 surveys.
# Countries present in only one survey get a share of 0 for the other.
women_share_21 = gen_country[['Country', '%Women']]
women_share_18 = gen_country18[['Country', '%Women']]
part_w1 = women_share_21.merge(
    women_share_18,
    how='outer',
    left_on=['Country'],
    right_on=['Country'],
    suffixes=('_2021', '_2018'),
).fillna(0)
part_w = part_w1.copy()

In [None]:
# Normalised change in the female share between the surveys:
# (2021 - 2018) / (2021 + 2018), which lies in [-1, 1] (NaN when both shares are 0).
part_w['%diff'] = (part_w['%Women_2021']-part_w['%Women_2018'])/(part_w['%Women_2021']+part_w['%Women_2018'])

# sort_values returns a new frame, so the result is independent of part_w; the previous
# in-place sort acted on a column slice of part_w and triggered SettingWithCopyWarning.
part_w_diff = part_w[['Country','%diff']].sort_values(by='Country')
part_w_diff.head()

In [None]:
# Flag increases so they can be coloured blue (decreases / no change: red).
# assign() builds a new frame rather than writing into what may be a slice of part_w.
part_w_diff = part_w_diff.assign(posi=part_w_diff['%diff'] > 0.0)

fig, ax = plt.subplots(figsize=(24,6))  
part_w_diff['%diff'].plot.bar(width=.6, ax=ax, color=part_w_diff['posi'].map({True: 'b', False: 'r'}))

# pandas labels the bars with the row index; show the country names instead.
ax.set_xticklabels(part_w_diff['Country'])

ax.set_title('Female participation changes between 2018 and 2021 by country')

**Focus** on : how much of the increase in Kagglers in each country is female driven

In [None]:
# Keep only the country and the men / women counts of each survey.
viz_inc18 = gen_country18.loc[:, ['Country', 'Female', 'Male']]
viz_inc21 = gen_country.loc[:, ['Country', 'Man', 'Woman']]

In [None]:
# Men / women counts of both surveys side by side, one row per country.
# The only shared column is the join key, so no suffixes are needed (the previous call
# passed them in swapped order, which was harmless but misleading).
viz_merge = viz_inc21.merge(viz_inc18, how='outer', left_on=['Country'], right_on=['Country'])

# 2018 columns are 'Female' / 'Male'; 2021 columns are 'Woman' / 'Man'.
viz_merge['tot_m_f_2018'] = viz_merge['Female']+viz_merge['Male']
viz_merge['tot_m_f_2021'] = viz_merge['Woman']+viz_merge['Man']

# Normalised changes, (2021 - 2018) / (2021 + 2018): overall, women only, men only.
viz_merge['tot_incr/dcr'] = (viz_merge['tot_m_f_2021']-viz_merge['tot_m_f_2018'])/(viz_merge['tot_m_f_2021']+viz_merge['tot_m_f_2018'])
viz_merge['tot_incr/dcr_w'] =(viz_merge['Woman']-viz_merge['Female'])/(viz_merge['Woman']+viz_merge['Female'])
viz_merge['tot_incr/dcr_m'] =(viz_merge['Man']-viz_merge['Male'])/(viz_merge['Man']+viz_merge['Male'])

# Keep countries whose men+women total did not shrink; rows with a missing survey have
# NaN here and are therefore excluded as well.
filt_part= viz_merge['tot_incr/dcr']>=0
posi_only = viz_merge[filt_part].copy()
posi_only.head()

In [None]:
from matplotlib import rc

# Grouped bars: normalised change in the number of men vs women, for countries whose
# overall number of Kagglers did not decrease.
fig, ax = plt.subplots(1,1,figsize=(24,8))

posi_only[['tot_incr/dcr_m','tot_incr/dcr_w']].plot(kind='bar', ax=ax, color=['purple','cyan'])

ax.set_xticklabels(posi_only['Country'], rotation=90)

# Legend order follows the column order above (men first, then women).
ax.legend(['Male increase', 'Female increase'])
ax.set_title('Visualising how much of the Kagglers increase is female driven. \n (Only countries with increased number of Kagglers).')
plt.show()

We have seen that overall female participation is around [18.8%](#18.8%) in 2021.

**Focus** on : which countries have higher than 18.8% female participation

In [None]:
# Countries whose 2021 female share exceeds the overall 2021 share of 18.8%, highest first.
filt_ab = part_w['%Women_2021'].gt(18.8)
above_avg_fem = (
    part_w.loc[filt_ab]
    .sort_values(by='%Women_2021', ascending=False)
    .loc[:, ['Country', '%Women_2021']]
)
print('Above 18.8% female partecipation:\n')
above_avg_fem

Stellar female participation in Tunisia. 😀

Interestingly, some of the countries that rank quite low in the Gender Equality Index (see: https://en.wikipedia.org/wiki/Global_Gender_Gap_Report) have higher proportion of female participation than 18.8% (overall survey total % 2021). 

A number of factors may be at play here, such as improved prospects for women in these countries,  AI/ML being perceived as a socially acceptable work/study-subject for women, the 'need' for the skill-set taking priority over custom and tradition. Or simply, the Equality Index does not work well for all countries and all cultures.


# Current Roles

This section dives into Kagglers' roles; it aims to highlight increases or decreases in diversity in the workplace and in education.

In [None]:
#get data 2018
work =['Q6']
work_2018=pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv', usecols=work)
# Row 0 of the raw file holds the question text rather than an answer. Exclude it before
# counting, instead of dropping whatever row happens to end up at rank 21 of the counts.
emp_2018 = (
    work_2018.iloc[1:]
    .value_counts()
    .rename_axis('Employment')
    .reset_index(name='Totals')
)


In [None]:
#get data 2021
work_cols = ['Q5']
work_2021 = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', usecols=work_cols)
# Row 0 of the raw file holds the question text rather than an answer. Exclude it before
# counting, instead of dropping whatever row happens to end up at rank 15 of the counts.
emp_2021 = (
    work_2021.iloc[1:]
    .value_counts()
    .rename_axis('Employment')
    .reset_index(name='Totals')
)


In [None]:
# Report the most frequent 'current role' answer of each survey (row 0 of the frequency-sorted counts).
for survey_year, role_counts in (('2018', emp_2018), ('2021', emp_2021)):
    top_role = role_counts.iloc[0, 0]
    top_total = role_counts.iloc[0, 1]
    print(
        'The top answer to the "current role" question in ' + survey_year + ' was : ',
        '"', top_role, '", with a total of ', top_total, '.',
    )

Although the top answer to the 'Role' question is student, it does not follow that the majority of Kagglers are students. In order to address this, I will re-group the answers to the 'Role' questions in: 'Full Time Education', 'Employed' and 'Un-employed'


In [None]:
#prep data 2018
# emp_2018 is sorted by answer frequency (value_counts), so rows are addressed by rank here.
# NOTE(review): row 0 is presumably 'Student' and row 6 'Not employed' — confirm against the printed table.
employed18= emp_2018.drop(6) # take out the row with label 6 (the same row read as "not employed" below), not the students
employed18sum = employed18.iloc[1:].sum().rename_axis('Emp18').reset_index(name='Tots').drop(0)# skip row 0, sum each column, then drop the summed 'Employment' text row so only the 'Totals' sum remains
unemployed18 = emp_2018.iloc[6].reset_index() #get not-employed: row 6 as a two-row frame (Employment, Totals)

In [None]:
#prep data 2021 : 
# emp_2021 is sorted by answer frequency (value_counts), so rows are addressed by rank here.
# NOTE(review): row 0 is presumably 'Student' and row 5 'Currently not employed' — confirm against the printed table.
employed21= emp_2021.drop(5) # take out the row with label 5 (the same row read as "not employed" below), not the students
employed21sum = employed21.iloc[1:].sum().rename_axis('Emp21').reset_index(name='Tots').drop(0) # skip row 0, sum each column, then drop the summed 'Employment' text row so only the 'Totals' sum remains
unemployed21 = emp_2021.iloc[5].reset_index() #get not-employed: row 5 as a two-row frame (Employment, Totals)

In [None]:
# Per survey: [top-answer total, summed 'employed' total, 'not employed' total].
l_2018 = [
    emp_2018.iat[0, 1],
    employed18sum.iat[0, 1],
    unemployed18.iat[1, 1],
]
l_2021 = [
    emp_2021.iat[0, 1],
    employed21sum.iat[0, 1],
    unemployed21.iat[1, 1],
]
col_name = ['F/Teducation', 'Empl', 'Not_emp']

In [None]:
# One row per survey year, one column per role group ...
d = {'2018': l_2018, '2021': l_2021}
role_tot = pd.DataFrame(list(d.values()), index=list(d.keys()), columns=col_name)

# ... then transposed so each role group is a row and each survey a column (handy for the pies).
ts = role_tot.transpose().reset_index()
ts

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

labels = ts['index']

# One pie per survey; a shared scalegroup keeps their sizes comparable.
fig = make_subplots(
    rows=1,
    cols=2,
    specs=[[{'type': 'domain'} for _ in range(2)]],
    subplot_titles=['2018', '2021'],
)
for column, (survey_year, trace_name) in enumerate(
    (('2018', "Roles in  2018"), ('2021', "Roles in 2021")), start=1
):
    fig.add_trace(
        go.Pie(labels=labels, values=ts[survey_year], scalegroup='one', name=trace_name),
        1,
        column,
    )

fig.update_layout(title_text='Kagglers Roles')
fig.show()



The above graphs suggest that consistently across the 2 surveys, the majority of Kagglers are in Employment rather than full time education. 
Let's look a little deeper at the role distribution among different genders.

In [None]:
#get data 2018
emp_gen18 = pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv', usecols=['Q1','Q6'])
emp_gen18.drop(index=0, inplace=True) 
emp_gen18.columns=['Gender', 'Role']

#group data and get gender totals: one row per role, one column per gender answer
role_gen_18 = emp_gen18.groupby(['Gender', 'Role'])['Role'].count().unstack('Gender').fillna(0).reset_index()

#prep data for viz
# Rows are picked by position in the role-sorted table.
# NOTE(review): positions 20 and 11 are presumably 'Student' and 'Not employed' — confirm against role_gen_18.
role_gen_18_stu =role_gen_18.iloc[20]
role_gen_18_unemp = role_gen_18.iloc[11]
# Everything else is summed into a single "employed" row; the sum also covers the 'Role'
# text column, whose summed value is replaced by 'Employed' below.
role_gen_18_emp =role_gen_18.drop(20).drop(11).sum()


# Three rows: the student row, the not-employed row, and the summed remainder.
role_gen_18_role = pd.concat([role_gen_18_stu, role_gen_18_unemp, role_gen_18_emp], axis=1).T
# Swap the summed role text in row 2 for the label 'Employed'.
role_gen_18_role.replace(role_gen_18_role.iloc[2,0], 'Employed', inplace=True)

role_gen_18_role


In [None]:
# Total of row 2 (the 'Employed' row) across every column after the first ('Role').
employed_all_2018 = role_gen_18_role.iloc[2, 1:].sum()

In [None]:
#get data 2021
emp_gen21 = pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', usecols=['Q2','Q5'])
emp_gen21.drop(index=0, inplace=True) 
emp_gen21.columns=['Gender', 'Role']

# One row per role, one column per gender answer (this statement used to be executed twice).
role_gen_21 = emp_gen21.groupby(['Gender', 'Role'])['Role'].count().unstack('Gender').fillna(0).reset_index()

# Rows are picked by position in the role-sorted table.
# NOTE(review): positions 14 and 1 are presumably 'Student' and 'Currently not employed' — confirm against role_gen_21.
emp_gen21_stu =role_gen_21.iloc[14]
emp_gen21_unemp = role_gen_21.iloc[1]
# Everything else is summed into a single "employed" row; the sum also covers the 'Role'
# text column, whose summed value is replaced by 'Employed' below.
emp_gen21_emp =role_gen_21.drop(14).drop(1).sum()

# Three rows: the student row, the not-employed row, and the summed remainder.
emp_gen21_roles = pd.concat([emp_gen21_stu, emp_gen21_unemp, emp_gen21_emp], axis=1).T
emp_gen21_roles.replace(emp_gen21_roles.iloc[2,0], 'Employed', inplace=True)

# The group 'Nonbinary' was not present in the 2018 survey; to keep the comparison between the two surveys balanced, 'Nonbinary' is grouped with 'Prefer to self-describe'
emp_gen21_roles['Bnry_PrSf']= emp_gen21_roles['Nonbinary']+ emp_gen21_roles['Prefer to self-describe']
emp_gen21_roles = emp_gen21_roles.drop(['Nonbinary', 'Prefer to self-describe'], axis=1)
emp_gen21_roles

In [None]:
# Total of row 2 (the 'Employed' row) across every column after the first ('Role').
employed_all_2021 = emp_gen21_roles.iloc[2, 1:].sum()

In [None]:
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# The 2018 role labels are used for every pie, including the 2021 ones.
labels = role_gen_18_role['Role']

# (panel title, values) in reading order: top row 2018, bottom row 2021.
role_panels = [
    ('Man 2018', role_gen_18_role['Male']),
    ('Woman 2018', role_gen_18_role['Female']),
    ('Not_say 2018', role_gen_18_role['Prefer not to say']),
    ('PrSf 2018', role_gen_18_role['Prefer to self-describe']),
    ('Man 2021', emp_gen21_roles['Man']),
    ('Woman 2021', emp_gen21_roles['Woman']),
    ('Not_say 2021', emp_gen21_roles['Prefer not to say']),
    ('NoNB+PrSf 2021', emp_gen21_roles['Bnry_PrSf']),
]

fig = make_subplots(
    2,
    4,
    specs=[[{'type': 'domain'} for _ in range(4)] for _ in range(2)],
    subplot_titles=[panel_title for panel_title, _ in role_panels],
)
for position, (_, panel_values) in enumerate(role_panels):
    grid_row, grid_col = divmod(position, 4)
    fig.add_trace(go.Pie(labels=labels, values=panel_values, name="Roles"), grid_row + 1, grid_col + 1)

fig.update_layout(title_text='Kagglers Roles ')
fig.show()


Before moving on: 

**Reality check**: re-introducing how the different group sizes relate to each other.


In [None]:
# Same grid as above, but with every pie in one scalegroup so the pie sizes reflect the
# relative sizes of the groups.
labels = role_gen_18_role['Role']

scaled_panels = [
    ('Man 2018', role_gen_18_role['Male']),
    ('Woman 2018', role_gen_18_role['Female']),
    ('Not_say 2018', role_gen_18_role['Prefer not to say']),
    ('PrSf 2018', role_gen_18_role['Prefer to self-describe']),
    ('Man 2021', emp_gen21_roles['Man']),
    ('Woman 2021', emp_gen21_roles['Woman']),
    ('Not_say 2021', emp_gen21_roles['Prefer not to say']),
    ('NoNB+PrSf 2021', emp_gen21_roles['Bnry_PrSf']),
]

fig = make_subplots(
    2,
    4,
    specs=[[{'type': 'domain'} for _ in range(4)] for _ in range(2)],
    subplot_titles=[panel_title for panel_title, _ in scaled_panels],
)
for position, (_, panel_values) in enumerate(scaled_panels):
    grid_row, grid_col = divmod(position, 4)
    fig.add_trace(
        go.Pie(labels=labels, values=panel_values, scalegroup='one', name="Roles"),
        grid_row + 1,
        grid_col + 1,
    )

fig.update_layout(title_text='Kagglers Roles ')
fig.show()

Across the surveys, unemployment is higher for women. Unemployment is high among 'Prefer not to say', 'Self Describe' and 'Non Binary' genders, but the small number of respondents in these groups may adversely affect the quality of the insights that might be gained.

**Age and Unemployment:**
Looking at age distributions among the genders may offer an insight into why certain groups may be more at risk of unemployment.

In [None]:
# 2018: gender (Q1), age (Q2) and country (Q3), reordered to Age / Gender / Country.
age18 = ['Q1','Q2', 'Q3']
age_18 = pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv', usecols=age18)
age_18 = age_18[['Q2', 'Q1', 'Q3']]

# Use the 2021 wording for the gender answers.
age_18['Q1'] = (
    age_18['Q1']
    .str.replace('Male', 'Man')
    .str.replace('Female', 'Woman')
)

# Row 0 carries the question text, not an answer.
age_18 = age_18.drop(index=0)
age_18.columns = ['Age', 'Gender', 'Country']

In [None]:
# 2021: age (Q1), gender (Q2) and country (Q3).
age21 =['Q1','Q2', 'Q3']
age_21 = (
    pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', usecols=age21)
    .drop(index=0)  # row 0 carries the question text, not an answer
    .rename(columns={'Q1': 'Age', 'Q2': 'Gender', 'Q3': 'Country'})
)

In [None]:
# Respondents per age group in each survey.
age21_tot = age_21['Age'].value_counts().rename_axis('Age').reset_index(name='Totals21')
age18_tot = age_18['Age'].value_counts().rename_axis('Age').reset_index(name='Totals18')

# The surveys differ in size, so express each age group as a percentage of its survey's total.
temp18 = age18_tot['Totals18'].sum()
temp21 = age21_tot['Totals21'].sum()
age21_tot['%age_2021'] = age21_tot['Totals21'].div(temp21).mul(100)
age18_tot['%age_2018'] = age18_tot['Totals18'].div(temp18).mul(100)

# Side-by-side view; age groups missing from one survey are filled with 0.
age_total_per = (
    age18_tot
    .merge(age21_tot, how='outer', left_on=['Age'], right_on=['Age'])
    .fillna(0)
    .sort_values(by='Age')
)
age_total_per

In [None]:
def _count_by_age_and_gender(answers):
    """Return one row per age group and one column per gender answer, holding respondent counts."""
    return answers.groupby(['Age', 'Gender'])['Age'].count().unstack('Gender').fillna(0).reset_index()


age_g21 = _count_by_age_and_gender(age_21)
age_g18 = _count_by_age_and_gender(age_18)

# For ease of viz, the small groups are clubbed together into 'PnS_Ps_s_Nb'
# ('Nonbinary' only exists in the 2021 data).
small_groups_age_21 = ['Nonbinary', 'Prefer not to say', 'Prefer to self-describe']
small_groups_age_18 = ['Prefer not to say', 'Prefer to self-describe']
age_g21['PnS_Ps_s_Nb'] = age_g21[small_groups_age_21].sum(axis=1)
age_g18['PnS_Ps_s_Nb'] = age_g18[small_groups_age_18].sum(axis=1)

age_g21 = age_g21.drop(columns=small_groups_age_21)
age_g18 = age_g18.drop(columns=small_groups_age_18)

# Both frames share the remaining column names, hence the year suffixes.
age_gender = age_g18.merge(
    age_g21,
    how='outer',
    left_on=['Age'],
    right_on=['Age'],
    suffixes=('_2018', '_2021'),
).fillna(0)
age_gender

In [None]:
# Three panels (men, women, remaining answers), each comparing 2018 with 2021 per age group.
# age_gender holds respondent counts, so the figure title says counts rather than percentages.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6))
fig.suptitle('Age distributions by gender (number of respondents)', fontsize=16)

age_panels = [
    (ax1, ['Man_2018', 'Man_2021']),
    (ax2, ['Woman_2018', 'Woman_2021']),
    (ax3, ['PnS_Ps_s_Nb_2018', 'PnS_Ps_s_Nb_2021']),
]
for panel_ax, year_columns in age_panels:
    age_gender[year_columns].plot.bar(width=.8, ax=panel_ax, color=['mediumseagreen', 'cyan'])
    # pandas labels the bars with the row index; show the age groups instead.
    panel_ax.set_xticklabels(age_gender['Age'])

ax1.set_title('Men')
ax2.set_title('Women')
# Set on ax3 explicitly instead of relying on pyplot's "current axes".
ax3.set_title('Prefer not to say, Prefer to self-describe, Nonbinary', loc='left')



In [None]:
# Column totals over all age groups; after reset_index each former column of age_gender is a row.
test1= age_gender.sum().reset_index(name= 'abs_tot')
# Row 0 comes from the first column of age_gender (presumably the 'Age' text column) and is not a count.
test1.drop(0, inplace=True)

# Same totals restricted to the first three rows of age_gender
# (the printout below describes these as the age groups "up to 29 years old").
test2 = age_gender.head(3).sum().reset_index(name= 'sub_tot')
test2.drop(0, inplace=True)



In [None]:
# Pair each group's sub-total with its overall total.
# NOTE(review): the join key 'Gender' is presumably the column-axis name left by unstack('Gender') — confirm.
merged = test2.merge(test1, how='left', left_on=['Gender'], right_on=['Gender']).fillna(0)
# Share of each group that falls in the selected (first three) age groups, in percent.
merged['per_cent']= (merged['sub_tot']/merged['abs_tot'])*100

In [None]:
# Rows of `merged` are read by position and column 3 is 'per_cent'.
# NOTE(review): assumes row order Man_2018, Woman_2018, PnS_Ps_s_Nb_2018, Man_2021, Woman_2021, PnS_Ps_s_Nb_2021 — confirm against `merged`.
print(' % of the total male Kagglers up to 29 years old in 2018:',round((merged.iloc[0, 3]),2),'%', '\n',
'% of the total male Kagglers up to 29 years old in 2021:',round((merged.iloc[3, 3]),2),'%', '\n',
'\n % of the total female Kagglers up to 29 years old in 2018:',round((merged.iloc[1, 3]),2),'%', '\n',
'% of the total female Kagglers up to 29 years old in 2021:',round((merged.iloc[4, 3]),2),'%', '\n',
'\n % of the total Kagglers that Prefer not to say, selfdescribe, non_binary, up to 29 years old in 2018:',round((merged.iloc[2, 3]),2),'%', '\n',
'% of the total Kagglers that Prefer not to say, selfdescribe, non_binary, up to 29 years old in 2021:',round((merged.iloc[5, 3]),2),'%', '\n',)

The number of women in the age-group up to 29 is proportionally higher than men. This might partially explain the unemployment rate, perhaps the female Kagglers in this group have recently entered the labour market and are looking for work or, as more junior members of the team, they might find themselves more at risk during redundancy rounds. 

**Focus**: Unemployment by Age



In [None]:
def _role_by_gender(answers):
    """Return one row per role and one column per gender answer, holding respondent counts."""
    return answers.groupby(['Role', 'Gender'])['Gender'].count().unstack('Gender').reset_index().fillna(0)


# Kagglers' roles 2018: gender (Q1), age (Q2), country (Q3) and role (Q6).
work_all18 = ['Q1','Q2','Q3','Q6']
work_2018_all = (
    pd.read_csv('../input/kaggle-survey-2018/multipleChoiceResponses.csv', usecols=work_all18)
    .drop(index=0)  # row 0 carries the question text
    .rename(columns={'Q1': 'Gender', 'Q2': 'Age', 'Q3': 'Countries', 'Q6': 'Role'})
)
# Use the 2021 wording for the unemployed answer.
work_2018_all['Role'] = work_2018_all['Role'].replace('Not employed', 'Currently not employed')
work_2018_all_role = _role_by_gender(work_2018_all)
work_2018_all_role = work_2018_all_role[['Role','Male', 'Female','Prefer not to say','Prefer to self-describe']]

# Kagglers' roles 2021: age (Q1), gender (Q2), country (Q3) and role (Q5).
work_all = ['Q1','Q2','Q3','Q5']
work_2021_all = (
    pd.read_csv('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv', usecols=work_all)
    .drop(index=0)  # row 0 carries the question text
    .rename(columns={'Q1': 'Age', 'Q2': 'Gender', 'Q3': 'Countries', 'Q5': 'Role'})
)
work_2021_all_role = _role_by_gender(work_2021_all)
work_2021_all_role = work_2021_all_role[['Role','Man', 'Woman','Nonbinary','Prefer not to say','Prefer to self-describe']]

In [None]:
# Unemployed respondents of the 2018 survey, counted per age group and gender.
unemp_18 = work_2018_all['Role'].eq('Currently not employed')
unemp_18_age = (
    work_2018_all[unemp_18]
    .groupby(['Age', 'Gender'])['Role']
    .count()
    .unstack('Gender')
    .reset_index()
    .fillna(0)
)
# Club the two small groups into one column.
small_groups_unemp_18 = ['Prefer not to say', 'Prefer to self-describe']
unemp_18_age['Nbi_Pnt_Psd_2018'] = unemp_18_age['Prefer not to say'].add(unemp_18_age['Prefer to self-describe'])
unemp_18_age = unemp_18_age.drop(columns=small_groups_unemp_18)
unemp_18_age.head()

In [None]:
# df zooming in on unemployment by Age 2021
unemp_21 =work_2021_all['Role']=='Currently not employed'
unemp_21_age = work_2021_all[unemp_21]
# One row per age group, one column per gender answer, holding counts of unemployed respondents.
unemp_21_age= unemp_21_age.groupby(['Age', 'Gender'])['Role'].count().unstack('Gender').reset_index().fillna(0)

# Club the three small groups into one column. This is 2021 data, so the column carries the
# 2021 suffix (it was previously mislabelled '..._2018'). The cell that combines both years
# renames its columns by position, so it is unaffected by the corrected name.
unemp_21_age['Nbi_Pnt_Psd_2021'] = unemp_21_age['Nonbinary']+unemp_21_age['Prefer not to say']+unemp_21_age['Prefer to self-describe']
unemp_21_age.drop(['Nonbinary','Prefer not to say','Prefer to self-describe'], axis=1, inplace=True)
unemp_21_age.head()

In [None]:
# Unemployed counts of both surveys side by side; age groups missing from one survey are filled with 0.
combo_unemp_age = unemp_21_age.merge(unemp_18_age, how='outer', left_on=['Age'], right_on=['Age']).fillna(0)
# Columns are renamed purely by position: 'Age', the three columns of the 2021 (left) frame, then the three of the 2018 (right) frame.
# NOTE(review): the 2018 frame's gender columns are presumably ordered Female, Male (hence 'Woman_2018' before 'Man_2018') — confirm if the upstream columns change.
combo_unemp_age.columns= ['Age', 'Man_2021', 'Woman_2021', 'Nbi_Pnt_Psd_2021', 'Woman_2018', 'Man_2018', 'Nbi_Pnt_Psd_2018']

In [None]:
import matplotlib.pyplot as plt
import numpy as np


# Three side-by-side bar charts of unemployed respondents per age group, one per
# gender grouping, each comparing the 2018 survey with the 2021 survey.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6))  # ,sharey=True
fig.suptitle('Unemployment by Age group and Gender.', fontsize=14)

# (axes, panel title, [2018 column, 2021 column]); 2018 is always drawn first so
# the same colour means the same year in every panel.
unemp_panels = [
    (ax1, 'Woman 2018 and 2021', ['Woman_2018', 'Woman_2021']),
    (ax2, 'Man 2018 and 2021', ['Man_2018', 'Man_2021']),
    # This panel was previously titled 'Woman 2018 and 2021' by mistake.
    (ax3, 'Nonbinary / Prefer not to say / Self-describe\n2018 and 2021',
     ['Nbi_Pnt_Psd_2018', 'Nbi_Pnt_Psd_2021']),
]

for panel_ax, panel_title, panel_cols in unemp_panels:
    combo_unemp_age[panel_cols].plot.bar(width=.8, ax=panel_ax, color=['mediumseagreen', 'cyan'])
    panel_ax.set_xticklabels(combo_unemp_age['Age'])
    panel_ax.set_title(panel_title)


The charts seem to confirm the hypothesis that unemployment numbers are concentrated in the  younger cohorts.
However, in the 2021 survey, although the unemployment numbers have risen across age and gender, women aged 35+ appear to have been more impacted.

As changes have occurred in the unemployment landscape, changes might have occurred in the 'employment' numbers also.

The roles available for selection have changed between the 2018 and 2021 surveys; therefore I will select only roles that appear in both surveys. This should be sufficient to give an intuition of possible changes in the employment landscape.

In [None]:
# Roles offered in the 2018 survey.
# 'Prefer not to say' and 'Prefer to self-describe' are summed into a single
# 'nbi_pnt_psd' column and the two source columns are then removed.
# Running this cell a second time fails, because those columns are gone by then.
work_2018_all_role = (
    work_2018_all_role
    .assign(
        nbi_pnt_psd=lambda roles: roles['Prefer not to say'] + roles['Prefer to self-describe']
    )
    .drop(columns=['Prefer not to say', 'Prefer to self-describe'])
)

print('Employment options in 2018:\n')
work_2018_all_role

In [None]:
# Roles offered in the 2021 survey.
# 'Prefer not to say', 'Prefer to self-describe' and 'Nonbinary' are summed into
# a single 'nbi_pnt_psd' column and the three source columns are then removed.
# Running this cell a second time fails, because those columns are gone by then.
work_2021_all_role = (
    work_2021_all_role
    .assign(
        nbi_pnt_psd=lambda roles: (
            roles['Prefer not to say']
            + roles['Prefer to self-describe']
            + roles['Nonbinary']
        )
    )
    .drop(columns=['Prefer not to say', 'Prefer to self-describe', 'Nonbinary'])
)

print('Employment options in 2021:\n')
work_2021_all_role

In [None]:
# Keep only the roles that exist in both surveys (inner join on the role label).
selected_roles = pd.merge(
    work_2021_all_role,
    work_2018_all_role,
    how='inner',
    on='Role',
    suffixes=('_2021', '_2018'),
)
# Names are assigned by position: the 2021 columns first, then the 2018 ones.
selected_roles.columns = [
    'Role',
    'Man_2021', 'Woman_2021', 'nbi_pnt_psd_2021',
    'Man_2018', 'Woman_2018', 'nbi_pnt_psd_2018',
]
print(' Roles present in both 2018 and 2021 surveys:')
selected_roles

In [None]:
import matplotlib.pyplot as plt

# Three side-by-side bar charts of respondents per role (roles present in both
# surveys only), one per gender grouping, comparing 2018 with 2021.
fig, (ax1, ax2, ax3) = plt.subplots(1, 3, figsize=(16, 6))
fig.suptitle('Employment by roles', fontsize=14)

# (axes, panel title, [2018 column, 2021 column]). Every panel lists 2018 first
# so that green is always 2018 and cyan is always 2021; the third panel used to
# list 2021 first, which swapped the colours relative to the other two.
role_panels = [
    (ax1, 'Roles Man', ['Man_2018', 'Man_2021']),
    (ax2, 'Roles Woman', ['Woman_2018', 'Woman_2021']),
    (ax3, 'Roles N_binary, Prefer_self, Pre_not', ['nbi_pnt_psd_2018', 'nbi_pnt_psd_2021']),
]

for panel_ax, panel_title, panel_cols in role_panels:
    selected_roles[panel_cols].plot.bar(width=.8, ax=panel_ax, color=['mediumseagreen', 'cyan'])
    panel_ax.set_xticklabels(selected_roles['Role'])
    panel_ax.set_title(panel_title)


From the above it appears that the more significant changes occurred in the Student and Not_Employed numbers, with a slight dip in 'Data Scientist' numbers.

**Covid19 possible impact on Kaggle numbers?**

In [None]:
# Per-country 2018 totals together with the 2018 -> 2021 difference.
tot_by_pop_c = tot.loc[:, ['Totals_2018', 'Countries', 'diff_18_21']]

# Mask of the countries that had at least one respondent in 2018.
filt_no18 = tot_by_pop_c['Totals_2018'].gt(0)
diff_18_21 = tot_by_pop_c.loc[filt_no18]
diff_18_21.head()

In [None]:
# The complement of the 2018 filter: countries without any 2018 respondents.
removed = tot_by_pop_c.loc[~filt_no18]
print(
    'For balance comparison, the coountires with no entries in the 2018 have been removed:\n',
    removed,
)

In [None]:
# Attach the 2021 population and the recorded covid case count to every country
# kept above; countries missing from the additional dataset get NaN (left join).
cov_cols = ['Countries', 'Population_2021', 'Cases']
cov_num = pd.read_csv(
    '../input/additional-data-wiki-22nov/additional_data22Nov_wiki.csv',
    usecols=cov_cols,
)
cov_num_tots = pd.merge(diff_18_21, cov_num, how='left', on='Countries')

In [None]:
# Recorded cases as a share of the population serve as a loose indicator of how
# hard the pandemic hit a country.
#
# dropna() then removes every row that has a missing value in any column.
# NOTE(review): the original note says this is meant to drop the
# 'Republic of Korea' row (which the author describes as North Korea) because no
# case count is reported for it - confirm that this is the only row removed.
cov_num_tots = (
    cov_num_tots
    .assign(**{'cov%pop': cov_num_tots['Cases'] / cov_num_tots['Population_2021']})
    .dropna()
    .sort_values(by='cov%pop', ascending=False)
)
cov_num_tots.head()

In [None]:
# Per-country change for women, renamed so it can be joined on 'Countries'.
woman_inc = viz_merge[['Country', 'tot_incr/dcr_w']].rename(
    columns={'Country': 'Countries', 'tot_incr/dcr_w': 'diff_18_21_W'}
)

# Left join keeps every country of the covid table; missing values are set to -1.
# NOTE(review): that -1 is then used by the plots and correlations that follow as
# if it were a measured value - confirm this sentinel is intended.
cov_num_tots_m_w = pd.merge(cov_num_tots, woman_inc, how='left', on='Countries').fillna(-1)

In [None]:
import matplotlib.pyplot as plt

# Two stacked bar charts sharing the same country order (rows of cov_num_tots_m_w):
# top = covid share vs. overall change, bottom = covid share vs. change for women.
fig, (ax1, ax2) = plt.subplots(2, 1, figsize=(19, 8))

cov_num_tots_m_w[['cov%pop', 'diff_18_21']].plot.bar(width=.6, ax=ax1, color=['pink', 'green'])
# Country names are shown once, under the bottom panel only.
ax1.set_xticklabels([])
ax1.set_title(
    'Difference between Kaggle numbers 2018/2021  plotted against Covid19 cases: Overall',
    fontsize=14,
)

cov_num_tots_m_w[['cov%pop', 'diff_18_21_W']].plot.bar(width=.6, ax=ax2, color=['pink', 'blue'])
ax2.set_xticklabels(cov_num_tots_m_w['Countries'])
# Title set on the axes object itself rather than on pyplot's implicit current axes.
ax2.set_title(
    'Difference between Kaggle numbers 2018/2021 plotted against Covid19 cases: Women',
    fontsize=14,
)

# Run the layout pass last, once the titles and tick labels it has to fit exist.
fig.tight_layout()



In [None]:
# Correlation of the covid share with the overall change, then with the change
# for women, across all countries in the table.
raw_corr_all = cov_num_tots_m_w.loc[:, ['cov%pop', 'diff_18_21']].corr()
raw_corr_w = cov_num_tots_m_w.loc[:, ['cov%pop', 'diff_18_21_W']].corr()

print(raw_corr_all, '\n\n')
print(raw_corr_w)

In [None]:
# Outlier detection on the overall numbers, using the 90th / 10th percentiles of
# both the covid share and the 2018 -> 2021 change.
covid_share = cov_num_tots_m_w['cov%pop']
overall_diff = cov_num_tots_m_w['diff_18_21']

# Upper tail: covid share AND change both at or above their 90th percentile.
high = covid_share.quantile(.90)
high_k = overall_diff.quantile(.90)
high_cov_low_drop = cov_num_tots_m_w.loc[covid_share.ge(high) & overall_diff.ge(high_k)]
h_l = high_cov_low_drop['Countries'].to_list()
print('Outliers(upper 10%):\nCountries with high covid19 cases and large increase in overall Kagglers:\n',len(h_l))

# Lower tail: covid share AND change both at or below their 10th percentile.
low = covid_share.quantile(.10)
low_k = overall_diff.quantile(.10)
low_cov_high_drop = cov_num_tots_m_w.loc[covid_share.le(low) & overall_diff.le(low_k)]
l_h = low_cov_high_drop['Countries'].to_list()
print('Outliers(bottom 10%):\nCountries with low covid19 cases and large drop in overall Kagglers:\n', l_h)

In [None]:
# Outlier detection on the women's numbers: same 90th / 10th percentile rule as
# for the overall numbers, applied to 'diff_18_21_W'.
w_covid_share = cov_num_tots_m_w['cov%pop']
w_diff = cov_num_tots_m_w['diff_18_21_W']

# Upper tail: covid share AND change for women both at or above their 90th percentile.
w_high = w_covid_share.quantile(.90)
w_high_k = w_diff.quantile(.90)
w_high_cov_low_drop = cov_num_tots_m_w.loc[(w_covid_share >= w_high) & (w_diff >= w_high_k)]
w_h_l = w_high_cov_low_drop['Countries'].to_list()
print('Outliers(upper 10%):\nCountries with high covid19 cases and large increase in female Kagglers:\n', len(w_h_l))

# Lower tail: covid share AND change for women both at or below their 10th percentile.
w_low = w_covid_share.quantile(.10)
w_low_k = w_diff.quantile(.10)
w_low_cov_high_drop = cov_num_tots_m_w.loc[(w_covid_share <= w_low) & (w_diff <= w_low_k)]
w_l_h = w_low_cov_high_drop['Countries'].to_list()
# This message used to say "overall Kagglers" although the filter is on the women's column.
print('Outliers(bottom 10%):\nCountries with low covid19 cases and large drop in female Kagglers:\n', w_l_h)


In [None]:
# Recompute both correlations after removing the bottom-10% outlier countries.
# Rows are filtered on the 'Countries' column directly. The previous version
# compared every cell of the frame against the country list, masked the matches
# to NaN and relied on dropna() to remove the affected rows, which selects the
# same rows in a roundabout way.
cov_no_outl = cov_num_tots_m_w[~cov_num_tots_m_w['Countries'].isin(l_h)]
corr_all = cov_no_outl[['cov%pop', 'diff_18_21']].corr()

w_cov_no_outl = cov_num_tots_m_w[~cov_num_tots_m_w['Countries'].isin(w_l_h)]
w_corr = w_cov_no_outl[['cov%pop', 'diff_18_21_W']].corr()

print('Correlation matrix, Kagglers participation change and Covid19 reported cases for all Kagglers:\n\n', corr_all)
print('\nCorrelation matrix, Kagglers participation change and Covid19 reported cases for female Kagglers:\n\n', w_corr)

The above seems to suggest that the pandemic has had a 'low to moderate' negative effect on Kaggle participation; the impact overall seems to be more or less equal among all the genders.

It must be noted that the recording of covid19 cases is not uniform across countries and measures taken to limit the spread of the virus, including limiting movement and access (work/academia), and their implementation vary from country to country also. Here, I used the covid19 recorded cases numbers, in order to gain an intuition of how the Pandemic might have affected Kaggle participation in different countries.

**Conclusions:**

I chose to focus on 'change' as a way to explore if diversity is increasing in the field; a discipline needs diversity across age, gender, and cultural background to prevent 'tunnel vision' and continue thriving.
In order to uncover diversity in the community, I looked for changes in who, where, and what the Kagglers do. The data has confirmed that the Kaggle community has changed: new countries have appeared, female participation has increased overall, student numbers have increased, and Kagglers’ employment range is reasonably varied. Although the unemployment numbers hint at a difficult past two years, they also imply that more people are choosing to use Kaggle to retrain or update/upgrade their skill-set during the downtime. Encouragingly, Kaggle is attracting people from a wide range of age groups.




# Appendix
<a id='Appendix'></a>

Code used to gather the additional data below (commented out). The resulting dataframes have been saved as Kaggle datasets (public):
 
 https://www.kaggle.com/danielanewton/additional-data-wiki-22nov 
 
 https://www.kaggle.com/danielanewton/survey-totals-merged-data

Kaggle Survey Sources:

../input/kaggle-survey-2018

../input/kaggle-survey-2019

../input/kaggle-survey-2020

../input/kaggle-survey-2021

Additional data source:

https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)

https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population

https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory#wrapper65150380

https://en.wikipedia.org/wiki/List_of_countries_by_past_fertility_rate

https://en.wikipedia.org/wiki/Urbanization_by_country




In [None]:
# #Getting the data
# import pandas as pd

# # def to read the total respondents for years 2018, 2019, 2020,2021 surveys
# def make_df(urly):
#   usecols = ['Q3']
#   data = pd.read_csv(urly, usecols=usecols)
#   data.drop(0,inplace=True)
#   data =data['Q3'].value_counts().reset_index()
#   data.columns=['Countries', 'Totals']
#   return data

In [None]:
# # Make df
# d_2018 = make_df('../input/kaggle-survey-2018/multipleChoiceResponses.csv')
# d_2019 = make_df('../input/kaggle-survey-2019/multiple_choice_responses.csv')
# d_2020 = make_df('../input/kaggle-survey-2020/kaggle_survey_2020_responses.csv')
# d_2021 = make_df('../input/kaggle-survey-2021/kaggle_survey_2021_responses.csv')


In [None]:
# tot_18_to_21 = d_2018.merge(d_2019,  how="outer", left_on=['Countries'], right_on=['Countries'],suffixes=('_2018', '_2019')).fillna(0)
# tot_18_to_21 = tot_18_to_21.merge(d_2020,  how="outer", left_on=['Countries'], right_on=['Countries']).fillna(0)
# tot_18_to_21 = tot_18_to_21.rename(columns={'Totals':'Totals_2020'})
# tot_18_to_21 =tot_18_to_21.merge(d_2021,  how="outer", left_on=['Countries'], right_on=['Countries']).fillna(0)
# tot_18_to_21 = tot_18_to_21.rename(columns={'Totals':'Totals_2021'})
# tot_18_to_21.head()

In [None]:
# tot_18_to_21.to_csv('totals_by_country_18_20.csv', index=False) 

# # loaded to datasets as 'survey_totals_merged_data'

In [None]:
# # merge df totals by country for years 2018 and 2021 surveys
# tot_18_21 = d_2018.merge(d_2021,  how="outer", left_on=['Countries'], right_on=['Countries'],suffixes=('_2018', '_2021')).fillna(0)

In [None]:
# #Retrieve population data for 2018 2019
# import pandas as pd

# link = "https://en.wikipedia.org/wiki/List_of_countries_by_population_(United_Nations)"
# tables = pd.read_html(link)
# data = tables[0]

# data= data[['Country/Area', 'UN statisticalsubregion[4]', 'Population(1 July 2018)']]
# data.columns=['Countries','UN statisticalsubregion','Population_2018']
# data['Countries'] = data['Countries'].str.replace(r'\[.*?\]','')
# data['Countries'] = data['Countries'].str.strip()

# #Adjust country names to match Kaggle
# data['Countries'] = data['Countries'].replace('United States', 'United States of America')
# data['Countries'] = data['Countries'].replace('United Kingdom', 'United Kingdom of Great Britain and Northern Ireland')
# data['Countries'] = data['Countries'].replace('Iran', 'Iran, Islamic Republic of...')
# data['Countries'] = data['Countries'].replace('Vietnam', 'Viet Nam')
# data['Countries'] = data['Countries'].replace('Hong Kong (China)','Hong Kong (S.A.R.)')
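# # NOTE(review): 'Republic of Korea' is the official name of South Korea, so this line assigns North Korea's figures to the South Korean survey entry - confirm the intended mapping.
# data['Countries'] = data['Countries'].replace('North Korea','Republic of Korea')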


# link2021 = "https://en.wikipedia.org/wiki/List_of_countries_and_dependencies_by_population"

# tables21 = pd.read_html(link2021)

# data21 = tables21[0].iloc[1:]

# #Clean up columns
# data21 = data21[['Country or dependent territory', 'Population', ]]
# data21.columns=['Countries','Population_2021']
# data21['Countries'] = data21['Countries'].str.replace(r'\[.*?\]','')
# data21['Countries'] = data21['Countries'].str.strip()

# #Adjust country names to match Kaggle
# data21['Countries'] = data21['Countries'].replace('United States', 'United States of America')
# data21['Countries'] = data21['Countries'].replace('United Kingdom', 'United Kingdom of Great Britain and Northern Ireland')
# data21['Countries'] = data21['Countries'].replace('Iran', 'Iran, Islamic Republic of...')
# data21['Countries'] = data21['Countries'].replace('Vietnam', 'Viet Nam')
# data21['Countries'] = data21['Countries'].replace('Hong Kong (China)','Hong Kong (S.A.R.)')
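# # NOTE(review): 'Republic of Korea' is the official name of South Korea, so this line assigns North Korea's figures to the South Korean survey entry - confirm the intended mapping.
# data21['Countries'] = data21['Countries'].replace('North Korea','Republic of Korea')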

# #Add data to Kaggle survey totals for 2018 and 2021
# pop_data_2018 = tot_18_21.merge(data, how='left', left_on=['Countries'], right_on=['Countries'])
# pop_data_21_18 = pop_data_2018.merge(data21, how='left', left_on=['Countries'], right_on=['Countries'])

In [None]:
# #Covid cases totals 

# import pandas as pd

# urli = 'https://en.wikipedia.org/wiki/COVID-19_pandemic_by_country_and_territory#wrapper65150380'

# covid = pd.read_html(urli)
# covid21 = covid[9].iloc[1:-1]
# covid_cases= covid21[['Country.1','Cases']]
# covid_cases['Country.1']= covid_cases['Country.1'].str.replace(r'\[.*?\]','')
# covid_cases['Country.1'] = covid_cases['Country.1'].str.strip()

# covid_cases = covid_cases.rename(columns={'Country.1':'Countries'})

# covid_cases['Countries'] = covid_cases['Countries'].replace('United States', 'United States of America')
# covid_cases['Countries'] = covid_cases['Countries'].replace('United Kingdom', 'United Kingdom of Great Britain and Northern Ireland')
# covid_cases['Countries'] = covid_cases['Countries'].replace('Iran', 'Iran, Islamic Republic of...')
# covid_cases['Countries'] = covid_cases['Countries'].replace('Vietnam', 'Viet Nam')
# covid_cases['Countries'] = covid_cases['Countries'].replace('Hong Kong', 'Hong Kong (S.A.R.)')
# covid_cases['Countries'] = covid_cases['Countries'].replace('Republic of Ireland', 'Ireland')


# pop_data_21_18 = pop_data_21_18.merge( covid_cases, how='left', left_on=['Countries'], right_on=['Countries'])

In [None]:
# # Population fertility

# fertility_rate = 'https://en.wikipedia.org/wiki/List_of_countries_by_past_fertility_rate'

# tables_fertility = pd.read_html(fertility_rate )

# fertility = tables_fertility[1].iloc[:]
# fertility.drop(columns=['1950–1955', '1955–1960', '1960–1965',
#        '1965–1970', '1970–1975', '1975–1980', '1980–1985', '1985–1990',
#        '1990–1995', '1995–2000'], inplace=True)

# fertility['Country/dependent territory']= fertility['Country/dependent territory'].str.replace(r'\[.*?\]','')
# fertility['Country/dependent territory'] = fertility['Country/dependent territory'].str.strip()

# fertility = fertility.rename(columns={'Country/dependent territory':'Countries'})

# #adjust countries names to match Kaggle survey
# fertility['Countries'] = fertility['Countries'].replace('United States', 'United States of America')
# fertility['Countries'] = fertility['Countries'].replace('United Kingdom', 'United Kingdom of Great Britain and Northern Ireland')
# fertility['Countries'] = fertility['Countries'].replace('Iran', 'Iran, Islamic Republic of...')
# fertility['Countries'] = fertility['Countries'].replace('Vietnam', 'Viet Nam')
# fertility['Countries'] = fertility['Countries'].replace('Hong Kong (China)', 'Hong Kong (S.A.R.)')
# fertility['Countries'] = fertility['Countries'].replace('Republic of Ireland', 'Ireland')
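# # NOTE(review): 'Republic of Korea' is the official name of South Korea, so this line assigns North Korea's figures to the South Korean survey entry - confirm the intended mapping.
# fertility['Countries'] = fertility['Countries'].replace('North Korea','Republic of Korea')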

# pop_data_21_18 = pop_data_21_18.merge(fertility, how='left', left_on=['Countries'], right_on=['Countries'])

In [None]:
# # Urbanization data
# import pandas as pd

# urb = 'https://en.wikipedia.org/wiki/Urbanization_by_country'

# urban_table = pd.read_html(urb)
# urban = urban_table[0].iloc[:]

# urban['Nation']= urban['Nation'].str.replace(r'\[.*?\]','')
# urban['Nation']= urban['Nation'].str.strip()
# urban['Urbanization Rate (%)']=urban['Urbanization Rate (%)'].str.replace(r'\[.*?\]','')
# urban['Urbanization Rate (%)']=urban['Urbanization Rate (%)'].str.strip()
# urban= urban.rename(columns={'Urbanization Rate (%)':'Urbanization Rate_(%)_2015–20_est.','Nation':'Countries' })
# urban.drop(columns=['Rank', 'Period'], inplace=True)

# #adjust countries names to match Kaggle survey
# urban['Countries'] = urban['Countries'].replace('United States', 'United States of America')
# urban['Countries'] = urban['Countries'].replace('United Kingdom', 'United Kingdom of Great Britain and Northern Ireland')
# urban['Countries'] = urban['Countries'].replace('Iran', 'Iran, Islamic Republic of...')
# urban['Countries'] = urban['Countries'].replace('Vietnam', 'Viet Nam')
# urban['Countries'] = urban['Countries'].replace('Hong Kong (China)', 'Hong Kong (S.A.R.)')
# urban['Countries'] = urban['Countries'].replace('Republic of Ireland', 'Ireland')
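# # NOTE(review): 'Republic of Korea' is the official name of South Korea, so this line assigns North Korea's figures to the South Korean survey entry - confirm the intended mapping.
# urban['Countries'] = urban['Countries'].replace('North Korea','Republic of Korea')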

# pop_data_21_18 = pop_data_21_18.merge(urban, how='left', left_on=['Countries'], right_on=['Countries'])

In [None]:
#pop_data_21_18.to_csv('additional_data22Nov_wiki.csv', index=False)

# # loaded to datasets as 'additional_data_wiki_22Nov'