In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, log_loss

import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and data-quality
# warnings from pandas/sklearn; consider narrowing it to specific categories.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Use seaborn's "whitegrid" theme for all subsequent plots.
sns.set_theme(style="whitegrid")

In [18]:
# Every workbook keeps its table on the 'Data' sheet and marks missing
# values with either three ASCII dots or a single ellipsis character;
# both are parsed as NaN.
excel_opts = dict(sheet_name='Data', na_values=['...', '…'])

# The column-header row sits at row index 3 in these three workbooks ...
df_teaching = pd.read_excel('Teaching staff.xlsx', header=3, **excel_opts)
df_women_legislators = pd.read_excel(
    'Women legislators and managers.xlsx', header=3, **excel_opts
)
df_legal_age = pd.read_excel('Legal Age for Marriage.xlsx', header=3, **excel_opts)

# ... and at row index 5 in these two.
df_marriages = pd.read_excel('Marriages.xlsx', header=5, **excel_opts)
df_part_time = pd.read_excel('Part-time employment.xlsx', header=5, **excel_opts)

In [19]:
# Print the column names, dtypes and non-null counts of the freshly loaded
# teaching-staff frame to see which columns carry data.
df_teaching.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country or area  205 non-null    object 
 1   Year             195 non-null    float64
 2   Unnamed: 2       20 non-null     object 
 3   %                195 non-null    float64
 4   Unnamed: 4       0 non-null      float64
 5   Year.1           181 non-null    float64
 6   Unnamed: 6       37 non-null     object 
 7   %.1              181 non-null    float64
 8   Unnamed: 8       0 non-null      float64
 9   Year.2           163 non-null    float64
 10  Unnamed: 10      26 non-null     object 
 11  %.2              163 non-null    float64
dtypes: float64(8), object(4)
memory usage: 19.3+ KB


In [20]:
# Remove the two auto-named ('Unnamed: N') columns from the marriages frame
# in a single call.
df_marriages = df_marriages.drop(columns=['Unnamed: 4', 'Unnamed: 7'])

In [None]:
# Relabel the marriages columns by position.
#
# A pandas Index is meant to be immutable: writing into `df.columns.values`
# pokes the backing array behind pandas' back, which is unsupported and can
# leave label lookups inconsistent. Build the full label list and assign it
# to `df.columns` instead, which installs a fresh Index.
marriage_labels = df_marriages.columns.tolist()
marriage_renames = {
    0: 'Country',
    2: 'population women 15-19 ever married (%)',
    3: 'population men 15-19 ever married (%)',
    4: 'women average single age before marriage (year)',
    5: 'men average single age before marriage (year)',
    6: 'Source',
}
for position, label in marriage_renames.items():
    marriage_labels[position] = label  # IndexError if the layout is narrower than expected
df_marriages.columns = marriage_labels
df_marriages.head()

Unnamed: 0,Country,Year,population women 15-19 ever married (%),population men 15-19 ever married (%),women average single age before marriage (year),men average single age before marriage (year),Source
0,Afghanistan,2010,17.3,,21.5,,DHS
1,Albania,2011,7.2,2.0,25.1,29.2,NS
2,Algeria,2002,1.8,0.2,29.5,33.0,PAPFAM
3,American Samoa,2000,21.4,,25.7,,USCB
4,Angola,1970,35.7,7.6,19.4,24.5,USCB


In [22]:
# Relabel the women-legislators columns by position.
#
# Assign a complete label list to `df.columns` rather than writing into
# `df.columns.values`: the Index is meant to be immutable, and mutating its
# backing array in place is unsupported and can leave label lookups stale.
legislator_labels = df_women_legislators.columns.tolist()
legislator_labels[0] = 'Country'
legislator_labels[2] = 'Type'
legislator_labels[3] = 'Women legislators (%)'
df_women_legislators.columns = legislator_labels
df_women_legislators.head()

Unnamed: 0,Country,Year,Type,Women legislators (%),Source
0,Algeria,2004,,4.870624,LFS
1,Anguilla,2001,,52.03252,PC
2,Antigua and Barbuda,2001,,45.059786,PC
3,Argentina,2006,"a,b",23.120202,LFS
4,Armenia,2001,,23.939559,PC


In [23]:
# Discard two columns from the legal-age frame in one call. The first label
# carries a trailing space, exactly as it appears in the loaded frame.
df_legal_age = df_legal_age.drop(columns=["With parental consent ", "Unnamed: 10"])

In [24]:
# Relabel the first ten legal-age columns by position.
#
# Assign a complete label list to `df.columns` rather than writing into
# `df.columns.values`: the Index is meant to be immutable, and mutating its
# backing array in place is unsupported and can leave label lookups stale.
#
# NOTE(review): four columns end up sharing the label 'Type', so
# `df_legal_age['Type']` selects all four at once. The labels are kept as-is
# to avoid breaking later cells; consider giving each a distinct name.
# NOTE(review): the first row shown by head() holds sub-header text
# (women/men/year) rather than a country record; confirm whether it should
# be dropped.
legal_age_labels = df_legal_age.columns.tolist()
legal_age_renames = [
    'Country',
    'women w/o parental consent minimum age',
    'Type',
    'Men w/o parental consent minimum age',
    'Type',
    'women with parental consent minimum age',
    'Type',
    'Men with parental consent minimum age',
    'Type',
    'Year',
]
for position, label in enumerate(legal_age_renames):
    legal_age_labels[position] = label  # IndexError if fewer than ten columns exist
df_legal_age.columns = legal_age_labels
df_legal_age.head()

Unnamed: 0,Country,women w/o parental consent minimum age,Type,Men w/o parental consent minimum age,Type.1,women with parental consent minimum age,Type.2,Men with parental consent minimum age,Type.3,Year,Source
0,,women,,men,,women,,men,,year,
1,Afghanistan,16,,18,,15,,,,2011,UNPD
2,Aland Islands1,18,,18,,,,,,2011,UNSD
3,Albania,,,,,16,,18,,2011,UNSD
4,Algeria,19,,19,,,,,,2012,CEDAW (51st session)


In [25]:
# Drop the auto-named 'Unnamed: 2' column, then relabel the remaining
# part-time columns by position (positions refer to the layout after the drop).
#
# Assign a complete label list to `df.columns` rather than writing into
# `df.columns.values`: the Index is meant to be immutable, and mutating its
# backing array in place is unsupported and can leave label lookups stale.
df_part_time = df_part_time.drop(columns='Unnamed: 2')

part_time_labels = df_part_time.columns.tolist()
part_time_renames = {
    0: 'Country',
    2: 'Type',
    3: 'Female part-time (%)',
    4: 'Male part-time (%)',
    5: 'Female part-time/Total part-time',
}
for position, label in part_time_renames.items():
    part_time_labels[position] = label  # IndexError if the layout is narrower than expected
df_part_time.columns = part_time_labels

In [30]:
# Preview the first rows of the part-time frame to check the relabelled columns.
df_part_time.head()

Unnamed: 0,Country,Year,Type,Female part-time (%),Male part-time (%),Female part-time/Total part-time,Source
0,Albania,2001,"a,b,c,d",50.200001,46.099998,39.700001,O
1,Argentina,2011,"e,f,g,h",31.299999,11.9,64.800003,LFS
2,Armenia,2008,"b,i,j",30.1,19.0,56.200001,LFS
3,Aruba,1994,"e,k,j",12.3,4.0,71.0,HS
4,Australia,2011,"l,b,f,m,n",38.5,13.2,70.900002,LFS


In [27]:
# Relabel the teaching-staff columns by position. Positions refer to the
# 12-column layout as loaded (before the 'Unnamed: N' columns are dropped):
# 3, 7 and 11 are the three percentage columns ('%', '%.1', '%.2').
#
# Assign a complete label list to `df.columns` rather than writing into
# `df.columns.values`: the Index is meant to be immutable, and mutating its
# backing array in place is unsupported and can leave label lookups stale.
teaching_labels = df_teaching.columns.tolist()
teaching_renames = {
    0: 'Country',
    3: 'Female Teachers Primary (%)',
    7: 'Female Teachers Secondary (%)',
    11: 'Female Teachers Tertiary (%)',
}
for position, label in teaching_renames.items():
    teaching_labels[position] = label  # IndexError if the layout is narrower than expected
df_teaching.columns = teaching_labels

In [28]:
# Strip the five auto-named ('Unnamed: N') columns from the teaching-staff
# frame in a single call instead of one drop per column.
unnamed_teaching_cols = [
    'Unnamed: 2',
    'Unnamed: 4',
    'Unnamed: 6',
    'Unnamed: 8',
    'Unnamed: 10',
]
df_teaching = df_teaching.drop(columns=unnamed_teaching_cols)