In [229]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree

from sklearn.metrics import r2_score, mean_squared_error, accuracy_score, log_loss

import warnings
# Silence every Python warning for the rest of the session so cell output stays uncluttered.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings from the libraries
# imported above — confirm that is intended.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline, below the cell that creates them.
%matplotlib inline
# Use seaborn's "whitegrid" style for all subsequent plots.
sns.set_theme(style="whitegrid")

In [230]:
# Reader options shared by every workbook: the table is read from the 'Data'
# sheet, and cells holding three dots or a single ellipsis character are
# treated as missing values.
_excel_opts = {'sheet_name': 'Data', 'na_values': ['...', '…']}

# These three workbooks take their column labels from row index 3.
df_teaching = pd.read_excel('Teaching staff.xlsx', header=3, **_excel_opts)
df_women_legislators = pd.read_excel('Women legislators and managers.xlsx', header=3, **_excel_opts)
df_legal_age = pd.read_excel('Legal Age for Marriage.xlsx', header=3, **_excel_opts)

# These two take their column labels from row index 5.
df_marriages = pd.read_excel('Marriages.xlsx', header=5, **_excel_opts)
df_part_time = pd.read_excel('Part-time employment.xlsx', header=5, **_excel_opts)

In [231]:
# Print the column names, non-null counts and dtypes of the teaching-staff frame.
df_teaching.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Country or area  205 non-null    object 
 1   Year             195 non-null    float64
 2   Unnamed: 2       20 non-null     object 
 3   %                195 non-null    float64
 4   Unnamed: 4       0 non-null      float64
 5   Year.1           181 non-null    float64
 6   Unnamed: 6       37 non-null     object 
 7   %.1              181 non-null    float64
 8   Unnamed: 8       0 non-null      float64
 9   Year.2           163 non-null    float64
 10  Unnamed: 10      26 non-null     object 
 11  %.2              163 non-null    float64
dtypes: float64(8), object(4)
memory usage: 19.3+ KB


In [232]:
# Remove the 'Unnamed: 4' and 'Unnamed: 7' columns in a single call.
df_marriages = df_marriages.drop(columns=['Unnamed: 4', 'Unnamed: 7'])

In [233]:
# Give the marriage-statistics columns descriptive names, addressed by position
# (position 1 keeps the label it was loaded with).
# A fresh label list is assigned to `df_marriages.columns` instead of writing
# into `df_marriages.columns.values`: that array backs an Index, which pandas
# treats as immutable, so in-place writes can leave label lookups out of sync.
marriage_labels = df_marriages.columns.tolist()
marriage_labels[0] = 'Country'
marriage_labels[2] = 'population women 15-19 ever married (%)'
marriage_labels[3] = 'population men 15-19 ever married (%)'
marriage_labels[4] = 'women average single age before marriage (year)'
marriage_labels[5] = 'men average single age before marriage (year)'
marriage_labels[6] = 'Source'
df_marriages.columns = marriage_labels
df_marriages.head()

Unnamed: 0,Country,Year,population women 15-19 ever married (%),population men 15-19 ever married (%),women average single age before marriage (year),men average single age before marriage (year),Source
0,Afghanistan,2010,17.3,,21.5,,DHS
1,Albania,2011,7.2,2.0,25.1,29.2,NS
2,Algeria,2002,1.8,0.2,29.5,33.0,PAPFAM
3,American Samoa,2000,21.4,,25.7,,USCB
4,Angola,1970,35.7,7.6,19.4,24.5,USCB


In [234]:
# Rename the women-legislators columns by position (position 1 keeps its
# loaded label). The labels are replaced by assigning a new list to
# `.columns`; writing into `.columns.values` mutates the array behind an
# immutable Index and can leave label lookups out of sync.
legislator_labels = df_women_legislators.columns.tolist()
legislator_labels[0] = 'Country'
legislator_labels[2] = 'Type'
legislator_labels[3] = 'Women legislators (%)'
df_women_legislators.columns = legislator_labels
df_women_legislators.head()

Unnamed: 0,Country,Year,Type,Women legislators (%),Source
0,Algeria,2004,,4.870624,LFS
1,Anguilla,2001,,52.03252,PC
2,Antigua and Barbuda,2001,,45.059786,PC
3,Argentina,2006,"a,b",23.120202,LFS
4,Armenia,2001,,23.939559,PC


In [235]:
# Remove both unwanted columns in one call. The first label really does end
# with a space, so it must be spelled exactly like this.
df_legal_age = df_legal_age.drop(columns=["With parental consent ", "Unnamed: 10"])

In [236]:
# Rename the first ten legal-age columns by position. A new label list is
# assigned to `.columns` instead of writing into `.columns.values`: that array
# backs an Index, which pandas treats as immutable, so in-place writes can
# leave label lookups out of sync.
# NOTE: positions 2, 4, 6 and 8 all receive the same label 'Type', exactly as
# before, so selecting 'Type' returns several columns.
legal_age_labels = df_legal_age.columns.tolist()
legal_age_labels[0] = 'Country'
legal_age_labels[1] = 'women w/o parental consent minimum age'
legal_age_labels[2] = 'Type'
legal_age_labels[3] = 'Men w/o parental consent minimum age'
legal_age_labels[4] = 'Type'
legal_age_labels[5] = 'women with parental consent minimum age'
legal_age_labels[6] = 'Type'
legal_age_labels[7] = 'Men with parental consent minimum age'
legal_age_labels[8] = 'Type'
legal_age_labels[9] = 'Year'
df_legal_age.columns = legal_age_labels

# Drop the first row when it carries no country name, then renumber the rows.
# (The intermediate `.head()` calls were removed: only the last expression of
# a cell is rendered, so they displayed nothing.)
if pd.isna(df_legal_age['Country'].iloc[0]):
    df_legal_age = df_legal_age.iloc[1:].reset_index(drop=True)
def clean_age(age):
    """Convert one raw "with parental consent" minimum-age cell to a float.

    Conversion rules:
      * ``'<N'``  -> ``N - 1`` (e.g. ``'<18'`` becomes ``17.0``)
      * ``'A-B'`` -> ``A``, the lower bound (e.g. ``'9-15'`` becomes ``9.0``)
      * any other value -> ``float(age)``
      * missing or unparseable values -> ``NaN``

    Parameters
    ----------
    age : str, number or None
        Raw cell value.

    Returns
    -------
    float
        The cleaned age, or ``np.nan`` when no number can be extracted.
        Returning NaN (rather than raising) keeps a single odd cell from
        aborting a whole ``Series.apply`` call, and matches how an unparseable
        range was already handled.
    """
    if isinstance(age, str):
        text = age.strip()  # tolerate padding such as ' <18 '
        offset = 0.0
        if text.startswith('<'):
            # "below N" is recorded as N - 1.
            text, offset = text[1:], -1.0
        elif '-' in text:
            # Keep only the lower bound of a range.
            text = text.split('-')[0]
        try:
            return float(text.strip()) + offset
        except ValueError:
            return np.nan
    try:
        return float(age)
    except (TypeError, ValueError):
        # e.g. None or another object that has no float conversion
        return np.nan

# Run clean_age over both "with parental consent" columns.
for consent_col in ('women with parental consent minimum age',
                    'Men with parental consent minimum age'):
    df_legal_age[consent_col] = df_legal_age[consent_col].apply(clean_age)
def clean_age_range(age):
    """Reduce a composite age entry to its first number.

    The "w/o parental consent" columns contain strings such as
    ``'12.5 and 19'`` or ``'12.5-18'``. For a string containing ``'and'``
    (checked first) or ``'-'``, the text before the first occurrence of that
    separator is parsed as a float; ``np.nan`` is returned if it does not
    parse. Every other value, including strings without a separator, is
    returned unchanged.
    """
    if not isinstance(age, str):
        return age
    # Order matters: 'and' takes precedence over '-'.
    for separator in ('and', '-'):
        if separator in age:
            leading_text = age.partition(separator)[0].strip()
            try:
                return float(leading_text)
            except ValueError:
                return np.nan
    return age
# Run clean_age_range over both "w/o parental consent" columns.
for no_consent_col in ('women w/o parental consent minimum age',
                       'Men w/o parental consent minimum age'):
    df_legal_age[no_consent_col] = df_legal_age[no_consent_col].apply(clean_age_range)

# Some country names carry digits (e.g. "Slovakia13"): delete every run of
# digits, then trim surrounding whitespace.
country_without_digits = df_legal_age['Country'].str.replace(r'\d+', '', regex=True)
df_legal_age['Country'] = country_without_digits.str.strip()
df_legal_age

Unnamed: 0,Country,women w/o parental consent minimum age,Type,Men w/o parental consent minimum age,Type.1,women with parental consent minimum age,Type.2,Men with parental consent minimum age,Type.3,Year,Source
0,Afghanistan,16.0,,18.0,,15.0,,,,2011,UNPD
1,Aland Islands,18.0,,18.0,,,,,,2011,UNSD
2,Albania,,,,,16.0,,18.0,,2011,UNSD
3,Algeria,19.0,,19.0,,,,,,2012,CEDAW (51st session)
4,Andorra,16.0,,16.0,,14.0,,14.0,,2001,CEDAW (25th session)
...,...,...,...,...,...,...,...,...,...,...,...
205,Venezuela,12.0,,12.0,,,,,,2011,UNSD
206,Vietnam,18.0,,20.0,,,,,,2007,CEDAW (37th session)
207,Yemen,15.0,u,15.0,u,,,,,2002,CEDAW (exceptional session 2002)
208,Zambia,21.0,,21.0,,,,,,2011,CEDAW (49th session)


In [244]:
# What are the countries with the largest minimum marriage age gaps?
# Each gap is the men's minimum age minus the women's minimum age, so the
# largest values are the countries where men must be older than women.

# Gap without parental consent.
df_legal_age['age_gap_no_consent'] = (
    df_legal_age['Men w/o parental consent minimum age']
    - df_legal_age['women w/o parental consent minimum age']
)
# Only the last expression of a cell is rendered automatically. This ranking
# used to be computed and then discarded, so it is now shown explicitly with
# display(), which the IPython kernel provides.
display(
    df_legal_age[['Country', 'age_gap_no_consent']]
    .sort_values(by='age_gap_no_consent', ascending=False)
    .head(10)
)

# Gap with parental consent.
df_legal_age['age_gap_with_consent'] = (
    df_legal_age['Men with parental consent minimum age']
    - df_legal_age['women with parental consent minimum age']
)
df_legal_age[['Country', 'age_gap_with_consent']].sort_values(by='age_gap_with_consent', ascending=False).head(10)

Unnamed: 0,Country,age_gap_with_consent
105,Lebanon,4.0
10,Aruba,3.0
31,Burundi,3.0
45,Congo,3.0
88,Iran,3.0
30,Burkina Faso,3.0
82,Haiti,3.0
141,Niue,3.0
123,Monaco,3.0
69,Gabon,3.0


In [238]:
# Remove the 'Unnamed: 2' column first; the positions used below refer to the
# frame after this drop.
df_part_time = df_part_time.drop('Unnamed: 2', axis=1)

# Rename by position (position 1 keeps its loaded label). The labels are
# replaced by assigning a new list to `.columns`; writing into
# `.columns.values` mutates the array behind an immutable Index and can leave
# label lookups out of sync.
part_time_labels = df_part_time.columns.tolist()
part_time_labels[0] = 'Country'
part_time_labels[2] = 'Type'
part_time_labels[3] = 'Female part-time (%)'
part_time_labels[4] = 'Male part-time (%)'
part_time_labels[5] = 'Female part-time/Total part-time'
df_part_time.columns = part_time_labels

In [239]:
# Preview the first rows of the part-time employment frame.
df_part_time.head()

Unnamed: 0,Country,Year,Type,Female part-time (%),Male part-time (%),Female part-time/Total part-time,Source
0,Albania,2001,"a,b,c,d",50.200001,46.099998,39.700001,O
1,Argentina,2011,"e,f,g,h",31.299999,11.9,64.800003,LFS
2,Armenia,2008,"b,i,j",30.1,19.0,56.200001,LFS
3,Aruba,1994,"e,k,j",12.3,4.0,71.0,HS
4,Australia,2011,"l,b,f,m,n",38.5,13.2,70.900002,LFS


In [240]:
# Rename the country column and the three percentage columns by position.
# The labels are replaced by assigning a new list to `.columns`; writing into
# `.columns.values` mutates the array behind an immutable Index and can leave
# label lookups out of sync.
teaching_labels = df_teaching.columns.tolist()
teaching_labels[0] = 'Country'
teaching_labels[3] = 'Female Teachers Primary (%)'
teaching_labels[7] = 'Female Teachers Secondary (%)'
teaching_labels[11] = 'Female Teachers Tertiary (%)'
df_teaching.columns = teaching_labels

In [241]:
# Remove the five 'Unnamed: N' columns in a single call.
df_teaching = df_teaching.drop(
    columns=['Unnamed: 2', 'Unnamed: 4', 'Unnamed: 6', 'Unnamed: 8', 'Unnamed: 10']
)

In [242]:
# Can we guess 