# **Analysing Gender Pay Gap**

# *Importing Libraries*

In [1]:
# --- Data handling and plotting libraries ---
import numpy as np # linear algebra
import pandas as pd
import plotly.graph_objs as go

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.express as px
# NOTE: `py` is the plotly.offline module in this notebook (used for py.iplot).
import plotly.offline as py
import seaborn as sns

# Importing pandas_profiling is what makes DataFrame.profile_report() available.
import pandas_profiling

import random
# Apply the "fivethirtyeight" matplotlib style to all subsequent matplotlib plots.
plt.style.use("fivethirtyeight")

import warnings
# NOTE(review): this silences every warning for the rest of the session,
# including deprecation warnings from pandas/plotly; consider narrowing the filter.
warnings.filterwarnings('ignore')

import os
# Print the full path of every file found under the Kaggle input directory,
# so the dataset location used by the loading cell can be confirmed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/glassdoor-analyze-gender-pay-gap/Glassdoor Gender Pay Gap.csv


# *Loading Dataset*

In [2]:
# Location of the Glassdoor pay-gap CSV inside the Kaggle input directory.
DATA_PATH = '/kaggle/input/glassdoor-analyze-gender-pay-gap/Glassdoor Gender Pay Gap.csv'

# Load the raw dataset into the notebook-wide DataFrame `df`.
df = pd.read_csv(DATA_PATH)

# *Basic Data Analysis*

In [3]:
# Build the pandas_profiling report for the whole DataFrame; as the last
# expression of the cell, the returned report is rendered in the cell output.
df.profile_report()

Summarize dataset:   0%|          | 0/22 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [4]:
# Preview the first rows to check the column names and value formats.
df.head()

Unnamed: 0,JobTitle,Gender,Age,PerfEval,Education,Dept,Seniority,BasePay,Bonus
0,Graphic Designer,Female,18,5,College,Operations,2,42363,9938
1,Software Engineer,Male,21,5,College,Management,5,108476,11128
2,Warehouse Associate,Female,19,4,PhD,Administration,5,90208,9268
3,Software Engineer,Male,20,5,Masters,Sales,4,108080,10154
4,Graphic Designer,Male,26,5,Masters,Engineering,5,99464,9319


In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
df.describe()

Unnamed: 0,Age,PerfEval,Seniority,BasePay,Bonus
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,41.393,3.037,2.971,94472.653,6467.161
std,14.294856,1.423959,1.395029,25337.493272,2004.377365
min,18.0,1.0,1.0,34208.0,1703.0
25%,29.0,2.0,2.0,76850.25,4849.5
50%,41.0,3.0,3.0,93327.5,6507.0
75%,54.25,4.0,4.0,111558.0,8026.0
max,65.0,5.0,5.0,179726.0,11293.0


In [6]:
# Column dtypes and non-null counts; a quick check for missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   JobTitle   1000 non-null   object
 1   Gender     1000 non-null   object
 2   Age        1000 non-null   int64 
 3   PerfEval   1000 non-null   int64 
 4   Education  1000 non-null   object
 5   Dept       1000 non-null   object
 6   Seniority  1000 non-null   int64 
 7   BasePay    1000 non-null   int64 
 8   Bonus      1000 non-null   int64 
dtypes: int64(5), object(4)
memory usage: 70.4+ KB


In [7]:
# Add a TotalPay column holding the row-wise sum of BasePay and Bonus.
total_pay = df['BasePay'] + df['Bonus']
df['TotalPay'] = total_pay

In [8]:
# Number of rows per job title, most frequent first.
print('Job Titles: ')
df['JobTitle'].value_counts()

Job Titles: 


Marketing Associate    118
Software Engineer      109
Financial Analyst      107
Data Scientist         107
Graphic Designer        98
IT                      96
Sales Associate         94
Driver                  91
Warehouse Associate     90
Manager                 90
Name: JobTitle, dtype: int64

In [9]:
# Number of rows per education level, most frequent first.
print('Education Level: ')
df['Education'].value_counts()

Education Level: 


High School    265
Masters        256
College        241
PhD            238
Name: Education, dtype: int64

# *Visualisation*

In [10]:
# Number of rows recorded for each gender.
# size() counts rows directly; count() would tally non-null values per column,
# so picking one column from it under-reports if that column has missing values.
gender = df.groupby('Gender').size()

# One bar per gender, labelled with its count and outlined in black.
fig = go.Figure(data=[go.Bar(
    x=gender.index,
    y=gender.values,
    text=gender.values,          # textposition has no effect unless text is set
    textposition='auto',
    width=0.3,
    marker=dict(line=dict(width=1, color="black")),
)])

fig.update_layout(
    title='No of Male and Female Job Entries on the Dataset',
    xaxis=dict(title='Gender'),
    yaxis=dict(title=''),
    width=500,
    height=500,
)
fig.show()

In [11]:
# Row count for every (department, gender) pair, as a flat table with a 'counts' column.
pair_sizes = df.groupby(['Dept', 'Gender']).size()
gender_dept = pair_sizes.reset_index(name='counts')

# Grouped bars: one cluster per department, one bar per gender.
fig = px.bar(
    gender_dept,
    x='Dept',
    y='counts',
    color='Gender',
    barmode='group',
    title='Count Gender per Department',
)
fig.show()

In [12]:
# Per-department head-count split by gender.
# Only Dept and Gender are passed to get_dummies so the grouped sum touches
# nothing but the two indicator columns (Gender_Female / Gender_Male) instead
# of aggregating every unrelated column of df.
dept_counts = (
    pd.get_dummies(df[['Dept', 'Gender']], columns=['Gender'])
    .groupby('Dept')
    .sum()
)

# Two donut charts side by side: left half for women, right half for men.
female = go.Pie(labels=dept_counts.index, values=dept_counts['Gender_Female'],
                name="Female", hole=0.5, domain={'x': [0, 0.46]})
male = go.Pie(labels=dept_counts.index, values=dept_counts['Gender_Male'],
              name="Male", hole=0.5, domain={'x': [0.52, 1]})

# The annotations place a caption inside the hole of each donut.
layout = dict(title='Department Distribution', font=dict(size=14), legend=dict(orientation="h"),
              annotations=[dict(x=0.2, y=0.5, text='Female', showarrow=False, font=dict(size=20)),
                           dict(x=0.8, y=0.5, text='Male', showarrow=False, font=dict(size=20))])

# Build a real Figure and render it with show(), as the other chart cells do.
fig = go.Figure(data=[female, male], layout=layout)
fig.show()

In [13]:
# Row count for every (job title, gender) pair, as a flat table with a 'counts' column.
pair_sizes = df.groupby(['JobTitle', 'Gender']).size()
gender_job = pair_sizes.reset_index(name='counts')

# Grouped bars: one cluster per job title, one bar per gender.
fig = px.bar(
    gender_job,
    x='JobTitle',
    y='counts',
    color='Gender',
    barmode='group',
    title='Count Gender per JobTitle',
)
fig.show()

In [14]:
# Per-job-title head-count split by gender.
# Only JobTitle and Gender are passed to get_dummies so the grouped sum touches
# nothing but the two indicator columns (Gender_Female / Gender_Male) instead
# of aggregating every unrelated column of df.
title = (
    pd.get_dummies(df[['JobTitle', 'Gender']], columns=['Gender'])
    .groupby('JobTitle')
    .sum()
)

# Two donut charts side by side: left half for women, right half for men.
female = go.Pie(labels=title.index, values=title['Gender_Female'],
                name="Female", hole=0.5, domain={'x': [0, 0.46]})
male = go.Pie(labels=title.index, values=title['Gender_Male'],
              name="Male", hole=0.5, domain={'x': [0.52, 1]})

# The annotations place a caption inside the hole of each donut.
layout = dict(title='Job Title Distribution', font=dict(size=14), legend=dict(orientation="h"),
              annotations=[dict(x=0.2, y=0.5, text='Female', showarrow=False, font=dict(size=20)),
                           dict(x=0.8, y=0.5, text='Male', showarrow=False, font=dict(size=20))])

# Build a real Figure and render it with show(), as the other chart cells do.
fig = go.Figure(data=[female, male], layout=layout)
fig.show()