In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns

# Read the public survey export into a DataFrame and preview its first rows
survey_path = "../input/survey_results_public.csv"
df = pd.read_csv(survey_path)
df.head()

In [None]:
# Overview: (number of rows, number of columns) of the loaded survey data
df.shape

# Differences in Salary based on Gender
Business question: How does salary differ between genders, and how does that difference change with years of coding experience?

In [None]:
# Inspect the distinct answers given for 'Gender' and how often each one occurs
df['Gender'].value_counts()

In [None]:
# Mean salary for each gender.
# 'Salary' is selected before aggregating so that only this column is averaged.
# Calling .mean() on the whole grouped frame would try to average every column,
# which raises a TypeError on the text columns in pandas >= 2.0 (and is wasted
# work on older versions).
df.groupby('Gender')['Salary'].mean()

In [None]:
# Flag the respondents who left 'Gender' unanswered
gender_missing = df['Gender'].isnull()
# Absolute number of missing answers
print(gender_missing.sum())
# Share of missing answers among all respondents
print(gender_missing.mean())

In [None]:
# About a third (32%) of the participants gave no answer for 'Gender'.
# Keep only the rows in which 'Gender' is present.
has_gender = df['Gender'].notna()
df = df[has_gender]
print(df.shape)

In [None]:
# Simplification for this task: respondents are bucketed into male, female and other.
# A more thorough study should of course respect every gender answer on its own.

# Row masks for the two exact answers; everything else falls into 'other'
is_female = df['Gender'] == 'Female'
is_male = df['Gender'] == 'Male'

# One dataframe per bucket
df_female = df[is_female]
df_male = df[is_male]
df_other = df[~(is_male | is_female)]

# Print the size of each bucket, followed by their total
group_sizes = [df_female.shape[0], df_male.shape[0], df_other.shape[0]]
for size in group_sizes:
    print(size)
print(sum(group_sizes))

In [None]:
# Display mean salary grouped by years of coding experience for female participants.
# 'Salary' is selected before aggregating: averaging the whole grouped frame first
# raises a TypeError on non-numeric columns in pandas >= 2.0.
df_female.groupby('YearsProgram')['Salary'].mean()


In [None]:
# Fill missing numeric answers of df_female with the respective column mean and display
# the per-experience salary means again.
# This fills 'Salary' with the average female salary, but it also fills every other
# numeric column with its own mean, because the whole frame is passed to fillna().
# numeric_only=True restricts the means to numeric columns; without it pandas >= 2.0
# raises a TypeError as soon as the frame contains text columns.
# Caveat: mean imputation pulls each experience bucket towards the group average.
df_female = df_female.fillna(df_female.mean(numeric_only=True))
df_female.groupby('YearsProgram')['Salary'].mean()

In [None]:
# Mean salary per experience bucket for the female group, plus a numeric 'sort' column so
# the text labels can be ordered chronologically instead of alphabetically.
# 'Salary' is selected before aggregating to avoid averaging non-numeric columns
# (a TypeError in pandas >= 2.0).
df_female_salary = df_female.groupby('YearsProgram')['Salary'].mean().reset_index()

# Rank = first number found in the label + 1; labels without any number rank first.
# NOTE(review): assumes labels of the form 'Less than a year', '1 to 2 years', ...,
# '20 or more years'. For those labels this reproduces the previous hand-written ranks
# [2, 11, 12, ..., 10, 1] exactly - confirm against the actual 'YearsProgram' values.
# Deriving the rank from the label (instead of assigning a fixed 21-element list by
# position) keeps working when a bucket is absent from this group.
lower_bound = df_female_salary['YearsProgram'].str.extract(r'(\d+)', expand=False)
df_female_salary['sort'] = pd.to_numeric(lower_bound).fillna(0).astype(int) + 1
df_female_salary.sort_values(by='sort', ascending=True).head(100)

In [None]:
# Bar chart of the female mean salary per experience bucket, ordered by the 'sort' rank
female_sorted = df_female_salary.sort_values(by='sort', ascending=True)
female_sorted.plot(x='YearsProgram', y='Salary', kind='bar', color='blue');

In [None]:
# Repeat the same approach for the male group: mean salary per years of coding experience.
# 'Salary' is selected before aggregating: averaging the whole grouped frame first
# raises a TypeError on non-numeric columns in pandas >= 2.0.
df_male.groupby('YearsProgram')['Salary'].mean()

In [None]:
# Fill missing numeric answers of df_male with the respective column mean and display
# the per-experience salary means again.
# This fills 'Salary' with the average male salary, but it also fills every other
# numeric column with its own mean, because the whole frame is passed to fillna().
# numeric_only=True restricts the means to numeric columns; without it pandas >= 2.0
# raises a TypeError as soon as the frame contains text columns.
# Caveat: mean imputation pulls each experience bucket towards the group average.
df_male = df_male.fillna(df_male.mean(numeric_only=True))
df_male.groupby('YearsProgram')['Salary'].mean()

In [None]:
# Mean salary per experience bucket for the male group, plus a numeric 'sort' column so
# the text labels can be ordered chronologically instead of alphabetically.
# 'Salary' is selected before aggregating to avoid averaging non-numeric columns
# (a TypeError in pandas >= 2.0).
df_male_salary = df_male.groupby('YearsProgram')['Salary'].mean().reset_index()

# Rank = first number found in the label + 1; labels without any number rank first.
# NOTE(review): assumes labels of the form 'Less than a year', '1 to 2 years', ...,
# '20 or more years'. For those labels this reproduces the previous hand-written ranks
# [2, 11, 12, ..., 10, 1] exactly - confirm against the actual 'YearsProgram' values.
# Deriving the rank from the label (instead of assigning a fixed 21-element list by
# position) keeps working when a bucket is absent from this group.
lower_bound = df_male_salary['YearsProgram'].str.extract(r'(\d+)', expand=False)
df_male_salary['sort'] = pd.to_numeric(lower_bound).fillna(0).astype(int) + 1
df_male_salary.sort_values(by='sort', ascending=True).head(100)

In [None]:
# Bar chart of the male mean salary per experience bucket, ordered by the 'sort' rank
male_sorted = df_male_salary.sort_values(by='sort', ascending=True)
male_sorted.plot(x='YearsProgram', y='Salary', kind='bar', color='blue');

In [None]:
# Same approach for the other group: mean salary per years of coding experience.
# 'Salary' is selected before aggregating: averaging the whole grouped frame first
# raises a TypeError on non-numeric columns in pandas >= 2.0.
df_other.groupby('YearsProgram')['Salary'].mean()

In [None]:
# Fill missing numeric answers of df_other with the respective column mean and display
# the per-experience salary means again.
# This fills 'Salary' with the average salary of the other group, but it also fills every
# other numeric column with its own mean, because the whole frame is passed to fillna().
# numeric_only=True restricts the means to numeric columns; without it pandas >= 2.0
# raises a TypeError as soon as the frame contains text columns.
# Caveat: mean imputation pulls each experience bucket towards the group average.
df_other = df_other.fillna(df_other.mean(numeric_only=True))
df_other.groupby('YearsProgram')['Salary'].mean()

In [None]:
# Mean salary per experience bucket for the other group, plus a numeric 'sort' column so
# the text labels can be ordered chronologically instead of alphabetically.
# 'Salary' is selected before aggregating to avoid averaging non-numeric columns
# (a TypeError in pandas >= 2.0).
df_other_salary = df_other.groupby('YearsProgram')['Salary'].mean().reset_index()

# Rank = first number found in the label + 1; labels without any number rank first.
# NOTE(review): assumes labels of the form 'Less than a year', '1 to 2 years', ...,
# '20 or more years'. For those labels this reproduces the previous hand-written ranks
# [2, 11, 12, ..., 10, 1] exactly - confirm against the actual 'YearsProgram' values.
# Deriving the rank from the label (instead of assigning a fixed 21-element list by
# position) keeps working when a bucket is absent from this (small) group.
lower_bound = df_other_salary['YearsProgram'].str.extract(r'(\d+)', expand=False)
df_other_salary['sort'] = pd.to_numeric(lower_bound).fillna(0).astype(int) + 1
df_other_salary.sort_values(by='sort', ascending=True).head(100)

In [None]:
# Bar chart of the other group's mean salary per experience bucket, ordered by the 'sort' rank
other_sorted = df_other_salary.sort_values(by='sort', ascending=True)
other_sorted.plot(x='YearsProgram', y='Salary', kind='bar', color='blue');

In [None]:
# Combine the per-group salary means into one frame with one column per group.
# - A new frame is built via rename() instead of aliasing df_female_salary and renaming
#   it in place; the in-place rename removed the 'Salary' column from df_female_salary,
#   so the earlier female cells could not be re-run afterwards.
# - The male and other means are joined on the 'YearsProgram' label rather than assigned
#   by row position, so a value cannot land on the wrong experience bucket when the
#   groups do not contain exactly the same buckets.
df_salary = df_female_salary.rename(columns={'Salary': 'Female'})
for group_name, group_salary in (('Male', df_male_salary), ('Other', df_other_salary)):
    df_salary = df_salary.merge(
        group_salary[['YearsProgram', 'Salary']].rename(columns={'Salary': group_name}),
        on='YearsProgram',
        how='left',  # keep the rows and 'sort' ranks of the female frame, as before
    )
df_salary.sort_values(by='sort', ascending=True).head(100)

In [None]:
# Line chart comparing the mean salary of the three groups across the experience buckets,
# with the buckets ordered by the 'sort' rank
salary_sorted = df_salary.sort_values(by='sort', ascending=True)
salary_sorted.plot(
    x='YearsProgram',
    y=['Male', 'Female', 'Other'],
    kind="line",
    xticks=range(len(df_salary.index)),
    rot=60,
    figsize=(15,5),
);
plt.title('Salary based on Coding Experience', fontsize=15, color='black');

## Findings
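* Salary generally progresses upward with years of coding experience.
* The values for the 'Other' group are all over the place and show no clear trend.
* The male average salary is not higher than the female average salary.
* This could be interpreted to mean that the female participants (who are fewer in number) are more qualified on average.
* These observations are tentative; the topic warrants further research.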