In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
%matplotlib inline

# from sklearn.linear_model import LinearRegression
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import r2_score, mean_squared_error
import seaborn as sns

# Load the public survey results into a DataFrame and preview the first rows.
df = pd.read_csv("../input/survey_results_public.csv")
df.head()

# Question 3: Popularity of job portals
**Business question:** As a German company, on which job portals can we find qualified developers? Which job portals were the most popular among the survey respondents?


In [None]:
# First look at the DataFrame and at the column of interest, 'JobProfile'.

# Dimensions of the DataFrame as (rows, columns)
print(df.shape)

# Number of rows where 'JobProfile' is missing
print(df['JobProfile'].isna().sum())

# Share of rows where 'JobProfile' is missing
print(df['JobProfile'].isna().mean())


In [None]:
# List all distinct answers for 'JobProfile' together with how often each occurs.

# Note: value_counts() drops NaN by default. That is fine here, because missing
# answers are irrelevant for finding the top n job portals. Pass dropna=False
# to value_counts() to include the NaN bucket when inspecting the column.

# Name the columns via rename_axis()/reset_index(name=...) rather than renaming
# 'index' and 'JobProfile' after the fact: the names produced by
# value_counts().reset_index() changed in pandas 2.0, and this form yields
# ['JobPortal', 'count'] on both old and new versions.
df_jobprofile = (
    df['JobProfile']
    .value_counts()
    .rename_axis('JobPortal')
    .reset_index(name='count')
)
df_jobprofile.head(10)

In [None]:
# Define function total_count to take apart multiple answers
def total_count(df, col1, col2, look_for):
    """Aggregate counts per option when one answer string can hold several options.

    For every option in ``look_for``, the values of ``col2`` are summed over all
    rows whose ``col1`` string contains that option as a substring.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with one row per distinct answer string.
    col1 : str
        Name of the column holding the answer strings.
    col2 : str
        Name of the column holding how often each answer string occurred.
    look_for : iterable of str
        Options to search for. Repeated entries are only counted once.

    Returns
    -------
    pandas.DataFrame
        Two columns named ``col1`` (the option) and ``col2`` (its summed
        count), sorted by ``col2`` in descending order. Options that do not
        occur in any row are left out.
    """
    answers = df[col1]
    counts = df[col2]

    totals = []
    # dict.fromkeys() removes repeated options while keeping their order, so a
    # duplicated entry in look_for cannot be added to its own total twice.
    for option in dict.fromkeys(look_for):
        # Literal substring match (no regex, so '.' in e.g. 'Hired.com' is not
        # a wildcard); missing answers are treated as "no match".
        mentions = answers.str.contains(option, regex=False, na=False)
        if mentions.any():
            totals.append((option, int(counts[mentions].astype(int).sum())))

    new_df = pd.DataFrame(totals, columns=[col1, col2])
    # Sort by the count column that was actually passed in, not a fixed name.
    return new_df.sort_values(col2, ascending=False)

# Define all relevant answers for 'JobProfile'.
# Every portal is listed exactly once: a repeated entry (previously "Xing")
# risks having its counts added up more than once.
# NOTE(review): the options are matched as substrings of the raw answers, so the
# spelling must match the survey data exactly. "JobSite.co.ik" was corrected to
# "JobSite.co.uk"; confirm this and the remaining spellings against the CSV.
possible_answers = ["LinkedIn", "Indeed", "Monster", "Hired.com", "Dice", "Talent.io",
                    "StepStone", "Xing", "Reed.co.uk", "CW Jobs", "Total Jobs",
                    "Glassdoor", "JobSite.co.uk", "Remix jobs", "Naukri", "Pracuj"]

# Execute function for the dataframe
df_jobprofile = total_count(df_jobprofile, 'JobPortal', 'count', possible_answers)

# Add column for percentage (share of all counted portal mentions)
df_jobprofile['percent'] = df_jobprofile['count'] * 100 / df_jobprofile['count'].sum()
df_jobprofile.head(100)

In [None]:
# Bar chart of the 'percent' column per job portal, highest count first
(
    df_jobprofile
    .sort_values(by='count', ascending=False)
    .plot.bar(x='JobPortal', y='percent', legend=False)
);

In [None]:
# Keep only the participants whose 'Country' is Germany
is_germany = df['Country'].eq('Germany')
df_germany = df.loc[is_germany]

# Sanity check: only 'Germany' should be left in the 'Country' column
df_germany['Country'].value_counts()

In [None]:
# Same approach as above: list all distinct answers for 'JobProfile' (Germany
# only) together with how often each occurs.
# rename_axis()/reset_index(name=...) is used instead of renaming 'index' and
# 'JobProfile' afterwards, because the column names produced by
# value_counts().reset_index() changed in pandas 2.0; this form yields
# ['JobPortal', 'count'] on both old and new versions.
df_job_germany = (
    df_germany['JobProfile']
    .value_counts()
    .rename_axis('JobPortal')
    .reset_index(name='count')
)
df_job_germany.head(100)


In [None]:
# As for all countries: break multi-portal answers apart with total_count

# Aggregate the German answers per job portal
df_job_germany = total_count(
    df=df_job_germany,
    col1='JobPortal',
    col2='count',
    look_for=possible_answers,
)

# Each portal's count as a percentage of the summed counts
df_job_germany['percent'] = 100 * df_job_germany['count'] / df_job_germany['count'].sum()
df_job_germany.head(100)

In [None]:
# Keep the first five rows for Germany and draw them as a bar chart
df_job_germany = df_job_germany.head(5).copy()
(
    df_job_germany
    .sort_values(by='count', ascending=False)
    .plot.bar(x='JobPortal', y='count', figsize=(15, 5), legend=False, color="blue")
);
plt.title('TOP 5 Job Portals (Germany)', fontsize=15, color='black');

In [None]:
# Mirror the Germany chart: keep the first five rows for all countries as well
df_jobprofile = df_jobprofile.head(5).copy()
(
    df_jobprofile
    .sort_values(by='count', ascending=False)
    .plot.bar(x='JobPortal', y='count', figsize=(15, 5), legend=False, color="blue")
)
plt.title('TOP 5 Job Portals (all countries)', fontsize=15, color='black');

## Findings
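* TODO: Summarize which job portals are the most popular across all countries.
* TODO: Summarize which job portals are the most popular among respondents in Germany, and how this differs from the overall ranking.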