In [8]:
# include visualization modules
import numpy as np
import pandas as pd
import cufflinks as cf
import matplotlib.pyplot as plt
import pandas_bokeh as pb
import geopandas as gpd
import warnings
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
import plotly.graph_objects as go
from bokeh.models import BasicTickFormatter
from bokeh.io import show
# Route cufflinks' DataFrame.iplot() through plotly's offline mode.
cf.go_offline()
# `connected` is a boolean flag. The previous string 'true' only worked because
# any non-empty string is truthy (so 'false' would have enabled it as well).
init_notebook_mode(connected=True)
# Render pandas-bokeh figures inline in the notebook output cells.
pb.output_notebook()
# NOTE(review): this hides every warning for the rest of the session, including
# deprecation warnings from pandas/bokeh/plotly. Consider narrowing the filter
# (e.g. by category or module) once the noisy sources are known.
warnings.filterwarnings("ignore")

In [9]:
# Load the per-city software developer income/expense table used by every query below.
# The file name spelling ("Sofware") is kept exactly as referenced originally;
# presumably it matches the file on disk.
salary_csv_path = "./Salary/SofwareDeveloperIncomeExpensesperUSACity.csv"
salary_data = pd.read_csv(salary_csv_path)


In [10]:
# Query 1: (Attributes relationship mining) Query of the relationship between house price and salary
median_home_price = salary_data['Median Home Price']
sde_salary = salary_data['Mean Software Developer Salary (unadjusted)']
all_occupations_salary = salary_data['Mean Unadjusted Salary (all occupations)']

# Column-wise operations replace the former row-by-row loop. The list/dict
# names are kept so any other cell that reads them keeps working.
home_price = median_home_price.tolist()
salary_value = sde_salary.tolist()
# Home price divided by salary for each city, once for software developers
# and once for all occupations.
time_dict = {
    'SDE Year Cost': (median_home_price / sde_salary).tolist(),
    'All Year Cost': (median_home_price / all_occupations_salary).tolist(),
}

q1_dataframe = pd.DataFrame({
    'home_price': home_price,
    'salary_value': salary_value,
})
# Scatter of home price against developer salary, one point per row of the CSV.
# The title previously read "Average room price statistics for some Euro cities",
# which did not describe the data plotted here.
p1 = q1_dataframe.plot_bokeh.scatter(
    x='salary_value',
    y=['home_price'],
    xlabel='Average Salary',
    ylabel='Home Price',
    title='Median home price vs. average software developer salary in US cities',
)

# Columns are ordered alphabetically ('All Year Cost', then 'SDE Year Cost'),
# matching the order the sorted() call produced before.
q1_2_dataframe = pd.DataFrame(time_dict).sort_index(axis=1)
p1_2 = q1_2_dataframe.iplot(kind="box", title="Time cost distribution for employees to buy a house")

In [11]:
# Query 2: (Livelihood level ranking) Query of the money saving speed of programmers in America
developer_salary = salary_data['Mean Software Developer Salary (unadjusted)']
# The cost column is multiplied by 12 before being compared with the salary;
# presumably it is a monthly figure and the salary is yearly - TODO confirm
# against the dataset description.
yearly_cost = salary_data['Cost of Living Plus Rent avg'] * 12

# Column-wise operations replace the former row-by-row loop. The list names
# are kept so any other cell that reads them keeps working.
cities = salary_data['City'].tolist()
salarys = developer_salary.tolist()
costs = yearly_cost.tolist()
savings = (developer_salary - yearly_cost).tolist()

q2_dataframe = pd.DataFrame({
    'cities': cities,
    'salarys': salarys,
    'costs': costs,
    'savings': savings,
})
# Horizontal bars for the 30 rows with the largest savings.
# Title typo fixed: "Ameriva" -> "America".
p2 = q2_dataframe.sort_values(by='savings', ascending=False).head(30).plot_bokeh(
    kind='barh',
    x='cities',
    y=['salarys', 'costs', 'savings'],
    xlabel='Money Amount',
    ylabel='Salary Metrics',
    title='Average Money Saving Circumstance for SDEs among cities in America',
)

In [12]:
# Switch the first x-axis of the Query 2 figure to non-scientific tick labels,
# then render the figure again with the new formatter applied.
plain_number_ticks = BasicTickFormatter(use_scientific=False)
money_axis = p2.xaxis[0]
money_axis.formatter = plain_number_ticks
show(p2)

In [13]:
# Query 3: (Geometry related data distribution) Query of the number of occupation and salary provided in different states of America
# The state is taken as the text after the last comma of the 'City' value.
state_of_row = salary_data['City'].str.split(',').str[-1].str.strip()
jobs_of_row = salary_data['Number of Software Developer Jobs']
# Salary multiplied by job count, so that dividing the per-state sum by the
# per-state job count yields a job-weighted mean salary.
payroll_of_row = salary_data['Mean Software Developer Salary (unadjusted)'] * jobs_of_row

# groupby replaces the hand-rolled dict accumulation. sort=False keeps states
# in order of first appearance, like the original insertion-ordered dicts.
# NOTE: rows with a missing 'City' or job count are skipped by groupby/sum
# instead of raising or propagating NaN as the loop did.
jobs_by_state = jobs_of_row.groupby(state_of_row, sort=False).sum()
payroll_by_state = payroll_of_row.groupby(state_of_row, sort=False).sum()
mean_salary_by_state = payroll_by_state / jobs_by_state

# Legacy containers, kept so any other cell that reads them keeps working.
city_dict = jobs_by_state.to_dict()
salary_dict = payroll_by_state.to_dict()
state_list = jobs_by_state.index.tolist()
count_list = jobs_by_state.tolist()
salary_list = mean_salary_by_state.tolist()

# One row per state: total developer jobs and job-weighted mean salary.
df = pd.DataFrame({
    'state_list': state_list,
    'count_list': count_list,
    'salary_list': salary_list,
})

In [14]:
# Choropleth of the summed software developer job count per state.
# NOTE(review): locationmode 'USA-states' presumably needs two-letter state
# codes in df['state_list'] - confirm against the 'City' values in the CSV.
jobs_trace = go.Choropleth(
    locationmode='USA-states',
    locations=df['state_list'],
    z=df['count_list'].astype(int),
    colorscale='Greens',
    colorbar_title="Occupation Count",
)
fig1 = go.Figure(data=jobs_trace)

jobs_layout = dict(
    title_text='Number of Software Developer Jobs of America',
    geo_scope='usa',
    width=800,
)
# Kept as the cell's final expression so the notebook displays its return value.
fig1.update_layout(**jobs_layout)

In [15]:
# Choropleth of the job-weighted mean software developer salary per state.
# NOTE(review): locationmode 'USA-states' presumably needs two-letter state
# codes in df['state_list'] - confirm against the 'City' values in the CSV.
salary_trace = go.Choropleth(
    locationmode='USA-states',
    locations=df['state_list'],
    z=df['salary_list'].astype(float),
    colorscale='Purples',
    colorbar_title="Salary Level",
)
fig2 = go.Figure(data=salary_trace)

salary_layout = dict(
    title_text='Average salary of SDEs of America',
    geo_scope='usa',
    width=800,
)
# Kept as the cell's final expression so the notebook displays its return value.
fig2.update_layout(**salary_layout)