In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
import plotly.express as px
import squarify

# Helper function that formats axis tick values in millions
def format_y_tick(value: float, pos) -> str:
    """Render a tick value in millions, e.g. 5000000 -> '5M'.

    Args:
        value: The raw tick value.
        pos: Tick position; accepted but not used.

    Returns:
        ``value`` divided by one million, formatted with no decimal
        places and suffixed with 'M'.
    """
    millions = value / 1_000_000
    return f'{millions:.0f}M'

In [4]:
# Read the three source tables from CSV files located in the working directory.
firm_employment_size, firm_statistics, years_in_business = (
    pd.read_csv(csv_name)
    for csv_name in (
        'firm_employment_size.csv',
        'firms_statistics.csv',
        'years_in_business.csv',
    )
)

# Cleaning null and non-uniform values from columns




In [5]:
# Remove the identifier, code and group-label columns in a single pass.
firm_statistics = firm_statistics.drop(
    columns=[
        'GEO_ID', 'SEX', 'NAICS2017', 'ETH_GROUP', 'RACE_GROUP', 'VET_GROUP', 'YEAR',
        'VET_GROUP_LABEL',
        'ETH_GROUP_LABEL', 'RACE_GROUP_LABEL',
        'SEX_LABEL',
    ]
)
# De-duplicate while NAME is still present (so it takes part in the row
# comparison) and only afterwards discard NAME.
firm_statistics = firm_statistics.drop_duplicates().drop(columns='NAME')


# Note on the column titles used from here on
# ===============================
# firm_statistics columns =  == NAICS2017_LABEL == FIRMPDEMP == RCPPDEMP == EMP == PAYANN

# working_df points at the same DataFrame object as firm_statistics (no copy is made).
working_df = firm_statistics



# Here I'm finding the rows I'll need to work with


In [6]:
# Record the first position at which each NAICS2017_LABEL value appears.
# A dict keeps insertion order, so labels stay in order of first appearance,
# and setdefault() leaves the stored position untouched on repeat sightings.
first_seen = {}
for position, label in enumerate(firm_statistics.NAICS2017_LABEL):
    first_seen.setdefault(label, position)

rows_to_keep = list(first_seen)       # distinct labels
indicies = list(first_seen.values())  # enumerate() positions (0-based), not index labels


# Print each (label, position) pair for a visual check.
a_ver = zip(rows_to_keep, indicies)
for line in a_ver:
    print(line)

('Meaning of NAICS code', 0)
('Total for all sectors', 1)
('Agriculture, forestry, fishing and hunting(660)', 246)
('Mining, quarrying, and oil and gas extraction', 402)
('Utilities', 580)
('Construction', 730)
('Manufacturing', 953)
('Wholesale trade', 1181)
('Retail trade', 1394)
('Transportation and warehousing(661)', 1614)
('Information', 1835)
('Finance and insurance(662)', 2042)
('Real estate and rental and leasing', 2239)
('Professional, scientific, and technical services', 2448)
('Management of companies and enterprises', 2679)
('Administrative and support and waste management and remediation services', 2850)
('Educational services', 3063)
('Health care and social assistance', 3253)
('Arts, entertainment, and recreation', 3472)
('Accommodation and food services', 3661)
('Other services (except public administration)(663)', 3876)
('Industries not classified', 4087)


# Here is the Magic!

I created a new dataframe where I pulled each row and its values based on industry. This will make it easier to create some charts.

In [7]:
# Build one row per industry from the first occurrence of each label.
#
# indicies[0] is skipped: per the listing printed above it is the
# 'Meaning of NAICS code' description row, not an industry.
records = []
for position in indicies[1:]:
    # indicies holds enumerate() positions, so the lookup must be positional
    # (.iloc); a label lookup (.loc) only matches when the index has no gaps.
    row = firm_statistics.iloc[position]
    # Keep only the string-valued cells of the row.
    text_values = [value for value in row if isinstance(value, str)]
    # Discard the third string so the remaining values line up with the four
    # output columns (presumably this is RCPPDEMP, going by the column note
    # in the cleanup cell — TODO confirm).
    text_values.pop(2)
    records.append(text_values)

# Create the frame in one go instead of appending row by row.
working_df = pd.DataFrame(
    records,
    columns=['NAICS2017_LABEL', 'FIRMPDEMP', 'EMP', 'PAYANN'],
)


working_df


Unnamed: 0,NAICS2017_LABEL,FIRMPDEMP,EMP,PAYANN
0,Total for all sectors,5771292,128898226,7227585564
1,"Agriculture, forestry, fishing and hunting(660)",27208,472472,12052120
2,"Mining, quarrying, and oil and gas extraction",18874,614422,60364863
3,Utilities,5995,642237,72052468
4,Construction,731108,7216259,470583419
5,Manufacturing,244297,11985574,735297352
6,Wholesale trade,291450,6352296,472657018
7,Retail trade,633160,15743993,470642268
8,Transportation and warehousing(661),197025,5385468,279568169
9,Information,82279,3463102,393115627


# Verifying the data in the rows is correct

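By running the code below we can check whether the total equals the sum of the individual industries.
It turns out the two differ by only 1 (in PAYANN's reporting units): the industries add up to 1 more than the reported total.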

In [8]:
# Take the first row's PAYANN (the overall total) and subtract every other
# row's PAYANN from it; the printed remainder is 0 when they agree exactly.
payroll = working_df['PAYANN']
x = int(payroll[0]) - sum(int(amount) for amount in payroll[1:])
print(x)

-1


# Converting Datatypes

In [9]:
# Cast the two count columns to int and the payroll column to float in one call.
working_df = working_df.astype({'EMP': int, 'FIRMPDEMP': int, 'PAYANN': float})

# Adjusting Values to fit better in the charts 

# Annual Salary Payout by Industry

In [84]:
# Drop row 0 ('Total for all sectors' in the table shown earlier) so the
# treemap only contains individual industries.
new_df = working_df.drop(0)

# One tile per industry, sized by PAYANN.
fig = px.treemap(
    new_df,
    path=['NAICS2017_LABEL'],
    values='PAYANN',
    color='NAICS2017_LABEL',
    color_discrete_sequence=px.colors.qualitative.Dark2,
    width=1600,
    height=800,
    title='Employee Payout By Industry',
)

fig.update_layout(
    margin=dict(l=100, r=100, t=40, b=100),
    font=dict(size=14),
    legend=dict(orientation="v", yanchor="bottom", y=1.5, xanchor="right", x=2),
    uniformtext=dict(minsize=20),
    plot_bgcolor='white',
)

# NOTE(review): the pay-per-employee cell later multiplies PAYANN / EMP by
# 1000, which suggests PAYANN is expressed in thousands of dollars; if so,
# the "$" figure in this hover label understates the amount — confirm units.
fig.update_traces(
    hovertemplate='<b>%{label}</b><br>Annual Employment Payout: $%{value:,.0f}',
    # Show the label and its value on each tile.
    textinfo='label+text+value',
)
fig.show()



# Number of employees by Industry

In [85]:
# Drop row 0 ('Total for all sectors' in the table shown earlier) so the
# treemap only contains individual industries.
new_df = working_df.drop(0)

# One tile per industry, sized by EMP.
fig = px.treemap(
    new_df,
    path=['NAICS2017_LABEL'],
    values='EMP',
    color='NAICS2017_LABEL',
    color_discrete_sequence=px.colors.qualitative.Dark2,
    width=1600,
    height=800,
    title='Employees per Industry',
)

fig.update_layout(
    margin=dict(l=100, r=100, t=40, b=100),
    font=dict(size=14),
    legend=dict(orientation="v", yanchor="bottom", y=1.5, xanchor="right", x=2),
    uniformtext=dict(minsize=20),
    plot_bgcolor='white',
)

fig.update_traces(
    hovertemplate='<b>%{label}</b><br>Total Employees: %{value:,.0f}',
    # Show the label and its value on each tile.
    textinfo='label+text+value',
)
fig.show()



# Pay Per Employee
Amount averaged per employee

In [111]:
# PAYANN divided by EMP for every row; stored on working_df as a new column.
working_df['Pay_per_Employee'] = working_df['PAYANN'] / working_df['EMP']

# Mean per industry label, highest first, then scaled by 1000.
# Selecting the column before .mean() avoids averaging columns that would
# be discarded straight away.
new_df = (
    working_df
    .groupby('NAICS2017_LABEL')['Pay_per_Employee']
    .mean()
    .sort_values(ascending=False)
    .mul(1000)
    .to_frame()
)

fig = px.bar(new_df,
             x='Pay_per_Employee',
             y=new_df.index,
             orientation='h',
             title='Average Pay per Employee by Industry',
             color='Pay_per_Employee',
             color_continuous_scale='Earth')
fig.update_layout(xaxis_title='Average Pay per Employee ($)',
                  yaxis_title='Industry',
                  yaxis_categoryorder='total ascending',
                  margin=dict(l=100, r=100, t=40, b=100))
fig.update_coloraxes(colorbar=dict(tickformat=".0f"))

# The plotted x value has already been multiplied by 1000, and the bar
# annotations below divide it by 1000 before appending "K". The hover label
# therefore shows the full value without a "K" suffix so both agree.
fig.update_traces(hovertemplate='<b>%{y}</b><br>Average Pay per Employee: $%{x:,.0f}')

# Label the end of each bar with the value in thousands.
for row in new_df.itertuples():
    fig.add_annotation(x=row.Pay_per_Employee,
                       y=row.Index,
                       text=f'{row.Pay_per_Employee/1000:,.0f}K',
                       showarrow=False,
                       xshift=15)

fig.show()
