In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels
import pickle
import plotly.express as px
import plotly.offline as py
import plotly.graph_objs as go

# clean_national_data = pd.read_pickle(".\\Cleaned\\clean_national_data.pkl")
# clean_state_data = pd.read_pickle(".\\Cleaned\\clean_state_data.pkl")
# work_stop = pd.read_pickle(".\\PrelimEDA\\work_stop.pkl")
# minwagestate = pd.read_pickle(".\\PrelimEDA\\minwagestate.pkl")
# finalfull = pd.read_pickle(".\\PrelimEDA\\finalfull.pkl")
# final_earnings = pd.read_pickle(".\\PrelimEDA\\final_earnings.pkl")

# otherStates = {k:'Other' for k in ['East Coast States', 'Nationwide', 'Interstate']}
# industryCounts = work_stop['iTitle'].value_counts().reset_index().rename({'index': 'Industry', 'iTitle': 'Counts'}, axis = 1)
# stateCounts = pd.Series(np.concatenate(work_stop['States'])).str.strip().replace(otherStates)
# stateCounts = pd.Series(np.where(stateCounts == "", None, stateCounts)).value_counts().reset_index().rename({'index': 'State', 0: 'Counts'}, axis = 1)
#  #Need to look and see what they mean by nationwide, east coast states, and interstate

# earnInd = final_earnings.groupby(['industry_name'])['AvgWeeklyEarnings'].mean().sort_values(ascending = False)
# earnInd = earnInd.reset_index()

# earnState = final_earnings.groupby(['StateCode'])['AvgWeeklyEarnings'].mean().sort_values(ascending = False)
# earnState = earnState.reset_index()


# Part 1 | Data Cleaning and Transformation:

### Generalized Data Cleaning for Ease of Use:

In [4]:
#Show string columns in full; by default pandas truncates long strings when displaying.
#Old versions of pandas may need -1 instead of None.
pd.options.display.max_colwidth = None


#Amount of wage data (in days, roughly six months) that must exist before a work
#stoppage begins. The matching cells below subtract this window from the stoppage
#start date and compare the result against the first available observation.
#NOTE(review): an earlier version of this note also promised six months of data
#after the stoppage ended; no cell visible here checks that -- confirm.
time_window = pd.Timedelta(days=180)


#When True, the matching cells print a status message for each work stoppage.
be_verbose = True

In [7]:
#This cell loads in all the data.
#The cleaned data is stored later, so this doesn't need to be rerun unless we're improving the data.
#
#Every path and regex below is a raw string. Sequences such as "\W", "\C" and "\s"
#are not valid Python escapes, so the non-raw spelling only worked by accident and
#triggers an invalid-escape warning (a SyntaxWarning on Python 3.12+). The raw
#strings evaluate to exactly the same text.
#NOTE(review): the paths use Windows separators, like the other cells of this notebook.


#This excel file contains data on each work stoppage.
#Industry is by 2017 NAICS code. 
#This data is from 1988 to 2020.
#We changed the xlsx file to a xls file because of compatibility issues with pandas reading a xlsx file with xlrd.
#This file is originally from https://www.bls.gov/web/wkstp/monthly-listing.xlsx
work_stoppage_df = pd.read_excel(r".\WorkStoppage\work_stop_monthly.xls", 
    header=1, skipfooter=6, dtype={"Industry code[1]":int}   )
#There is an entry or two with the states list empty, we replace the NaN value with an empty string.
#NOTE(review): this fills missing values in every column, not only "States"; confirm
#that an empty string is also the desired placeholder for the other columns.
work_stoppage_df = work_stoppage_df.fillna("")

#This text file contains a table with info about each industry type.
#We use it to convert the NAICS industry code of work_stoppage_df to the industry code used 
#in the Current Employment Statistics files. This doesn't give a perfect match up,
#so we have to match many of the entries by hand.
#This file is originally from https://download.bls.gov/pub/time.series/ce/ce.industry
industry_lookup_df = pd.read_csv(r".\CurrentEmploymentStats\ce.industry", sep="\t")

#This text file contains info about each series_id.
#We use it to turn a BLS industry code into a Current Employment Statistic series_id.
#This file is originally from https://download.bls.gov/pub/time.series/ce/ce.series
current_employment_series_df = pd.read_csv(r".\CurrentEmploymentStats\ce.series.txt", sep="\t", header=0,
    names=['series_id', 'supersector_code', 'industry_code',
       'data_type_code', 'seasonal', 'series_title', 'footnote_codes',
       'begin_year', 'begin_period', 'end_year', 'end_period'],
     converters={'series_id':str.strip} )
#The header=0 and names is to fix some white space issues with the column names.
#The converter is to fix white space issues with the series_id values.
#We restrict this data set to only the rows for average weekly earnings of all employees, 
#which is data_type_code 11, and we use the seasonally adjusted data (seasonally adjusted 
#is good for comparing monthly data, whereas unadjusted is good for comparing yearly data).
#Depending on what we do, we might want to switch to the non-adjusted data or even use both.
#Non-adjusted is 'U' instead of 'S'
current_employment_series_df = current_employment_series_df[
    (current_employment_series_df["data_type_code"]==11)
    &(current_employment_series_df["seasonal"]=='S')]

#This text file contains the values for each Current Employment Statistic series.
#This data set is from 1939 to 2021, but not for all series. It is very spotty.
#This file is originally from https://download.bls.gov/pub/time.series/ce/ce.data.0.AllCESSeries
current_employment_statistic_df = pd.read_csv(r".\CurrentEmploymentStats\ce.data.0.AllCESSeries.txt", 
    sep="\t", header=0, 
    names=['series_id', 'year', 'period', 'value','footnote_codes'],
    converters={'series_id':str.strip} )
#The header=0 and names is to fix some white space issues with the column names.
#The converter is to fix white space issues with the series_id values.



#The datasets oe.data.0.Current and oe.data.1.AllData are only for 2020, so we can't use them for much.
#occupation_employment_df = pd.read_csv(r".\OccEmployment\oe.data.0.Current", sep=r"\s+")
#occupation_employment_df1 = pd.read_csv(r".\OccEmployment\oe.data.1.AllData", sep=r"\s+")





#Below are some data frames for state level data.

#This text file is for states_metro_employment_series. It has information about the
#series in the entries of sa.data.0.Current. Unfortunately, the industry data is all 
#over the place with this data set. Using this might require a lot of data matching done by hand,
#it doesn't even look like we can easily pull average wage data for an entire state.
#sep=r"\s+" is the documented replacement for delim_whitespace=True, which is
#deprecated since pandas 2.2.
state_series_df = pd.read_csv(r".\sa.series", sep=r"\s+",
    names= ['series_id', 'state_code', 'area_code', 'industry_code', 'detail_code',
       'data_type_code', 'seasonal', 'benchmark_year', 'begin_year',
       'begin_period', 'end_year', 'end_period'],
      header=None, skiprows=1, index_col=False )              
#We restrict to data_type_code 4, which  is Average Weekly Earnings In Dollars    
state_series_df = state_series_df[ (state_series_df["data_type_code"]==4) ]


#This text file contains the actual data for a given series.
states_metro_employment_stats = pd.read_csv(r".\StateMetroEmployment\sa.data.0.Current.csv", sep=r"\s+")
#This uses SIC code for industry, or so they say. It doesn't look to match the actual SIC codes.
#This isn't currently in use, because of matching the data with the work stoppage data.

In [8]:
#Hand-made lookup from a work stoppage NAICS code to a Current Employment
#Statistics (CE) industry code. It is consulted by the national matching cell
#below whenever ce.industry has no row for the NAICS code. The values were picked
#by comparing 2-6 digit_2017_Codes.xlsx against ce.industry.
#A value of None marks a NAICS code that was looked at but could not be matched.
naics_to_ce_industry = {
    # --- 2- to 4-digit NAICS codes ---
    92: 90922920,
    923: 60541612,
    3152: 32315280,

    # --- 5-digit NAICS codes ---
    21221: 10212200,
    22121: 44221200,
    22131: None,
    23731: 20237300,
    31212: 32329140,
    31523: 32315280,
    32721: 31327200,
    33341: 31333400,
    33421: None,
    33441: 31334400,
    33451: None,
    33612: None,
    33621: 31336200,
    33641: 31336400,
    33651: 31336900,
    48521: 43485500,
    48831: 43488390,
    48849: 43488400,
    49211: 43492100,
    51711: 50517000,
    61111: 65611100,
    61121: 65611200,
    61131: 65611300,
    62111: 65621100,
    62210: 65622100,
    62211: 65622100,
    62311: 65623100,
    71111: 70711190,
    92211: None,
    92214: None,
    92313: None,

    # --- 6-digit NAICS codes ---
    211111: 10211000,
    212112: 10212113,
    212230: 10212200,
    212231: 10212200,
    212234: 10212200,
    221110: 44221110,
    221210: 44221200,
    236000: 20236000,
    236200: 20236200,
    236220: 20236220,
    237310: 20237300,
    237990: 20237000,
    238140: 20238140,
    238160: 20238160,
    238210: 20238210,
    238220: 20238220,
    238320: 20238320,
    238350: 20238350,
    238910: 20238910,
    311313: None,
    311320: None,
    311812: 32311813,
    313312: 32313000,
    315299: 32315280,
    325180: 32325180,
    325221: 32325211,
    325222: 32325211,
    326199: 32326190,
    326210: 32326210,
    326211: 32326210,
    331110: 31331100,
    331111: 31331100,
    331310: 31331300,
    331312: 31331300,
    331513: 31331510,
    332112: 31331400,
    332913: 32326120,
    332992: 31332994,
    333111: None,
    333611: 31333600,
    333618: None,
    333921: None,
    333996: None,
    334290: 31334200,
    334612: None,
    335222: 31335200,
    335224: 31335200,
    335931: 31335930,
    336120: 31336100,
    336212: 31336214,
    336300: 31336300,
    336321: 31336320,
    336322: 31336320,
    336330: 31336330,
    336350: 31336350,
    336360: 31336360,
    336410: 31336400,
    336414: 31336419,
    336510: 31336900,
    336900: None,
    336992: None,
    424410: 41424410,
    441110: 42441110,
    445110: 42445110,
    481111: 43481100,
    482111: None,
    484210: 43484210,
    485110: 43485500,
    485111: 43485500,
    485112: 43482000,
    485113: 43485500,
    485310: 43485310,
    485991: 43485900,
    488190: 43488100,
    488310: 43488390,
    488320: 43488320,
    488330: 43488390,
    488490: 43488400,
    512110: 50512110,
    517110: 50517000,
    524114: 55524110,
    561612: 60561613,
    561720: 60561720,
    561920: 60561920,
    562111: 60562100,
    562219: 60562219,
    611110: 65611100,
    611111: 65611100,
    611210: 65611200,
    611310: 65611300,
    # NOTE(review): every other 62xxx key in this table maps to a 6562xxxx code,
    # but this one maps to 6561xxxx -- possibly a typo; confirm against ce.industry.
    621610: 65611610,
    622110: 65622100,
    622210: 65622200,
    624110: 65624110,
    624410: 65624400,
    721110: 70721110,
    721120: 70721120,
    722510: 70722500,
    921100: None,
    921110: None,
    921111: None,
    921190: None,
}

In [9]:
#This cell goes through the work stoppage data frame and tries to match it up with the CE data
#The data is written to a pickle file, so this does not need to be rerun, unless we're 
#improving the data.

#For each work stoppage:
#    Get the BLS industry code from the work stoppage NAICS code. 
#        This usually fails, so we record the NAICS codes we still need to match.
#    Get the relevant CES series id from the BLS industry code. 
#        This fails some of the time, but I don't think there's anything to be done
#        about it. The data just isn't there.
#    If there is data for the CES series that is from before the work stoppage (at least time_window days), then
#        we record the series id. This we can use to look up whatever data we want.
#        Since this data is at the national level, we don't bother separating by state.
#    The initial run keeps track of the NAICS codes that weren't matched at all and then these
#        are matched later by hand. So on the second run, everything is matched that can be matched.
#        The matches are stored in the dictionary naics_to_ce_industry.
#
#Outputs of this cell:
#    clean_national_data   one row per usable work stoppage (also pickled to disk)
#    naics_codes_to_match  NAICS codes that still have no CE industry code


rows_to_add = []
naics_codes_to_match = []
for index, row in work_stoppage_df.iterrows():
    naics_code = row["Industry code[1]"]
    start_date = row["Work stoppage beginning date"]  
    end_date = row["Work stoppage ending date"]  

    #Did we get an industry code for free from ce.industry?
    industry_code = industry_lookup_df[ str(naics_code)==industry_lookup_df["naics_code"] ]["industry_code"] 
    if len(industry_code)!=0:
        industry_code = industry_code.iloc[0]
    else:
        #Otherwise, do we have a match done by hand? .get() returns None for a code that
        #is not in the dictionary yet, so such a code is reported as "to match" below
        #instead of stopping the whole run with a KeyError.
        industry_code = naics_to_ce_industry.get(naics_code)

    if industry_code is None:
        #Collect these so they can be added to naics_to_ce_industry by hand.
        if be_verbose:
            print(f"Here's a NAICS code we should try to match:{naics_code}")
        naics_codes_to_match.append(naics_code)
        continue

    series_id = current_employment_series_df[ 
        current_employment_series_df["industry_code"]==industry_code]["series_id"]
    if len(series_id)==0:
        if be_verbose:
            print("No series data available for this industry code.")
        continue
    if len(series_id)>1:
        if be_verbose:
            print("Multiple series data available for this industry code. Weird.")
        continue

    series_id = series_id.iloc[0]
    #M13 is the annual average (see the preview cell further down). It is left out so
    #that it can never be mistaken for a thirteenth month when building ce_date.
    wage_data = current_employment_statistic_df[
        (current_employment_statistic_df["series_id"]==series_id)
        &(current_employment_statistic_df["period"]!="M13")]

    #A series can be listed in ce.series without having any monthly observations.
    if len(wage_data)==0:
        if be_verbose:
            print("There is no data available for this series.")
        continue

    #Is there sufficient data from before the work stoppage began?
    #This is controlled by the time_window variable.
    #ce_date is the first day of the earliest month that has an observation.
    ce_year = int(min(wage_data["year"]))
    ce_month = int(min(wage_data[wage_data["year"]==ce_year]["period"])[1:])
    ce_date = pd.Timestamp(year=ce_year,month=ce_month,day=1)
    earlier = start_date-time_window

    #do we got data?
    if earlier >= ce_date:
        if be_verbose:
            print("We have some data to use!")
        organization = row['Organizations involved']                    
        areas = row['Areas']
        ownership = row['Ownership']
        states = row["States"].split(",")
        rows_to_add.append([organization, states,  areas, ownership, naics_code, 
            start_date, end_date, series_id] )
    else:
        if be_verbose:
            print("No data is available before the work stoppage.")

clean_national_data = pd.DataFrame( data=rows_to_add,
    columns=["organization", "states", "areas", "ownership", 
        "naics industry code", "start date", "end date", "series_id"] )

clean_national_data.to_pickle(".\\Cleaned\\clean_national_data.pkl")

No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No series data available for this industry code.
No series data available for this industry code.
No series data available for this industry code.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No series data available for this industry code.
No series data available for this industry code.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work s

No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
Here's a NAICS code we should try to match:33451
No series data available for this industry code.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
Here's a NAICS code we should try to match:33612
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
Here's a NAICS code we should try to match:333618
Here's a NAICS code we should try to match:33421
No series data available for this industry code.
No data is available before the work stoppage.
No series data available for this industr

No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
Here's a NAICS code we should try to match:333921
No data is available before the work stoppage.
No series data available for this industry code.
No series data available for this industry code.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code.
No data is available before the work stoppage.
No series data available for this industry co

We have some data to use!
We have some data to use!
No series data available for this industry code.
We have some data to use!
No series data available for this industry code.
No series data available for this industry code.
We have some data to use!
No series data available for this industry code.
We have some data to use!
No series data available for this industry code.
We have some data to use!
We have some data to use!
We have some data to use!
We have some data to use!
We have some data to use!
No series data available for this industry code.
No series data available for this industry code.
We have some data to use!
We have some data to use!
Here's a NAICS code we should try to match:921100
No series data available for this industry code.
No series data available for this industry code.
No series data available for this industry code.
No series data available for this industry code.
We have some data to use!
We have some data to use!


In [11]:
#Lookup from the numeric state_code used in the SA series file to the two-letter
#abbreviation used in the "States" column of the work stoppage data.
sa_state_code_to_abbr = {
    1: "AL",
    2: "AK",
    4: "AZ",
    5: "AR",
    6: "CA",
    8: "CO",
    9: "CT",
    10: "DE",
    11: "DC",
    12: "FL",
    13: "GA",
    15: "HI",
    16: "ID",
    17: "IL",
    18: "IN",
    19: "IA",
    20: "KS",
    21: "KY",
    22: "LA",
    23: "ME",
    24: "MD",
    25: "MA",
    26: "MI",
    27: "MN",
    28: "MS",
    29: "MO",
    30: "MT",
    31: "NE",
    32: "NV",
    33: "NH",
    34: "NJ",
    35: "NM",
    36: "NY",
    37: "NC",
    38: "ND",
    39: "OH",
    40: "OK",
    41: "OR",
    42: "PA",
    43: "PR",
    44: "RI",
    45: "SC",
    46: "SD",
    47: "TN",
    48: "TX",
    49: "UT",
    50: "VT",
    51: "VA",
    52: "VI",
    53: "WA",
    54: "WV",
    55: "WI",
    56: "WY",
}

In [12]:
#Lookup from a work stoppage NAICS code to the industry code used by the SA
#(state level) data. Every entry was matched by hand.
#A value of None marks a NAICS code for which no SA industry code was chosen;
#the state matching cell reports those codes and skips the work stoppage.
naics_to_sa_industry = {
    # --- 2- to 4-digit NAICS codes ---
    62: None,
    92: None,
    236: 200001,
    237: 215026,
    322: 426002,
    336: 337002,
    517: 548103,
    622: 880603,
    623: 880556,
    923: None,
    2211: None,
    2362: 215403,
    3118: 420503,
    3141: 422002,
    3152: 423002,
    3221: 426156,
    3315: 333203,
    3324: 334103,
    3331: 335303,
    3361: 337114,
    4243: 653026,
    4244: 651403,
    4841: 542103,
    5311: 765002,

    # --- 5-digit NAICS codes ---
    21221: 110002,
    22112: None,
    22121: 549203,
    22131: None,
    23731: 216103,
    23811: 217703,
    23812: 217703,
    23814: 217403,
    23822: 217103,
    23829: 215009,
    23831: None,
    23832: 217203,
    23835: 217503,
    31212: None,
    31523: 423002,
    32721: 332002,
    33341: 650703,
    33421: 548136,
    33422: 336536,
    33441: 336744,
    33451: 338136,
    33593: 336403,
    33612: 337144,
    33621: 337103,
    33632: 337103,
    33641: 337203,
    33651: 337403,
    42482: 651803,
    44111: 655103,
    44511: 654103,
    44812: 656203,
    48412: 542103,
    48521: 541002,
    48831: 544002,
    48832: None,
    48849: None,
    49211: 542103,
    51111: 427103,
    51512: 548303,
    51711: 548103,
    53112: None,
    54111: None,
    54181: 873103,
    56172: None,
    61111: 882103,
    61121: 938224,
    61131: 882203,
    62111: 880103,
    62210: 880603,
    62211: 880603,
    62311: 880503,
    71111: None,
    72111: 870103,
    72112: 870128,
    92211: None,
    92214: None,
    92313: None,

    # --- 6-digit NAICS codes ---
    211111: 113002,
    212112: 112203,
    212230: 110002,
    212231: 110002,
    212234: 110002,
    221110: None,
    221112: None,
    221122: 549103,
    221210: 549203,
    236000: 215002,
    236200: 215403,
    236220: 215403,
    237310: 216103,
    237990: 215026,
    238140: 217403,
    238160: 217603,
    238210: 217303,
    238220: 217103,
    238320: 217203,
    238350: 217503,
    238910: None,
    311313: None,
    311320: None,
    311611: 420114,
    311615: 420114,
    311812: 420503,
    312111: 420803,
    313312: 422002,
    315299: 423002,
    321911: None,
    325180: 428103,
    325211: 428203,
    325221: 428203,
    325222: 428203,
    326199: 430803,
    326210: 430103,
    326211: 430103,
    331110: 333124,
    331111: 333124,
    331310: None,
    331312: 333254,
    331513: 333254,
    332112: 334609,
    332913: 650703,
    332992: None,
    333111: 335203,
    333415: 217103,
    333611: 335103,
    333618: 335103,
    333921: None,
    333996: None,
    334290: 336603,
    334612: None,
    335222: 336303,
    335224: 336303,
    335931: 336434,
    336111: 337103,
    336120: 337109,
    336212: 337109,
    336300: 337144,
    336321: 337144,
    336322: 337144,
    336330: 337144,
    336350: 337144,
    336360: 337144,
    336410: 337203,
    336411: 337203,
    336412: 337203,
    336414: 337236,
    336510: 337403,
    336611: 337303,
    336900: 337009,
    336992: None,
    424410: 651414,
    441110: 655303,
    445110: 654103,
    481111: 545103,
    482111: 540002,
    484121: None,
    484122: None,
    484210: None,
    485110: 541002,
    485111: 541002,
    485112: 540002,
    485113: 541002,
    485310: 541002,
    485991: None,
    488190: 545002,
    488310: 544002,
    488320: None,
    488330: None,
    488490: None,
    512110: None,
    517110: 548002,
    517311: 548002,
    524114: 763203,
    561612: None,
    561720: None,
    561920: None,
    562111: None,
    562219: None,
    611110: 882103,
    611111: 882103,
    611210: 938224,
    611310: 882203,
    621111: 880103,
    621491: 763203,
    621610: 880556,
    622110: None,
    622210: None,
    624110: 883503,
    624410: 883503,
    711211: None,
    721110: 870103,
    721120: 870128,
    722510: 658002,
    921100: 939133,
    921110: 939133,
    921111: 939133,
    921190: 949009,
}

In [13]:
#This cell goes through the work stoppage data frame and tries to match it up with the SA data
#The data is written to a pickle file, so this does not need to be rerun, unless we're 
#improving the data.

#For each work stoppage:  
#    Get the SA industry code from the work stoppage NAICS code.
#        These are stored in the dictionary naics_to_sa_industry.
#    Get the relevant SA series ids from the SA industry code. Since this is state level data,
#    we also require that the SA series is for a state appearing in the list of states 
#    for the work stoppage. 
#        This fails some of the time, but I don't think there's anything to be done
#        about it. The data just isn't there.
#        If a work stoppage occurred in multiple states and there is data for multiple states,
#        we record each different state data in a separate row.
#        The translation of an SA state code to a work stoppage state abbreviation is done via
#        the dictionary sa_state_code_to_abbr.
#    If there is data for the SA series that is from before the work stoppage (at least time_window days), then
#        we record the series id. This we can use to look up whatever data we want.
#
#Output of this cell:
#    clean_state_data   one row per (work stoppage, state series) pair (also pickled to disk)


rows_to_add = []
for index, row in work_stoppage_df.iterrows():
    naics_code = row["Industry code[1]"]
    states = row["States"]
    start_date = row["Work stoppage beginning date"]  
    end_date = row["Work stoppage ending date"]  

    #.get() returns None for a NAICS code that is not in the dictionary yet, so such a
    #code is reported below instead of stopping the whole run with a KeyError.
    industry_code = naics_to_sa_industry.get(naics_code)
    if industry_code is None:
        if be_verbose:
            print(f"Here's a NAICS code we could try to match:{naics_code}")
        continue

    #Narrow down to this industry with one vectorized comparison, then test the state
    #only on the few remaining rows. This selects the same rows as applying a Python
    #function to every row of state_series_df, but without the per-row overhead.
    industry_series = state_series_df[state_series_df["industry_code"]==industry_code]
    #"states" is the raw string from the spreadsheet, so this is a substring test.
    in_listed_state = np.array(
        [sa_state_code_to_abbr[code] in states for code in industry_series["state_code"]],
        dtype=bool)
    matching_series = industry_series[in_listed_state]

    if len(matching_series)==0:
        if be_verbose:
            print("No series data available for this industry code in the relevant states.")
        continue

    for series_id, state_code in zip(matching_series["series_id"], matching_series["state_code"]):
        #M13 is the annual average (see the preview cell further down). It is left out so
        #that it can never be mistaken for a thirteenth month when building ce_date.
        wage_data = states_metro_employment_stats[
            (states_metro_employment_stats["series_id"]==series_id)
            &(states_metro_employment_stats["period"]!="M13")]

        #Sometimes a valid series_id does not have any data.
        if len(wage_data)==0:
            if be_verbose:
                print("There is no data available for this series.")
            continue

        #Is there sufficient data from before the work stoppage began?
        #This is controlled by the time_window variable.
        #ce_date is the first day of the earliest month that has an observation.
        ce_year = int(min(wage_data["year"]))
        ce_month = int(min(wage_data[wage_data["year"]==ce_year]["period"])[1:])
        ce_date = pd.Timestamp(year=ce_year,month=ce_month,day=1)
        earlier = start_date-time_window

        #do we got data?
        if earlier >= ce_date:
            if be_verbose:
                print("We have some data to use!")
            organization = row['Organizations involved']                    
            areas = row['Areas']
            ownership = row['Ownership']
            #The state recorded is the state of the matched series, not the full list.
            state = sa_state_code_to_abbr[state_code]
            rows_to_add.append([organization, state, areas, ownership, naics_code, 
                start_date, end_date, series_id] )
        else:
            if be_verbose:
                print("No data is available before the work stoppage.")

clean_state_data = pd.DataFrame( data=rows_to_add,
    columns=["organization", "state", "areas", "ownership", 
        "naics industry code", "start date", "end date", "series_id"] )

clean_state_data.to_pickle(".\\Cleaned\\clean_state_data.pkl")

Here's a NAICS code we could try to match:488330
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
There is no data available for this series.
There is no data available for this series.
There is no data available for this series.
There is no data available for this series.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No data is available before the work stoppage.
There is no data available for this series.
No data is available before the work st

There is no data available for this series.
No series data available for this industry code in the relevant states.
No data is available before the work stoppage.
No series data available for this industry code in the relevant states.
No data is available before the work stoppage.
No data is available before the work stoppage.
No data is available before the work stoppage.
No series data available for this industry code in the relevant states.
Here's a NAICS code we could try to match:321911
No series data available for this industry code in the relevant states.
Here's a NAICS code we could try to match:92
No series data available for this industry code in the relevant states.
No data is available before the work stoppage.
Here's a NAICS code we could try to match:54111
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No 

We have some data to use!
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
We have some data to use!
There is no data available for this series.
We have some data to use!
We have some data to use!
No series data available for this industry code in the relevant states.
We have some data to use!
Here's a NAICS code we could try to match:31212
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the rel

We have some data to use!
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
Here's a NAICS code we could try to match:562219
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
We have some data to use!
We have some data to use!
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
We have some data to use!
No series data available for this industry code in the 

We have some data to use!
There is no data available for this series.
Here's a NAICS code we could try to match:622110
Here's a NAICS code we could try to match:622110
Here's a NAICS code we could try to match:622110
No series data available for this industry code in the relevant states.
We have some data to use!
Here's a NAICS code we could try to match:622110
We have some data to use!
We have some data to use!
Here's a NAICS code we could try to match:622110
No series data available for this industry code in the relevant states.
Here's a NAICS code we could try to match:622110
We have some data to use!
We have some data to use!
No series data available for this industry code in the relevant states.
Here's a NAICS code we could try to match:622110
No series data available for this industry code in the relevant states.
No series data available for this industry code in the relevant states.
Here's a NAICS code we could try to match:622110
No series data available for this industry code 

In [14]:
#Run this if we're just loading the cleaned data instead of loading and cleaning from scratch.
#Both files are written by the two matching cells above.
national_pickle_path = ".\\Cleaned\\clean_national_data.pkl"
state_pickle_path = ".\\Cleaned\\clean_state_data.pkl"

clean_national_data = pd.read_pickle(national_pickle_path)
clean_state_data = pd.read_pickle(state_pickle_path)

In [15]:
#Let's see what we can do with this data now:
#Show the head of the cleaned national work stoppages, then, for each of the first five,
#the monthly rows of its matched series that fall within time_window of the start date.
display(clean_national_data.head())
print(f"There are {len(clean_national_data)} rows to consider. Let's see the first 5\n\n")

#head(5) rather than iloc[0..4] so a table with fewer than five rows does not raise.
for _, row in clean_national_data.head(5).iterrows():
    start_date = row['start date']
    series_id = row["series_id"]

    #Narrow to this stoppage's series first, then drop M13 (the annual average).
    series = current_employment_statistic_df[
        current_employment_statistic_df["series_id"] == series_id]
    series = series[series["period"] != "M13"]

    #NOTE(review): the window is centred on the start date only; the end date is not used
    #here. Confirm that is intended for long stoppages.
    earlier = start_date - time_window
    later = start_date + time_window

    #First day of each observation's month ("M05" -> month 5), built once for the whole
    #series instead of running a row-wise apply twice.
    month_start = pd.to_datetime(pd.DataFrame({
        "year": series["year"].astype(int),
        "month": series["period"].str[1:].astype(int),
        "day": 1}))
    data = series[month_start.between(earlier, later)]  #both bounds inclusive

    print(f"The work stoppage at {row['organization']} started on {row['start date']} "
        + f"and ended on {row['end date']}. The associated wage data is as follows.")
    display(data)
    print("\n\n")

Unnamed: 0,organization,states,areas,ownership,naics industry code,start date,end date,series_id
0,Bombardier Learjet,[KS],Wichita,Private industry,336411,2006-10-02,2006-10-23 00:00:00,CES3133641111
1,"ABM Janitorial Services, GCA Services Group, Sanitors Services of Texas, Pritchard Industries Southwest",[TX],Houston,Private industry,561720,2006-10-23,2006-11-20 00:00:00,CES6056172011
2,National Fire Sprinkler Association,[Nationwide],,Private industry,238220,2007-04-01,2007-04-16 00:00:00,CES2023822011
3,"Associated Wall and Ceiling Contractors of Oregon and Southwest Washington, Inc.","[OR, WA]",Multiple states,Private industry,236220,2007-06-01,2007-06-19 00:00:00,CES2023622011
4,National Electrical Contractors of America,[WA],Puget Sound,Private industry,238210,2007-06-01,2007-06-01 00:00:00,CES2023821011


There are 111 rows to consider. Let's see the first 5


The work stoppage at Bombardier Learjet started on 2006-10-02 00:00:00 and ended on 2006-10-23 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
890215,CES3133641111,2006,M05,1292.05,
890216,CES3133641111,2006,M06,1284.65,
890217,CES3133641111,2006,M07,1330.34,
890218,CES3133641111,2006,M08,1350.22,
890219,CES3133641111,2006,M09,1323.73,
890220,CES3133641111,2006,M10,1337.55,
890221,CES3133641111,2006,M11,1375.03,
890222,CES3133641111,2006,M12,1394.96,
890223,CES3133641111,2007,M01,1422.1,
890224,CES3133641111,2007,M02,1428.85,





The work stoppage at ABM Janitorial Services, GCA Services Group, Sanitors Services of Texas, Pritchard Industries Southwest started on 2006-10-23 00:00:00 and ended on 2006-11-20 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
2960996,CES6056172011,2006,M05,341.89,
2960997,CES6056172011,2006,M06,337.27,
2960998,CES6056172011,2006,M07,337.84,
2960999,CES6056172011,2006,M08,339.3,
2961000,CES6056172011,2006,M09,342.81,
2961001,CES6056172011,2006,M10,344.56,
2961002,CES6056172011,2006,M11,343.04,
2961003,CES6056172011,2006,M12,353.8,
2961004,CES6056172011,2007,M01,346.51,
2961005,CES6056172011,2007,M02,348.53,





The work stoppage at National Fire Sprinkler Association started on 2007-04-01 00:00:00 and ended on 2007-04-16 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
303796,CES2023822011,2006,M11,836.26,
303797,CES2023822011,2006,M12,838.5,
303798,CES2023822011,2007,M01,844.49,
303799,CES2023822011,2007,M02,847.48,
303800,CES2023822011,2007,M03,850.88,
303801,CES2023822011,2007,M04,856.09,
303802,CES2023822011,2007,M05,864.75,
303803,CES2023822011,2007,M06,871.5,
303804,CES2023822011,2007,M07,871.13,
303805,CES2023822011,2007,M08,879.92,





The work stoppage at Associated Wall and Ceiling Contractors of Oregon and Southwest Washington, Inc. started on 2007-06-01 00:00:00 and ended on 2007-06-19 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
179732,CES2023622011,2007,M01,1026.91,
179733,CES2023622011,2007,M02,1024.78,
179734,CES2023622011,2007,M03,1019.46,
179735,CES2023622011,2007,M04,1040.66,
179736,CES2023622011,2007,M05,1041.94,
179737,CES2023622011,2007,M06,1042.8,
179738,CES2023622011,2007,M07,1042.08,
179739,CES2023622011,2007,M08,1034.45,
179740,CES2023622011,2007,M09,1037.62,
179741,CES2023622011,2007,M10,1039.35,





The work stoppage at National Electrical Contractors of America started on 2007-06-01 00:00:00 and ended on 2007-06-01 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
297648,CES2023821011,2007,M01,917.67,
297649,CES2023821011,2007,M02,915.92,
297650,CES2023821011,2007,M03,923.93,
297651,CES2023821011,2007,M04,923.93,
297652,CES2023821011,2007,M05,935.04,
297653,CES2023821011,2007,M06,934.14,
297654,CES2023821011,2007,M07,933.07,
297655,CES2023821011,2007,M08,939.55,
297656,CES2023821011,2007,M09,944.12,
297657,CES2023821011,2007,M10,953.67,







In [16]:
#Let's see what we can do with this data now:
#Show the head of the cleaned state-level work stoppages, then, for each of the first five,
#the monthly rows of its matched series that fall within time_window of the start date.
display(clean_state_data.head())
print(f"There are {len(clean_state_data)} rows to consider. Let's see the first 5\n\n")

#head(5) rather than iloc[0..4] so a table with fewer than five rows does not raise.
for _, row in clean_state_data.head(5).iterrows():
    start_date = row['start date']
    series_id = row["series_id"]

    #Narrow to this stoppage's series first, then drop M13 (the annual average).
    series = states_metro_employment_stats[
        states_metro_employment_stats["series_id"] == series_id]
    series = series[series["period"] != "M13"]

    #NOTE(review): the window is centred on the start date only; the end date is not used
    #here. Confirm that is intended for long stoppages.
    earlier = start_date - time_window
    later = start_date + time_window

    #First day of each observation's month ("M05" -> month 5), built once for the whole
    #series instead of running a row-wise apply twice.
    month_start = pd.to_datetime(pd.DataFrame({
        "year": series["year"].astype(int),
        "month": series["period"].str[1:].astype(int),
        "day": 1}))
    data = series[month_start.between(earlier, later)]  #both bounds inclusive

    print(f"The work stoppage at {row['organization']} started on {row['start date']} "
        + f"and ended on {row['end date']}. The associated wage data is as follows.")
    display(data)
    print("\n\n")

Unnamed: 0,organization,state,areas,ownership,naics industry code,start date,end date,series_id
0,Detroit Newspapers,MI,Detroit,Private industry,51111,1995-07-13,1997-02-19,SAU2600004271034
1,Detroit Newspapers,MI,Detroit,Private industry,51111,1995-07-13,1997-02-19,SAU2621604271034
2,Boeing Company,KS,Multiple states,Private industry,336411,1995-10-06,1995-12-14,SAU2000003372034
3,General Electric Corp.,PA,Erie,Private industry,336510,1995-10-26,1995-10-30,SAU4200003374034
4,"Chrysler Corp., McGraw Glass Division",MI,Detroit,Private industry,3361,1995-11-01,1995-11-04,SAU2600003371144


There are 113 rows to consider. Let's see the first 5


The work stoppage at Detroit Newspapers started on 1995-07-13 00:00:00 and ended on 1997-02-19 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
1217384,SAU2600004271034,1995,M02,396.14,
1217385,SAU2600004271034,1995,M03,399.67,
1217386,SAU2600004271034,1995,M04,403.51,
1217387,SAU2600004271034,1995,M05,413.45,
1217388,SAU2600004271034,1995,M06,409.9,
1217389,SAU2600004271034,1995,M07,406.56,
1217390,SAU2600004271034,1995,M08,392.47,
1217391,SAU2600004271034,1995,M09,387.64,
1217392,SAU2600004271034,1995,M10,386.97,
1217393,SAU2600004271034,1995,M11,402.12,





The work stoppage at Detroit Newspapers started on 1995-07-13 00:00:00 and ended on 1997-02-19 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
1277512,SAU2621604271034,1995,M02,353.38,
1277513,SAU2621604271034,1995,M03,348.42,
1277514,SAU2621604271034,1995,M04,358.75,
1277515,SAU2621604271034,1995,M05,351.65,
1277516,SAU2621604271034,1995,M06,359.67,
1277517,SAU2621604271034,1995,M07,358.87,
1277518,SAU2621604271034,1995,M08,297.6,
1277519,SAU2621604271034,1995,M09,293.68,
1277520,SAU2621604271034,1995,M10,305.97,
1277521,SAU2621604271034,1995,M11,316.87,





The work stoppage at Boeing Company started on 1995-10-06 00:00:00 and ended on 1995-12-14 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
980645,SAU2000003372034,1995,M05,752.6,
980646,SAU2000003372034,1995,M06,748.78,
980647,SAU2000003372034,1995,M07,712.8,
980648,SAU2000003372034,1995,M08,703.34,
980649,SAU2000003372034,1995,M09,726.97,
980650,SAU2000003372034,1995,M10,460.46,
980651,SAU2000003372034,1995,M11,628.85,
980652,SAU2000003372034,1995,M12,564.2,
980654,SAU2000003372034,1996,M01,808.75,
980655,SAU2000003372034,1996,M02,822.74,





The work stoppage at General Electric Corp. started on 1995-10-26 00:00:00 and ended on 1995-10-30 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
2101097,SAU4200003374034,1995,M05,681.36,
2101098,SAU4200003374034,1995,M06,677.16,
2101099,SAU4200003374034,1995,M07,683.76,
2101100,SAU4200003374034,1995,M08,685.03,
2101101,SAU4200003374034,1995,M09,683.35,
2101102,SAU4200003374034,1995,M10,682.49,
2101103,SAU4200003374034,1995,M11,688.8,
2101104,SAU4200003374034,1995,M12,691.56,
2101106,SAU4200003374034,1996,M01,668.05,
2101107,SAU4200003374034,1996,M02,693.31,





The work stoppage at Chrysler Corp., McGraw Glass Division started on 1995-11-01 00:00:00 and ended on 1995-11-04 00:00:00. The associated wage data is as follows.


Unnamed: 0,series_id,year,period,value,footnote_codes
1207392,SAU2600003371144,1995,M06,1029.13,
1207393,SAU2600003371144,1995,M07,1016.29,
1207394,SAU2600003371144,1995,M08,998.2,
1207395,SAU2600003371144,1995,M09,1055.26,
1207396,SAU2600003371144,1995,M10,1041.3,
1207397,SAU2600003371144,1995,M11,1048.32,
1207398,SAU2600003371144,1995,M12,1051.18,
1207400,SAU2600003371144,1996,M01,967.15,
1207401,SAU2600003371144,1996,M02,1004.39,
1207402,SAU2600003371144,1996,M03,853.16,







### Data Cleaning and Transformations for Visualizations:

In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels
import pickle
import plotly.express as px
from scipy import stats as sts
from scipy.stats import pearsonr
from statsmodels.stats.multicomp import pairwise_tukeyhsd

from pathlib import Path

#Builds every table the visualisation and statistics cells below rely on:
#minwagestate, work_stop, industryCounts, stateCounts, finalfull, final_earnings,
#earnInd, earnState and cormin.

#Project folders, relative to the notebook. Built with pathlib because the previous
#".\\Cleaned\\..." literals are only valid paths on Windows and raise FileNotFoundError elsewhere.
CLEANED_DIR = Path("Cleaned")
PRELIM_EDA_DIR = Path("PrelimEDA")
#Folder holding the raw CSV inputs. Still machine-specific: edit this one line to relocate.
RAW_DATA_DIR = Path("/Users/nyssacornelius/Desktop/COMP4477/FProj/FProj")

#NOTE: unpickling can execute arbitrary code, so only load pickles this project produced.
clean_national_data = pd.read_pickle(CLEANED_DIR / "clean_national_data.pkl")
clean_state_data = pd.read_pickle(CLEANED_DIR / "clean_state_data.pkl")
#NOTE(review): the commented-out to_pickle calls at the bottom of this cell write back to
#this same file; confirm the pickle read here is the un-transformed work stoppage table.
work_stop = pd.read_pickle(PRELIM_EDA_DIR / "work_stop.pkl")

#Minimum wage comes straight from the CSV (the pickled copy that used to be read first
#was overwritten immediately, so that read has been dropped).
minwagestate = pd.read_csv(
    RAW_DATA_DIR / "min_wage_state.csv",
    usecols=["Year", "State", "Federal.Minimum.Wage", "Effective.Minimum.Wage",
             "Effective.Minimum.Wage.2020.Dollars", "CPI.Average"])

#Keep the 50 states only; .copy() so the column assignments below act on an independent frame.
non_state_areas = ['District of Columbia', 'U.S. Virgin Islands', 'Country Of Mexico',
                   'Puerto Rico', 'Guam']
minwagestate = minwagestate.loc[~minwagestate['State'].isin(non_state_areas)].copy()


# CLEAN UP DATA:
    #Minimum wage:
minwagestate['State'] = minwagestate['State'].astype('string')

#Full state name -> two-letter postal abbreviation.
us_state_abbrev = {
    'Alabama': 'AL',
    'Alaska': 'AK',
    'Arizona': 'AZ',
    'Arkansas': 'AR',
    'California': 'CA',
    'Colorado': 'CO',
    'Connecticut': 'CT',
    'Delaware': 'DE',
    'Florida': 'FL',
    'Georgia': 'GA',
    'Hawaii': 'HI',
    'Idaho': 'ID',
    'Illinois': 'IL',
    'Indiana': 'IN',
    'Iowa': 'IA',
    'Kansas': 'KS',
    'Kentucky': 'KY',
    'Louisiana': 'LA',
    'Maine': 'ME',
    'Maryland': 'MD',
    'Massachusetts': 'MA',
    'Michigan': 'MI',
    'Minnesota': 'MN',
    'Mississippi': 'MS',
    'Missouri': 'MO',
    'Montana': 'MT',
    'Nebraska': 'NE',
    'Nevada': 'NV',
    'New Hampshire': 'NH',
    'New Jersey': 'NJ',
    'New Mexico': 'NM',
    'New York': 'NY',
    'North Carolina': 'NC',
    'North Dakota': 'ND',
    'Ohio': 'OH',
    'Oklahoma': 'OK',
    'Oregon': 'OR',
    'Pennsylvania': 'PA',
    'Rhode Island': 'RI',
    'South Carolina': 'SC',
    'South Dakota': 'SD',
    'Tennessee': 'TN',
    'Texas': 'TX',
    'Utah': 'UT',
    'Vermont': 'VT',
    'Virginia': 'VA',
    'Washington': 'WA',
    'West Virginia': 'WV',
    'Wisconsin': 'WI',
    'Wyoming': 'WY'
}

minwagestate['StateCode'] = minwagestate['State'].map(us_state_abbrev)

#Work stoppage:
work_stop = work_stop.rename(columns={
    'Days idle, cumulative for this work stoppage[3]': 'TotalDaysIdle',
    'Industry code[1]': 'IndustryCode',
    'Work stoppage beginning date': 'StartDate',
    'Work stoppage ending date': 'EndDate'})

#Remove the weird [4] and make the column an integer data type
#(anything non-numeric is coerced to missing; Int64 is the nullable integer dtype):
work_stop['TotalDaysIdle'] = pd.to_numeric(work_stop['TotalDaysIdle'], errors='coerce', downcast='integer')
work_stop['TotalDaysIdle'] = work_stop['TotalDaysIdle'].astype('Int64')

#Fix workstop end date (unparseable values become NaT):
work_stop['EndDate'] = pd.to_datetime(work_stop['EndDate'], errors='coerce', format = '%Y-%m-%d')

#Column for duration of work stoppage, in days, counting both the first and the last day.
#The +1 for the inclusive count is applied exactly once; it used to be added a second
#time on the following line, which made every duration one day too long.
work_stop['WSDuration'] = (work_stop['EndDate'] - work_stop['StartDate'])/np.timedelta64(1,'D')+1

#Change states from string to list of strings:
work_stop['States'] = work_stop['States'].str.split(",")

#Minimum wage state data:
# minwagestate.hist(column='Effective.Minimum.Wage')
# minwagestate.boxplot(column='Effective.Minimum.Wage', by = ["State"], rot = 75)


#Work Stoppage state data:
    #Need to quantify data by state using list comprehension as states are in lists:
#Are these two essentially the same? Neither are truly normal, left skewed.
# work_stop.hist(column='WSDuration')
# work_stop.hist(column = "TotalDaysIdle")


#INDUSTRY DATA INFORMATION:
    #Willing to go down to 3-digit NAICS code, if not fruitful then up to only 2-digit NAICS
    #Best file will likely be 2-6 digit NAICS code xlsx file

iCodes = pd.read_csv(RAW_DATA_DIR / '2017NAICS_Codes2_6digit.csv', header=0, usecols=[0,1,2], skiprows=[1])
iCodes = iCodes.rename(columns={'2017 NAICS US   Code': 'NAICS_Code2017', '2017 NAICS US Title': 'IndustryTitle'})

#Remove wonky unicode character (U+FFFD, the replacement character):
iCodes['IndustryTitle'] = iCodes['IndustryTitle'].str.replace('\ufffd', '', regex=False)

#Extract only codes with 2-digits (raw regex strings; na=False keeps missing codes out of the mask):
codes2digit = iCodes.loc[iCodes['NAICS_Code2017'].str.contains(r'^\d{2}$', na=False), ['NAICS_Code2017']].values

#Extract only 3-digit codes:
codes3digit = iCodes.loc[iCodes['NAICS_Code2017'].str.contains(r'^\d{3}$', na=False), ['NAICS_Code2017']].values

#First three digits of every NAICS code:
# iCodes['2digit'] = iCodes['NAICS_Code2017'].str.extract(r'(\d{2})')
iCodes['3digit'] = iCodes['NAICS_Code2017'].str.extract(r'(\d{3})', expand=False)

#Industry code abbreviated for work stop (first three digits):
work_stop['iCodeAb'] = work_stop['IndustryCode'].astype(str).str.extract(r'(\d{3})', expand=False)

#Fill in nans: where no three-digit run was found, fall back to the original code.
no_abbrev = work_stop['iCodeAb'].isnull()
work_stop.loc[no_abbrev, 'iCodeAb'] = work_stop.loc[no_abbrev, 'IndustryCode'].to_numpy()

#Get name from industry code (codes without a match in iCodes are left unchanged):
naics_titles = dict(zip(iCodes['NAICS_Code2017'], iCodes['IndustryTitle']))
work_stop['iTitle'] = work_stop['iCodeAb'].astype(str).replace(naics_titles)


#Industry strikes aggregated by count.
#rename_axis + reset_index(name=...) yields the columns 'Industry' and 'Counts' on every
#pandas version; renaming the default 'index' column stopped working in pandas 2.0.
industryCounts = work_stop['iTitle'].value_counts().rename_axis('Industry').reset_index(name='Counts')

#Frequency of strikes by state:
#One row per (stoppage, state); multi-state groupings are lumped together as 'Other'.
otherStates = {k:'Other' for k in ['East Coast States', 'Nationwide', 'Interstate']}
state_labels = work_stop['States'].explode().reset_index(drop=True).str.strip().replace(otherStates)
#Blank labels are dropped here; missing ones are ignored by value_counts.
stateCounts = state_labels[state_labels != ""].value_counts().rename_axis('State').reset_index(name='Counts')


#Begin State-Metro Employment and Wage Data work:
smdata = pd.read_csv('https://download.bls.gov/pub/time.series/sm/sm.data.1.AllData', sep = '\t')

#Remove whitespace:
smdata.columns = smdata.columns.str.strip()
smdata['series_id'] = smdata['series_id'].str.strip()
smdata['value'] = smdata['value'].astype(str).str.strip()

#The first two digits of the series id are used as the state code; drop the codes
#that are excluded from the state-level analysis.
smdata['state_code'] = smdata['series_id'].str.extract(r'(\d{2})', expand=False)
smdata = smdata.loc[~smdata['state_code'].isin(['00','11','72','78','99'])].copy()

#Create columns for data types in series id (its last two digits):
smdata['data_type'] = smdata['series_id'].str.extract(r'(\d{2}$)', expand=False)

data_types = {'01': 'Employees',
              '11': 'AvgWeeklyEarnings'
              }

smdata = smdata.loc[smdata['data_type'].isin(list(data_types))].copy()
smdata['data_type'] = smdata['data_type'].replace(data_types)

#Convert value to float; a bare '-' marks a missing observation.
smdata['value'] = smdata['value'].mask(smdata['value'] == '-').astype(float)

#Five digits following the first seven of the series id, padded with '000' so they can
#be matched against the 8-character industry_code in sm.industry.
smdata['industry_code'] = smdata['series_id'].str.extract(r'\d{7}(\d{5})', expand=False)+'000'


indCode = pd.read_csv('https://download.bls.gov/pub/time.series/sm/sm.industry', sep = '\t', dtype = {'industry_code': str})

mergedat = pd.merge(smdata, indCode, on = 'industry_code', how = 'left')

#Industry codes without a name in sm.industry get a hand-assigned one.
#NOTE(review): names are paired with codes purely by order of first appearance, so this
#silently mislabels if the download changes; verify the pairing after refreshing the data.
manInd = ['Utilities', 'Transportation and Warehousing', 'Professional, Scientific, and Technical Services',
          'State Government', 'Indian Tribes', 'Logging', 'Educational Services', 'Federal Government',
          'Federal Government', 'Federal Government', 'Local Government']
unnamed_codes = mergedat.loc[mergedat['industry_name'].isnull(), 'industry_code'].unique()
manIndreplace = dict(zip(unnamed_codes, manInd))

#Plain assignment instead of an in-place replace on an attribute-accessed column, which
#is not guaranteed to write through to the frame.
mergedat['industry_code'] = mergedat['industry_code'].replace(manIndreplace)
mergedat['industry_name'] = mergedat['industry_name'].fillna(mergedat['industry_code'])

stateCodes = pd.read_csv('https://download.bls.gov/pub/time.series/sm/sm.state', sep = '\t', dtype = {'state_code': str})

final_data = pd.merge(mergedat, stateCodes, on = 'state_code', how = 'left')
final_data = final_data.drop(['series_id', 'period', 'footnote_codes', 'state_code', 'industry_code'], axis = 1)

#Make pivot table: one row per state/industry/year, one column per data type (yearly mean).
finalfull = final_data.pivot_table(index = ['state_name','industry_name','year'], columns = 'data_type', values = 'value', aggfunc = 'mean').reset_index()
finalfull.columns = finalfull.columns.str.strip()

#Same table restricted to rows with no missing values.
final_earnings = finalfull.dropna().reset_index(drop = True)

final_earnings['StateCode'] = final_earnings['state_name'].map(us_state_abbrev)
finalfull['StateCode'] = finalfull['state_name'].map(us_state_abbrev)


#Need to group by industry and state, respectively and take the average over all years since 2007
earnInd = final_earnings.groupby(['industry_name'])['AvgWeeklyEarnings'].mean().sort_values(ascending = False)
earnInd = earnInd.reset_index()

earnState = final_earnings.groupby(['StateCode'])['AvgWeeklyEarnings'].mean().sort_values(ascending = False)
earnState = earnState.reset_index()

#Education and Health services have two of the highest rates of strike - in mid-low range of average weekly earnings
#However, this could be skewed by professionals in health industry that make a great deal more
#Can't know from this data
#Leisure is lowest average weekly earnings


#Spearman correlation between the numeric minimum wage columns. The string columns are
#excluded explicitly because pandas >= 2.0 raises on them instead of skipping them.
cormin = minwagestate.select_dtypes(include='number').corr(method='spearman')
#years increase eff min in 2020 $ decreases, obviously CPI increases almost 1:1
#Min wage 2020 $s decreases as years increase
#Min wage 2020 $s decreases as CPI average increases


# work_stop.to_pickle(".\\PrelimEDA\\work_stop.pkl")
# finalfull.to_pickle(".\\PrelimEDA\\finalfull.pkl")
# final_earnings.to_pickle(".\\PrelimEDA\\final_earnings.pkl")
# minwagestate.to_pickle(".\\PrelimEDA\\minwagestate.pkl")

# final_earnings.industry_name.value_counts()

# finalfull.to_csv('finalfull.csv', index = False)
# final_earnings.to_csv('final_earnings.csv', index = False)

FileNotFoundError: [Errno 2] No such file or directory: '.\\PrelimEDA\\work_stop.pkl'

#  Part 2 | Data Visualizations:

## Minimum Wage by State:

In [None]:
#Animated US choropleth of the effective minimum wage, one animation frame per year.
df = minwagestate

#Display names used for the colour bar and the hover box.
min_wage_labels = {
    'Effective.Minimum.Wage': 'Effective Minimum Wage',
    'Effective.Minimum.Wage.2020.Dollars': '2020 Dollars Equivalent',
    'CPI.Average': 'Average Consumer Price Index',
}

#True shows a column in the hover box, False hides it (hover_name already shows the state).
min_wage_hover = {
    'StateCode': False,
    'State': False,
    'Effective.Minimum.Wage.2020.Dollars': True,
    'CPI.Average': True,
    'Effective.Minimum.Wage': True,
    'Year': False,
}

#Can add back in to show difference, but this sets scale for all years
#range_color = (df['Effective.Minimum.Wage'].min(), df['Effective.Minimum.Wage'].max())

#Without, scale is set on year-by-year basis:
fig1 = px.choropleth(
    df,
    locations = df['StateCode'],
    locationmode = 'USA-states',
    scope = 'usa',
    color = 'Effective.Minimum.Wage',
    color_continuous_scale = "speed",
    labels = min_wage_labels,
    hover_name = 'State',
    hover_data = min_wage_hover,
    animation_frame = 'Year',
)

fig1.update_layout(title = 'Minimum Wage by State Since 1968')
fig1.show()

## Average Weekly Earnings by State:

In [None]:
#US choropleth of mean weekly earnings per state (one value per state in earnState).
df = earnState

earnings_map_options = dict(
    locations = df['StateCode'],
    locationmode = 'USA-states',
    scope = 'usa',
    color = 'AvgWeeklyEarnings',
    color_continuous_scale = "speed",
    labels = {'AvgWeeklyEarnings': 'Average Weekly Earnings'},
    hover_name = 'StateCode',
    #The state code is already the hover title, so hide the duplicate entry.
    hover_data = {'StateCode': False},
)
fig1 = px.choropleth(df, **earnings_map_options)

fig1.update_layout(title = 'Average Weekly Earnings by State Since 2007')
fig1.show()

## Strikes by State:

In [None]:
#US choropleth of the number of work stoppages recorded for each state.
df = stateCounts

strike_map_options = dict(
    locations = df['State'],
    locationmode = 'USA-states',
    scope = 'usa',
    color = 'Counts',
    color_continuous_scale = "speed",
    labels = {'Counts': 'Number of Strikes'},
    hover_name = 'State',
    #The state is already the hover title, so hide the duplicate entry.
    hover_data = {'State': False},
)
fig1 = px.choropleth(df, **strike_map_options)

fig1.update_layout(title = 'Strikes by State Since 1988')
fig1.show()

In [None]:
#Bar chart of the number of strikes per state since 1988; bars are also coloured by count.
state_bar_options = dict(
    x = 'State',
    y = 'Counts',
    color = 'Counts',
    color_continuous_scale = 'speed',
    hover_data = ['Counts', 'State'],
    labels = {'Counts':'Number of Strikes'},
    height = 400,
    width = 1030,
    title = 'Number of Strikes Since 1988 by State',
)
fig = px.bar(stateCounts, **state_bar_options)

#Slant the state labels so they do not overlap.
fig.update_xaxes(tickangle = 75)
fig.update_layout(plot_bgcolor = 'white')
fig.show()

## Strikes by Industry:

In [None]:
#Vertical bar chart of the number of strikes per industry since 1988, coloured by count.
industry_bar_options = dict(
    x = 'Industry',
    y = 'Counts',
    orientation = 'v',
    color = 'Counts',
    color_continuous_scale = 'solar_r',
    hover_data = ['Counts', 'Industry'],
    labels = {'Counts':'Number of Strikes'},
    height = 1030,
    width = 1050,
    title = 'Number of Strikes Since 1988 by Industry',
)
fig = px.bar(industryCounts, **industry_bar_options)

#Slant the industry titles so they do not overlap.
fig.update_xaxes(tickangle = 65)
fig.update_layout(plot_bgcolor = 'black')
fig.show()
#Open question: do states with lower minimum wage tend to have more strikes?

## Average Weekly Earnings by Industry Since 2007:

In [None]:
#TODO: possibly add income-by-industry data here.
#Bar chart of average weekly earnings per industry, coloured by the same value.
earnings_bar_options = dict(
    x = 'industry_name',
    y = 'AvgWeeklyEarnings',
    color = 'AvgWeeklyEarnings',
    color_continuous_scale = 'speed',
    hover_data = ['AvgWeeklyEarnings', 'industry_name'],
    labels = {'AvgWeeklyEarnings':'Average Weekly Earnings', 'industry_name': 'Industry'},
    height = 750,
    width = 1000,
    title = 'Average Weekly Earnings by Industry since 2007',
)
fig = px.bar(earnInd, **earnings_bar_options)

#Slant the industry names so they do not overlap.
fig.update_xaxes(tickangle = 75)
fig.update_layout(plot_bgcolor = 'white')
fig.show()

# Part 3 | Statistical Tests:

In [None]:
def _anova_with_tukey(frame, value_col, group_col, report_path=None, alpha=0.05):
    """Run a one-way ANOVA and Tukey HSD pairwise comparison of value_col across group_col.

    Parameters:
        frame: DataFrame holding both columns.
        value_col: name of the numeric column being compared.
        group_col: name of the column whose distinct values define the groups.
        report_path: optional file to write the Tukey summary to (useful when there
            are too many groups to read on screen).
        alpha: significance level passed to pairwise_tukeyhsd.

    Returns:
        (f_statistic, p_value, tukey_result), where tukey_result is the object
        returned by pairwise_tukeyhsd.
    """
    #One list of observations per distinct group, in order of first appearance.
    samples = [frame.loc[frame[group_col] == level, value_col].tolist()
               for level in frame[group_col].unique()]
    f_statistic, p_value = sts.f_oneway(*samples)

    tukey_result = pairwise_tukeyhsd(endog=frame[value_col], groups=frame[group_col],
                                     alpha=alpha)
    if report_path is not None:
        with open(report_path, 'w') as report:
            print(tukey_result.summary(), file=report)
    return f_statistic, p_value, tukey_result


#One way ANOVA on States Minimum Wage:
#Have to look at in text file, too many groups:
fvalue, pvalue, m_comp = _anova_with_tukey(
    minwagestate, 'Effective.Minimum.Wage.2020.Dollars', 'State',
    report_path='mcomp_stateminwage.txt')
print(m_comp.summary())


#One way ANOVA on Avg Wage States:
#Have to look at in text file, too many groups:
fvalue_statewage, pvalue_statewage, m_comp_statewage = _anova_with_tukey(
    final_earnings, 'AvgWeeklyEarnings', 'state_name',
    report_path='mcomp_statewage.txt')

#One way ANOVA for Avg Wage by Industry:
fvalue_industrywage, pvalue_industrywage, m_comp_industrywage = _anova_with_tukey(
    final_earnings, 'AvgWeeklyEarnings', 'industry_name',
    report_path='mcomp_industrywage.txt')

#Classify each state by how often its own minimum wage exceeded the federal one:
#  - per state-year: 1 if effective minimum wage > federal minimum wage, else 0
#  - per state: mean of those flags, rounded to two decimals
#  - mean >= threshold -> 'GreaterMinWage', otherwise 'MinWage'
GREATER_MIN_WAGE_SHARE = 0.4


#df: states, strikes, minwage or not
minwagestate['MinWageStatus'] = np.where((minwagestate['Effective.Minimum.Wage']-minwagestate['Federal.Minimum.Wage']) > 0, 1, 0)
state_status = minwagestate.groupby(['State'])['MinWageStatus'].mean().reset_index()
state_status['MinWageStatus'] = np.round(state_status['MinWageStatus'], 2)

state_status['MinWageStatus'] = np.where(state_status['MinWageStatus'] >= GREATER_MIN_WAGE_SHARE, 'GreaterMinWage', 'MinWage')

#Get full names for stateCounts in statestrikes (abbreviation -> full state name).
#Assigned back explicitly: an in-place replace on an attribute-accessed column is not
#guaranteed to modify the frame.
statestrikes = stateCounts.copy()
statestrikes['State'] = statestrikes['State'].replace({v:k for k,v in us_state_abbrev.items()})

#Merge on state_status state, WY and SD are missing, need to be dropped:
wagestrike = pd.merge(state_status, statestrikes, on = 'State', how = 'left').dropna()

#One way ANOVA for MinWage and Strikes by State:
fvalue_strikes, pvalue_strikes, m_comp_strikes = _anova_with_tukey(
    wagestrike, 'Counts', 'MinWageStatus')

print(m_comp_strikes.summary())
#Fail to reject the null hypothesis: the difference is not statistically significant.
#Minimum wage states do have fewer strikes, but the gap is not significantly different
#and could still be due to chance.