In [196]:
import pandas as pd
import os
import matplotlib as mpl
import matplotlib.pyplot as plt

In [197]:

def read_data(file_name: str, hypothesis_flag) -> pd.DataFrame:
    """
    Load file containg details of LCA and retain only useful columns.
    Replace the column names with meaningful heads.
    :param filename: The path to the LCA data file.
    """

    col_required = ['STATUS', 'CASE_STATUS', 'LCA_CASE_EMPLOYER_NAME', 'EMPLOYER_NAME', 'TOTAL_WORKERS',
                    'TOTAL_WORKER_POSITIONS',
                    'LCA_CASE_WORKLOC1_STATE', 'VISA_CLASS', 'LCA_CASE_NUMBER', 'LCA_CASE_NAICS_CODE', 'NAICS_CODE',
                    'TOTAL WORKERS', 'WORKSITE_STATE', 'WORKSITE_STATE_1', 'CASE_NUMBER', 'NAIC_CODE']

    df = pd.read_csv(filepath_or_buffer=file_name, usecols=lambda x: x in col_required,
                     dtype={'LCA_CASE_NAICS_CODE': 'str',
                            'NAICS_CODE': 'str',
                            'NAIC_CODE': 'str',
                            },
                     low_memory=False, encoding='ISO-8859-1')

    df = df.rename(
        columns={'LCA_CASE_NUMBER': 'CASE_NUMBER', 'CASE_STATUS': 'STATUS', 'LCA_CASE_EMPLOYER_NAME': 'EMPLOYER_NAME',
                 'TOTAL_WORKER_POSITIONS': 'TOTAL_WORKERS', 'LCA_CASE_WORKLOC1_STATE': 'WORKSITE_STATE',
                 'LCA_CASE_NAICS_CODE': 'NAICS_CODE', 'WORKSITE_STATE_1': 'WORKSITE_STATE', 'NAIC_CODE': 'NAICS_CODE',
                 'TOTAL WORKERS': 'TOTAL_WORKERS'})
    if hypothesis_flag == False:
            df['STATUS'] = df["STATUS"].str.upper()
            df = df[(df['STATUS'] == 'CERTIFIED') & (df['VISA_CLASS'] == 'H-1B')]
            df['NAICS_CODE'] = df['NAICS_CODE'].str[:2]
            return (df)
    elif hypothesis_flag == True:
            df['STATUS'] = df["STATUS"].str.upper()            
            df['NAICS_CODE'] = df['NAICS_CODE'].str[:2]
            df = df[df['VISA_CLASS'] == 'H-1B']
            return (df)

In [198]:
def sector_range(row):
    if isinstance(row, list) and len(row) > 1:
        return list(range(int(row[0]), int(row[1]) + 1))
    elif isinstance(row, list):
        return row[0]

def read_sector_data (filename : str) -> pd.DataFrame:
    """
    Load the NAICS data file, retaining only the most useful columns & rows.
    Change the layout for a few rows to make the data inclusive of a codes.
    :param filename: The path to the NAICS code data file.
 
    """
    sector_df = pd.read_csv(filepath_or_buffer=filename,encoding='ISO-8859-1')
    sector_df["Sector"]=sector_df["Sector"].str.split("-")
    sector_df['sector_range'] = sector_df["Sector"].apply(sector_range)
    sector_codes = sector_df['sector_range'].apply(pd.Series).reset_index().melt(id_vars='index').dropna()[['index', 'value']].set_index('index')
    sector_codes_final = sector_codes.merge(sector_df['Name'], left_index=True, right_index=True, how='inner')
    sector_codes_final = sector_codes_final.rename(columns={"value": "Sector"})
    sector_codes_final['Sector']= sector_codes_final['Sector'].astype('int8')
    return(sector_codes_final)

def hypothesis_one_cal(year_df, sector_data_df, yy):
    year_df =year_df.merge(sector_data_df, how='left', left_on='NAICS_CODE', right_on='Sector')
    stats_df = year_df.groupby(['Name'])['TOTAL_WORKERS'].sum().astype('int32').reset_index(name=yy)
    return( stats_df)

def hypothesis_one(file_list):
    sector_df = read_sector_data ('2017_NAICS_Structure_Summary_Table.csv')
    sector_df['Sector'] = sector_df['Sector'].astype('str')
    sector_name = sector_df.Name.unique().tolist()
    plot_data_df = pd.DataFrame()
    plot_data_df['Sectors']= sector_name
    for file in file_list:
        file_name = "data_H1B/" + file
        hypothesis_flag = False
        year_data = read_data (file_name, hypothesis_flag)
        year = '20' + file[7:9]
        stats = hypothesis_one_cal(year_data,sector_df,year)
        plot_data_df = plot_data_df.merge(stats, how='left', left_on='Sectors', right_on='Name')
        del plot_data_df['Name']
    return(plot_data_df)

In [199]:
def hypothesis_two(directory):
    country = ['China - mainland', 'China - Taiwan','India', 'Korea, South', 'Mexico','Brazil', 'Australia', 'Russia','Great Britain and Northern Ireland',
                'Germany','France', 'Philippines']
    sd= pd.DataFrame({"Nationality": country})
    for file in directory:
        col_name = 'Fiscal Year 20' + file[2:4]
        file_name = "data_Country/" + file
        df = pd.read_csv(filepath_or_buffer=file_name,thousands=',', dtype= {'H-1B':'float'})
        df=df.rename(columns={'Unnamed: 0': 'Visa_Country', col_name: 'Visa_Country'})
        df= df[['Visa_Country','H-1B']]
        df= df.rename(columns ={'H-1B': col_name})
        sd =sd.merge(df, how='left',  left_on= 'Nationality', right_on='Visa_Country')
        del sd['Visa_Country']
        plot_hypothesis_two(sd)
    return (sd)
def plot_hypothesis_two(data_plot):
    data_plot = data_plot.set_index('Nationality')
    data_plot = data_plot.T
    data_plot = data_plot.pct_change()
    data_plot.plot.barh(legend = False)

In [200]:
def df_hypothesis_three(file_list):
    company_df = pd.read_csv("companylist.csv", dtype={'MarketCap': 'float64'})
    final_df = pd.DataFrame()
    list_of_df = []
    for file in file_list:
        file_name = "data_H1B/" + file
        hypothesis_flag = True
        year_data = read_data (file_name, hypothesis_flag)
        year = '20' + file[7:9]
        company_df['name_lower'] = company_df['Name'].str.lower()
        year_data['EMPLOYER_NAME_lower'] = year_data['EMPLOYER_NAME'].str.lower()
        companylist_merged_df = year_data.merge(company_df, left_on='EMPLOYER_NAME_lower', right_on='name_lower')
        companylist_merged_df_certified = pd.DataFrame()
        companylist_merged_df_total = pd.DataFrame()
        companylist_merged_df_total = companylist_merged_df.groupby(['EMPLOYER_NAME','MarketCap'])['TOTAL_WORKERS'].sum().reset_index(name="TOTAL_WORKERS_OVERALL")
        companylist_merged_df_certified = companylist_merged_df[companylist_merged_df['STATUS'] == 'CERTIFIED'].groupby(['EMPLOYER_NAME','MarketCap'])['TOTAL_WORKERS'].sum().reset_index(name="CERTIFIED_TOTAL_WORKERS")
        companylist_merged_df_certified['year'] = year
        companylist_merged_df_total['year'] = year
        new_data = companylist_merged_df_certified.merge(companylist_merged_df_total, on='EMPLOYER_NAME')  
        new_data['MarketCap_x'].astype('float64')
        new_data['Rate'] = (new_data['CERTIFIED_TOTAL_WORKERS'] / new_data['TOTAL_WORKERS_OVERALL'])*100
        new_data = new_data[['EMPLOYER_NAME', 'Rate', 'MarketCap_x','year_x']]
        #print(new_data.sort_values(by='Rate', ascending=False))
        list_of_df.append(new_data)
    final_df = pd.concat(list_of_df,ignore_index=True)
    return final_df   

In [201]:
def hypothesis_two_state_cal(states_data_df, year_df, yy):
    year_df =year_df.merge(states_data_df, how='left', left_on='WORKSITE_STATE', right_on='Abbreviation')
    stats_df = year_df.groupby(['State'])['TOTAL_WORKERS'].sum().reset_index(name=yy)
    return( stats_df)

def hypothesis_two_state(file_list):
    states_df = pd.read_csv('states.csv')
    state_name = states_df.State.unique().tolist()
    plot_data_df = pd.DataFrame()
    plot_data_df['Worksite State']= state_name
    for file in file_list:
        file_name = "data_H1B/" + file
        hypothesis_flag = False
        year_data = read_data (file_name, hypothesis_flag)
        year = '20' + file[7:9]
        stats = hypothesis_two_state_cal(states_df,year_data, year)
        plot_data_df = plot_data_df.merge(stats, how='left', left_on='Worksite State', right_on='State')
        del plot_data_df['State']
    plot_data_df = plot_data_df.set_index('Worksite State')
    plot_data_df = plot_data_df.apply(lambda s: pd.Series(s.nlargest(5).index))
    
    return(plot_data_df)

In [202]:

if __name__ == '__main__':
    path = "data_H1B/"
    directory = os.listdir(path)
    #path2 = "data_Country/"
    #directory2 =os.listdir(path2)
    #df = hypothesis_one(directory)
    #sd= hypothesis_two(directory2)
    #df = hypothesis_two_state(directory)
    #df = hypothesis_one(directory)
    hypothesis3_df=df_hypothesis_three(directory)   

In [8]:
df

Unnamed: 0_level_0,2011,2012,2013,2014,2015,2016,2017,2018,2019,2020
Worksite State,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
Alabama,2063.0,1845.0,1611.0,2513.0,2385.0,2396,2321,2916,203.0,1675
Alaska,303.0,233.0,255.0,291.0,207.0,313,215,187,9.0,80
Arizona,9254.0,11840.0,13094.0,13195.0,19412.0,16330,14573,14619,1849.0,12725
Arkansas,4043.0,4478.0,4323.0,5315.0,6848.0,5368,4087,4429,587.0,4747
California,114603.0,138504.0,158011.0,168098.0,203570.0,221926,245286,300444,33722.0,249368
Colorado,9492.0,10937.0,13678.0,11920.0,14856.0,12849,13113,13674,1713.0,10874
Connecticut,15470.0,20645.0,17196.0,19875.0,24132.0,21433,15152,16316,1838.0,11026
Delaware,4101.0,5045.0,6182.0,4946.0,7653.0,6014,5187,6983,399.0,3139
District of Columbia,4853.0,4618.0,4439.0,5733.0,5450.0,6591,5865,5912,794.0,4470
Florida,24464.0,26797.0,28648.0,28949.0,38414.0,42834,37777,44073,5210.0,22389


In [203]:
hypothesis3_df

Unnamed: 0,EMPLOYER_NAME,Rate,MarketCap_x,year_x
0,"ABIOMED, INC.",100.000000,1.620249e+10,2011
1,ACCURAY INCORPORATED,100.000000,4.481857e+08,2011
2,"ACI WORLDWIDE, INC.",90.909091,3.577972e+09,2011
3,"ACORDA THERAPEUTICS, INC.",75.000000,6.472690e+08,2011
4,ADESTO TECHNOLOGIES CORPORATION,100.000000,1.654199e+08,2011
...,...,...,...,...
4693,eBay Inc.,98.969072,3.389632e+10,2020
4694,eGain Corporation,100.000000,3.239460e+08,2020
4695,"iRhythm Technologies, Inc.",100.000000,2.218932e+09,2020
4696,iRobot Corporation,100.000000,3.336530e+09,2020


In [204]:
hypothesis3_df[hypothesis3_df.year_x == '2011'].sort_values(by='Rate', ascending=False).head(5)

Unnamed: 0,EMPLOYER_NAME,Rate,MarketCap_x,year_x
0,"ABIOMED, INC.",100.0,16202490000.0,2011
265,"NXSTAGE MEDICAL, INC.",100.0,1977201000.0,2011
129,"ENPHASE ENERGY, INC.",100.0,803840500.0,2011
274,"PACIFIC ETHANOL, INC.",100.0,52574700.0,2011
131,EPLUS INC.,100.0,1243113000.0,2011


In [205]:
hypothesis3_df[hypothesis3_df.year_x == '2011'].sort_values(by='Rate', ascending=False).tail(5)

Unnamed: 0,EMPLOYER_NAME,Rate,MarketCap_x,year_x
145,"EYEGATE PHARMACEUTICALS, INC.",50.0,19549750.0,2011
144,"EYEGATE PHARMACEUTICALS, INC.",50.0,19549750.0,2011
326,"SHOTSPOTTER, INC.",33.333333,521500000.0,2011
200,IROBOT CORPORATION,25.0,3336530000.0,2011
72,CEMTREX INC.,25.0,11194810.0,2011


In [206]:
hypothesis3_df[hypothesis3_df.year_x == '2015'].sort_values(by='Rate', ascending=False).head(5)

Unnamed: 0,EMPLOYER_NAME,Rate,MarketCap_x,year_x
1781,"CELLECTAR BIOSCIENCES, INC.",200.0,0.0,2015
1696,"2U, INC.",100.0,4011720000.0,2015
1931,"LIFETIME BRANDS, INC.",100.0,223400700.0,2015
1946,"MERRIMACK PHARMACEUTICALS, INC.",100.0,79122710.0,2015
1945,"MERIT MEDICAL SYSTEMS, INC.",100.0,3292758000.0,2015


In [208]:
hypothesis3_df[hypothesis3_df.year_x == '2017'].sort_values(by='Rate', ascending=False).tail(5)

Unnamed: 0,EMPLOYER_NAME,Rate,MarketCap_x,year_x
2858,LIBERTY MEDIA CORPORATION,25.0,13218960000.0,2017
2854,LIBERTY MEDIA CORPORATION,25.0,7091870000.0,2017
2850,LIBERTY MEDIA CORPORATION,25.0,1473893000.0,2017
2844,LATTICE SEMICONDUCTOR CORPORATION,25.0,1454566000.0,2017
2749,"ETSY, INC.",18.181818,6659031000.0,2017


In [209]:
hypothesis3_df[hypothesis3_df.year_x == '2017'].sort_values(by='Rate', ascending=False).tail(5)

Unnamed: 0,EMPLOYER_NAME,Rate,MarketCap_x,year_x
2858,LIBERTY MEDIA CORPORATION,25.0,13218960000.0,2017
2854,LIBERTY MEDIA CORPORATION,25.0,7091870000.0,2017
2850,LIBERTY MEDIA CORPORATION,25.0,1473893000.0,2017
2844,LATTICE SEMICONDUCTOR CORPORATION,25.0,1454566000.0,2017
2749,"ETSY, INC.",18.181818,6659031000.0,2017


In [None]:
hypothesis3_df[hypothesis3_df.year_x == '2011'].sort_values(by='Rate', ascending=False).head(5)