In [1]:
import pandas as pd
import os
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
def read_data(file_name: str, hypothesis_flag: bool = False) -> pd.DataFrame:
    """
    Load one LCA disclosure file and normalise it to a common column layout.

    Only the columns listed in ``col_required`` are read; the differently
    spelled headers are then renamed to one shared set of names
    (``CASE_NUMBER``, ``STATUS``, ``EMPLOYER_NAME``, ``TOTAL_WORKERS``,
    ``WORKSITE_STATE``, ``NAICS_CODE``).  ``STATUS`` is upper-cased and
    ``NAICS_CODE`` is cut down to its first two characters.

    :param file_name: The path to the LCA data file.
    :param hypothesis_flag: When falsy (the default) only rows whose STATUS is
        'CERTIFIED' and whose VISA_CLASS is 'H-1B' are kept.  When truthy every
        'H-1B' row is kept regardless of STATUS.
    :return: The filtered DataFrame (original row index is preserved).
    """

    col_required = {'STATUS', 'CASE_STATUS', 'LCA_CASE_EMPLOYER_NAME', 'EMPLOYER_NAME', 'TOTAL_WORKERS',
                    'TOTAL_WORKER_POSITIONS',
                    'LCA_CASE_WORKLOC1_STATE', 'VISA_CLASS', 'LCA_CASE_NUMBER', 'LCA_CASE_NAICS_CODE', 'NAICS_CODE',
                    'TOTAL WORKERS', 'WORKSITE_STATE', 'WORKSITE_STATE_1', 'CASE_NUMBER', 'NAIC_CODE'}

    # The industry-code columns are read as strings so they can be sliced below.
    df = pd.read_csv(filepath_or_buffer=file_name, usecols=lambda x: x in col_required,
                     dtype={'LCA_CASE_NAICS_CODE': 'str',
                            'NAICS_CODE': 'str',
                            'NAIC_CODE': 'str',
                            },
                     low_memory=False, encoding='ISO-8859-1')

    df = df.rename(
        columns={'LCA_CASE_NUMBER': 'CASE_NUMBER', 'CASE_STATUS': 'STATUS', 'LCA_CASE_EMPLOYER_NAME': 'EMPLOYER_NAME',
                 'TOTAL_WORKER_POSITIONS': 'TOTAL_WORKERS', 'LCA_CASE_WORKLOC1_STATE': 'WORKSITE_STATE',
                 'LCA_CASE_NAICS_CODE': 'NAICS_CODE', 'WORKSITE_STATE_1': 'WORKSITE_STATE', 'NAIC_CODE': 'NAICS_CODE',
                 'TOTAL WORKERS': 'TOTAL_WORKERS'})

    # Normalise on the full frame *before* filtering, so no column is ever
    # assigned on a filtered slice (which triggers SettingWithCopyWarning).
    df['STATUS'] = df['STATUS'].str.upper()
    df['NAICS_CODE'] = df['NAICS_CODE'].str[:2]

    keep = df['VISA_CLASS'] == 'H-1B'
    if not hypothesis_flag:
        keep &= df['STATUS'] == 'CERTIFIED'
    return df[keep]
        
        

In [None]:
def sector_range(row):
    """
    Expand a split sector entry into the codes it covers.

    :param row: Expected to be a list of code strings; anything that is not a
        list yields None.
    :return: For a list with two or more items, the list of integers from the
        first item to the second item inclusive.  For a shorter list, its first
        item unchanged (not converted to int).
    """
    if not isinstance(row, list):
        return None
    if len(row) <= 1:
        # Single code: hand back the raw element as-is.
        return row[0]
    first, last = int(row[0]), int(row[1])
    return list(range(first, last + 1))

In [None]:
def read_sector_data (filename : str) -> pd.DataFrame:
    """
    Load the NAICS data file and return one row per individual sector code.

    The "Sector" column is split on "-" and passed through ``sector_range`` so
    that an entry such as "31-33" contributes a separate row for every code in
    that span, each carrying the "Name" of the row it came from.

    :param filename: The path to the NAICS code data file.
    :return: DataFrame with columns "Sector" (int8) and "Name", indexed by the
        row position of the originating line in the source file.
    """
    naics = pd.read_csv(filename, encoding='ISO-8859-1')
    naics["Sector"] = naics["Sector"].str.split("-")
    naics['sector_range'] = naics["Sector"].apply(sector_range)

    # One column per expanded code, then unpivot to one code per row.
    wide = naics['sector_range'].apply(pd.Series).reset_index()
    long = wide.melt(id_vars='index').dropna()
    codes = long[['index', 'value']].set_index('index')

    # Re-attach the sector name through the shared original row index.
    labelled = codes.merge(naics['Name'], left_index=True, right_index=True, how='inner')
    labelled = labelled.rename(columns={"value": "Sector"})
    labelled['Sector'] = labelled['Sector'].astype('int8')
    return labelled



In [None]:
def hypothesis_one_cal(year_df, sector_data_df, yy):
    """
    Total the workers per sector name for a single year's data.

    :param year_df: Frame with at least "NAICS_CODE" and "TOTAL_WORKERS".
    :param sector_data_df: Lookup frame with "Sector" and "Name"; "Sector" is
        matched against "NAICS_CODE".
    :param yy: Label used as the name of the resulting totals column.
    :return: DataFrame with columns "Name" and ``yy`` (int32 totals).  Rows of
        ``year_df`` without a matching sector name do not contribute.
    """
    labelled = year_df.merge(sector_data_df, how='left', left_on='NAICS_CODE', right_on='Sector')
    workers_by_sector = labelled.groupby(['Name'])['TOTAL_WORKERS'].sum()
    return workers_by_sector.astype('int32').reset_index(name=yy)

def hypothesis_one(file_list):
    """
    Build a sector-by-year table of certified H-1B worker totals.

    :param file_list: File names located under "data_H1B/".  Characters 7-8 of
        each name are appended to "20" to form that file's column label
        (presumably the two-digit fiscal year -- verify against the actual
        file naming).
    :return: DataFrame with a "Sectors" column followed by one totals column
        per processed file.
    """
    sectors = read_sector_data('2017_NAICS_Structure_Summary_Table.csv')
    # NAICS codes in the yearly data are strings, so the lookup key must be too.
    sectors['Sector'] = sectors['Sector'].astype('str')

    table = pd.DataFrame()
    table['Sectors'] = sectors.Name.unique().tolist()

    for file in file_list:
        certified = read_data("data_H1B/" + file, False)
        yearly = hypothesis_one_cal(certified, sectors, '20' + file[7:9])
        table = table.merge(yearly, how='left', left_on='Sectors', right_on='Name')
        # "Name" duplicates "Sectors" after the merge.
        del table['Name']
    return table

In [None]:
def hypothesis_two(directory):
    """
    Collect the H-1B figure per nationality from each country file.

    :param directory: Iterable of file names located under "data_Country/".
        Characters 2-3 of each name are appended to "Fiscal Year 20" to form
        both the expected country-column header and the output column label.
    :return: DataFrame with a "Nationality" column (a fixed list of twelve
        entries) followed by one column per processed file.
    """
    nationalities = ['China - mainland', 'China - Taiwan','India', 'Korea, South', 'Mexico','Brazil', 'Australia', 'Russia','Great Britain and Northern Ireland',
                     'Germany','France', 'Philippines']
    summary = pd.DataFrame({"Nationality": nationalities})

    for file in directory:
        year_label = 'Fiscal Year 20' + file[2:4]
        visas = pd.read_csv("data_Country/" + file, thousands=',', dtype={'H-1B': 'float'})
        # The country column is headed either 'Unnamed: 0' or the fiscal-year label.
        visas = visas.rename(columns={'Unnamed: 0': 'Visa_Country', year_label: 'Visa_Country'})
        visas = visas[['Visa_Country', 'H-1B']].rename(columns={'H-1B': year_label})
        summary = summary.merge(visas, how='left', left_on='Nationality', right_on='Visa_Country')
        del summary['Visa_Country']
    return summary

In [None]:
def df_hypothesis_three(file_list):
    """
    Report certified H-1B worker totals for the companies in "companylist.csv".

    For every file, employers are matched to the company list by lower-cased
    name, the certified rows are totalled per (EMPLOYER_NAME, MarketCap) and
    the result is printed sorted by the total, largest first.

    :param file_list: File names located under "data_H1B/".  Characters 7-8 of
        each name are appended to "20" to form the value of the "year" column.
    :return: All per-file results stacked into one DataFrame with columns
        "EMPLOYER_NAME", "MarketCap", "CERTIFIED_TOTAL_WORKERS" and "year"
        (empty when ``file_list`` is empty).  Earlier versions returned None,
        so callers that ignore the result are unaffected.
    """
    result_columns = ['EMPLOYER_NAME', 'MarketCap', 'CERTIFIED_TOTAL_WORKERS', 'year']

    company_df = pd.read_csv("companylist.csv", dtype={'MarketCap': 'float64'})
    # The match key does not depend on the file, so compute it once.
    company_df['name_lower'] = company_df['Name'].str.lower()

    yearly_frames = []
    for file in file_list:
        year_data = read_data("data_H1B/" + file, True)
        year = '20' + file[7:9]

        # assign() returns a new frame, so the filtered frame handed back by
        # read_data is never written to (avoids SettingWithCopyWarning).
        matched = year_data.assign(
            EMPLOYER_NAME_lower=year_data['EMPLOYER_NAME'].str.lower()
        ).merge(company_df, left_on='EMPLOYER_NAME_lower', right_on='name_lower')

        certified = matched[matched['STATUS'] == 'CERTIFIED']
        totals = (certified
                  .groupby(['EMPLOYER_NAME', 'MarketCap'])['TOTAL_WORKERS']
                  .sum()
                  .reset_index(name="CERTIFIED_TOTAL_WORKERS"))
        totals['year'] = year

        print(totals.sort_values(by='CERTIFIED_TOTAL_WORKERS', ascending=False))
        yearly_frames.append(totals)

    if not yearly_frames:
        return pd.DataFrame(columns=result_columns)
    return pd.concat(yearly_frames, ignore_index=True)
   

In [None]:
def hypothesis_two_state_cal(states_data_df, year_df, yy):
    """
    Total the workers per state name for a single year's data.

    :param states_data_df: Lookup frame with "State" and "Abbreviation";
        "Abbreviation" is matched against "WORKSITE_STATE".
    :param year_df: Frame with at least "WORKSITE_STATE" and "TOTAL_WORKERS".
    :param yy: Label used as the name of the resulting totals column.
    :return: DataFrame with columns "State" and ``yy``.  Rows whose
        WORKSITE_STATE has no match in the lookup do not contribute.
    """
    year_df = year_df.merge(states_data_df, how='left', left_on='WORKSITE_STATE', right_on='Abbreviation')
    # Group by the looked-up state name: hypothesis_two_state joins the result
    # on a "State" column, which grouping by WORKSITE_STATE never produced.
    stats_df = year_df.groupby(['State'])['TOTAL_WORKERS'].sum().reset_index(name=yy)
    return stats_df


def hypothesis_two_state(file_list):
    """
    Build a state-by-year table of certified H-1B worker totals.

    :param file_list: File names located under "data_H1B/".  Characters 7-8 of
        each name are appended to "20" to form that file's column label.
    :return: DataFrame with a "Worksite State" column (state names read from
        "states.csv") followed by one totals column per processed file.
    """
    states_df = pd.read_csv('states.csv')
    plot_data_df = pd.DataFrame()
    plot_data_df['Worksite State'] = states_df.State.unique().tolist()
    for file in file_list:
        # read_data needs the flag spelled out; False keeps certified H-1B rows only.
        year_data = read_data("data_H1B/" + file, False)
        year = '20' + file[7:9]
        stats = hypothesis_two_state_cal(states_df, year_data, year)
        plot_data_df = plot_data_df.merge(stats, how='left', left_on='Worksite State', right_on='State')
        # "State" duplicates "Worksite State" after the merge.
        del plot_data_df['State']
    return plot_data_df

In [None]:
if __name__ == '__main__':
    # os.listdir returns entries in arbitrary order; sorting makes the order of
    # the per-file columns and printouts reproducible from run to run.
    path = "data_H1B/"
    directory = sorted(os.listdir(path))
    path2 = "data_Country/"
    directory2 = sorted(os.listdir(path2))

    # Other analyses, enable as needed:
    # df = hypothesis_one(directory)
    # df = hypothesis_two_state(directory)
    sd = hypothesis_two(directory2)
    df_hypothesis_three(directory)
   