# Below Minimum Wage

## Intro

This script calculates the hourly wages in the different Sectors of the Israeli economy and shows their distributions with histograms, according to the CBS Households Expenditures 2018 Survey data.

## The Code

### Calculations

Firstly, we import the required libraries.

In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

Then we define some basic functions to invert strings and to calculate a weighted median.

In [None]:
def weighted_median(data, weights, interpolate = False):
    """
    A function that calculates the weighted median of a given series of values
    by using a series of weights.

    Parameters
    ----------
    data : Iterable
        The data which the function calculates the median for.
    weights : Iterable
        The weights the function uses to calculate the weighted median.
        Must have the same shape as the data.
    interpolate : bool
        If True, the median is linearly interpolated at the midpoint of the
        cumulative weights. If False, the function returns the first sorted
        value whose cumulative weight reaches half of the total weight.
        The default value is False.

    Returns
    -------
    numpy scalar
        The weighted median. Without interpolation it is one of the values
        of the data; with interpolation it is a float.

    Raises
    ------
    ValueError
        If the data is empty, or if the data and the weights differ in shape.

    Required libraries
    ------------------
    Numpy.
    """
    # Forcing the data and the weights to numpy arrays.
    data = np.asarray(data)
    weights = np.asarray(weights)

    # Rejecting mismatched inputs: longer weights would silently be paired with
    # the wrong values, and shorter ones would fail with an unrelated IndexError.
    if data.shape != weights.shape:
        raise ValueError('data and weights must have the same shape, got '
                         + str(data.shape) + ' and ' + str(weights.shape) + '.')
    if data.size == 0:
        raise ValueError('Cannot calculate the weighted median of empty data.')

    # Sorting the data, and carrying the weights along in the same order.
    ind_sorted = np.argsort(data)
    sorted_data = data[ind_sorted]
    sorted_weights = weights[ind_sorted]

    # Calculating the cumulative sum of the weights and their total.
    sn = np.cumsum(sorted_weights)
    total_weight = sorted_weights.sum()

    # Interpolating the median and returning it. Each value is placed at the
    # middle of its own weight, as a fraction of the total weight.
    if interpolate:
        return np.interp(0.5, (sn - 0.5 * sorted_weights) / total_weight, sorted_data)

    # Returning the first value whose cumulative weight equals or is larger than the threshold.
    threshold = total_weight / 2
    return sorted_data[sn >= threshold][0]

def invert(string):
    """
    A function which returns the given string with its characters in reverse order.

    Parameters
    ----------
    string : string
        The string to invert.

    Returns
    -------
    string
        The inverted string.

    Required libraries
    ------------------
    None.
    """
    # A slice with a step of -1 walks from the last character to the first.
    backwards = slice(None, None, -1)
    return string[backwards]

After that we import the data. Enter the address of the folder that contains the data files in the base_address variable and don't erase the 'r'.

In [None]:
# The folder that holds the data files. Enter the address here and don't erase the 'r'.
base_address = r''
# Building the paths with pathlib, so the separator fits the operating system
# instead of being a hard-coded Windows backslash.
data_folder = Path(base_address)
# The survey data.
prat = pd.read_csv(data_folder / 'H20181023dataprat.csv')
# The names of the Sections, indexed by the 'Code' column.
anaf_names = pd.read_csv(data_folder / 'anaf_names.csv', index_col = 'Code')

We then replace the literal `\n` text (a backslash followed by the letter n) that appears in the names with an actual newline character, so that the titles of the figures break onto a new line correctly, and we create an empty DataFrame to contain the analysis results.

In [None]:
# Replacing the two-character text backslash + 'n' with a real newline character.
# regex = False makes this a plain literal replacement, so the result does not
# depend on the default value of `regex`, which differs between pandas versions.
anaf_names = anaf_names.apply(lambda x: x.str.replace('\\n', '\n', regex = False))
# An empty DataFrame that the calculations fill, one row per Section.
results = pd.DataFrame()

We drop the rows with missing weekly hours and calculate the hourly wages: we multiply the weekly hours by the weeks worked each month, and then divide the gross monthly wages by that value.

In [None]:
# Keeping only the rows in which the weekly hours are not missing.
has_weekly_hours = prat['sh_shavua'].notna()
prat = prat.loc[has_weekly_hours].copy()

# Monthly hours: the weekly hours multiplied by the weeks worked each month.
prat['sh_hodesh'] = prat['sh_shavua'].mul(prat['shavuot'])
# Hourly wage: the gross monthly wage divided by the monthly hours.
prat['i111 to hour'] = prat['i111prat'].div(prat['sh_hodesh'])

Then we create two boolean series that mark the workers who earn below the minimum wage and below 10 NIS per hour, respectively.

In [None]:
# Each flag column marks the workers whose hourly wage is below a threshold in NIS:
# 29.12 is the hourly minimum wage, and 10 marks wages far below it.
wage_thresholds = {'below min': 29.12, 'really below min': 10}
for flag_column, wage_threshold in wage_thresholds.items():
    prat[flag_column] = prat['i111 to hour'].lt(wage_threshold)

Then we create the main loop of the calculations.

In [None]:
# Grouping the workers by Economic Section once, instead of regrouping the whole
# table on every iteration. sort = False keeps the Sections in their order of first
# appearance, and rows with a missing Section code are left out of the groups.
for anaf, grouped in prat.groupby('anaf1', sort = False):
    # The size of the Section by the weighted number of employees.
    total_weight = grouped['weight'].sum()

    # Calculating the weighted share of the workers that earn below minimum wage
    # and below 10 NIS per hour, respectively.
    results.loc[anaf, 'below min'] = grouped.loc[grouped['below min'], 'weight'].sum() / total_weight
    results.loc[anaf, 'really below min'] = grouped.loc[grouped['really below min'], 'weight'].sum() / total_weight

    # Calculating median and average hourly wages.
    results.loc[anaf, 'median hourly wage'] = weighted_median(grouped['i111 to hour'], weights = grouped['weight'])
    results.loc[anaf, 'average hourly wage'] = np.average(grouped['i111 to hour'], weights = grouped['weight'])

    # Recording the size of the Section.
    results.loc[anaf, 'weight'] = total_weight

### The Histograms

First, we create a dictionary for the labels of the three figures and a DataFrame illustrating their shape.

In [None]:
# The texts of the figures. They are written so that they read correctly after
# being passed through invert(), which is why the numbers in them are reversed.
labels_dict = {
    'shah': ' ש"ח לשעה',
    'average': 'ממוצע - ',
    'median': 'חציון - ',
    'minimum': 'שכר מינימום - ',
    'title': 'התפלגות השכר השעתי ב-8102, לפי ענפים',
    'percent_text': 'מתחת ל-21.92',
    'percent_really_text': ' מתחת ל-01',
    'xtitle': 'שכר ברוטו לשעה בש"ח',
    'ytitle': 'צפיפות',
}

# The number of histograms in each figure.
fig_rows = 7
# The locations of the ticks on the x axis.
xticks_loc = list(np.arange(0, 250, 50))
# The layout of the figures: every column is one figure, and every row holds
# the Section code of the histogram drawn in that position.
fig_shape = pd.DataFrame(
    {
        'First': ['A', 'B', 'D', 'E', 'F', 'G', 'H'],
        'Second': ['I', 'J', 'K', 'L', 'M', 'N', 'O'],
        'Third': ['P', 'Q', 'R', 'S', 'T', 'U', 'X'],
    },
    index = np.arange(1, 8),
)
# The font sizes used across the figures.
font_sizes = {
    'small': 12,
    'medium': 14,
    'medium_l': 25,
    'large': 40,
}

Then we control the general font size and family.

In [None]:
# Every entry is (rc group, option, value), applied in the listed order.
rc_settings = [
    ('font', 'size', font_sizes['small']),            # Controls default text sizes
    ('axes', 'titlesize', font_sizes['medium']),      # Fontsize of the axes title
    ('axes', 'labelsize', font_sizes['medium']),      # Fontsize of the x and y labels
    ('xtick', 'labelsize', font_sizes['small']),      # Fontsize of the tick labels
    ('ytick', 'labelsize', font_sizes['small']),      # Fontsize of the tick labels
    ('legend', 'fontsize', font_sizes['small']),      # Legend fontsize
    ('figure', 'titlesize', font_sizes['large']),     # Fontsize of the figure title
    ('font', 'family', 'Alef'),                       # Font family
]
for rc_group, rc_option, rc_value in rc_settings:
    plt.rc(rc_group, **{rc_option: rc_value})

Then it's time for the main loop of the histograms creation.

In [None]:
# The folder the figures are saved to. Enter your address here and don't erase the 'r'.
fig_base_address = r''


def _annotate_share(ax, share, text_key, y_points):
    """
    A function that draws a box outside and to the right of a histogram,
    showing a share as a percentage with a description below it.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        The histogram the box belongs to.
    share : float
        The share to show, as a fraction. It is multiplied by 100 and rounded
        to two decimals.
    text_key : string
        The key in labels_dict of the text that describes the share.
    y_points : int
        The vertical position of the box, in points from the bottom of the axes.

    Returns
    -------
    None.
    """
    percent = str(np.round(share * 100, 2))
    ax.annotate('          ' +
                percent +
                '%' +
                '\n' +
                invert(labels_dict['shah']) +
                invert(labels_dict[text_key]),
                xy = (450, y_points),
                xycoords = 'axes points',
                annotation_clip = False,
                bbox = dict(boxstyle = "round4", fc = "w"))


for anaf_column in fig_shape.columns:
    fig, axs = plt.subplots(nrows = fig_rows, ncols = 1,
                            sharey = True,
                            figsize = (7,21))
    fig.dpi = 500
    fig.suptitle(invert(labels_dict['title']), y = 0.98, x = 0.7)

    # The unit text that opens every legend label of the vertical lines.
    shah_label = invert(labels_dict['shah'])

    # The for loop for each individual histogram in the figure: every axes is
    # paired with the Section code in the matching row of the figure's column.
    for ax, anaf in zip(axs, fig_shape[anaf_column]):
        # Filtering each Section and setting the specific histogram title.
        anaf_df = prat[prat['anaf1'] == anaf]
        hourly_wages = anaf_df['i111 to hour']
        survey_weights = anaf_df['weight']
        ax.set_title(invert(anaf_names.loc[anaf, 'Name']))

        # Drawing the histogram with bins of 3 NIS/hour, probability density and the weights of the survey.
        ax.hist(hourly_wages, bins = np.arange(0, 200, step = 3), density = True, weights = survey_weights, label = invert('שכר ברוטו לשעה'))

        # Drawing vertical lines for the average, median and the hourly minimum wage on the histogram.
        ax.axvline(np.average(hourly_wages, weights = survey_weights), color = 'black', label = shah_label + invert(labels_dict['average']))
        ax.axvline(weighted_median(hourly_wages, survey_weights), color = 'black', linestyle = '--', label = shah_label + invert(labels_dict['median']))
        ax.axvline(29.12, color = 'black', linestyle = ':', label = shah_label + '29.12' + invert(labels_dict['minimum']))

        # Adding the boxes outside and to the right of the histogram.
        _annotate_share(ax, results.loc[anaf, 'below min'], 'percent_text', 50)
        _annotate_share(ax, results.loc[anaf, 'really below min'], 'percent_really_text', 10)

        # Setting the labels and ticks of the x axis.
        ax.set_xticks(xticks_loc)
        ax.set_xticklabels(map(str, xticks_loc))

    # Creating the legend and adjusting the whitespace of the figure. All the
    # histograms share the same labels, so the last axes is read explicitly
    # rather than through a variable left over from the inner loop.
    handles, labels = axs[-1].get_legend_handles_labels()
    plt.subplots_adjust(hspace = 1, wspace = 1)
    fig.legend(handles, labels, loc = 'upper center', bbox_to_anchor = (0.7, 0.95))

    # Adding the x and y labels.
    fig.text(0.5, 0.1, invert(labels_dict['xtitle']), ha = 'center', fontsize = font_sizes['medium_l'])
    fig.text(-0.04, 0.5, invert(labels_dict['ytitle']), va = 'center', rotation = 'vertical', fontsize = font_sizes['medium_l'])

    # Saving the figure. The path is built with pathlib, so the separator fits
    # the operating system instead of being a hard-coded Windows backslash.
    plt.savefig(Path(fig_base_address) / (str(anaf_column) + '.png'), bbox_inches = 'tight')
    plt.show()
    plt.close(fig)


Finally, we sort the results DataFrame and export the results to a csv file. Enter your preferred address and don't erase the 'r'.

In [None]:
# The folder the results file is written to. Enter your preferred address and don't erase the 'r'.
csv_base_address = r''
# Sorting the results by their index, the Section code.
results.sort_index(inplace = True)
# Building the path with pathlib, so the separator fits the operating system
# instead of being a hard-coded Windows backslash.
results.to_csv(Path(csv_base_address) / 'results_below_min.csv')