Importing the required libraries.

In [16]:
import pandas as pd
import numpy as np

Creating a function to calculate weighted median

In [17]:
def weighted_median(data, weights, interpolate = False):
    """
    A function that calculates the weighted median of a given series of values 
    by using a series of weights.
    
    Parameters
    ----------
    data : Iterable
        The data which the function calculates the median for.
    weights : Iterable
        The weights the function uses to calculate an weighted median.

    Returns
    -------
    numpy.float64
        The function return the weighted median.
        
    Required libraries
    ---------
    Numpy.
    """
    #Forcing the data to a numpy array.
    data = np.array(data)
    weights = np.array(weights)
    
    #Sorting the data and the weights.
    ind_sorted = np.argsort(data)
    sorted_data = data[ind_sorted]
    sorted_weights = weights[ind_sorted]
   
    #Calculating the cumulative sum of the weights.
    sn = np.cumsum(sorted_weights)
    
    #Calculating the threshold.
    threshold = sorted_weights.sum()/2
   
    #Interpolating the median and returning it.
    if interpolate == False:
        return sorted_data[sn >= threshold][0]
    
    #Returning the first value that equals or larger than the threshold.
    else:
        return np.interp(0.5, (sn - 0.5 * sorted_weights) / np.sum(sorted_weights), sorted_data)
    

Creating a function to calculate standard persons according to the National Insurance Institue criteria.

In [18]:
def nefesh_btl(nefesh):
    """
    Parameters
    ----------
    nefesh : int
        The number of persons the household has.

    Returns
    -------
    Float
        The standardised number of persons in the household, 
        according to National Security Institue and the Central Bureau of Statistics definition.
    
    Required libraries
    ------------------
    None.

    """
    l = [1.25, 2, 2.65, 3.2, 3.75, 4.25, 4.75, 5.2]
    if nefesh <= len(l) - 1:
        return l[int(nefesh - 1)]
    else:
        return 5.6 + (nefesh - 9) * 0.4
    

Creating the main function for the calculations. The function get a DataFrame and a year iterator to calculate the results, and saves them to a different DataFrame.

In [19]:
def calc_poverty(df, results, year):
    """
    Parameters
    ----------
    df : DataFrame
        The DataFrame that the function uses to calculate poverty.
    results : DataFrame
        The DataFrame that the function saves the calculations to.
    year : int
        A iterator for the different survey years.

    Returns
    -------
    DataFrame
        The DataFrame which contains the results of the calculations.
    
    Required libraries
    ------------------
    Pandas,
    Numpy.

    """
    """
    Calculating average net income per household and average net income per standard person per household.
    """
        
    results.loc[str(year), 'mean_net'] = np.average(df['net'], weights = df['weight'])
    results.loc[str(year), 'mean_net_to_nefesh'] = np.average(df['net']/df['nefashot'], weights = df['weight'])
        
    """
    Calculating the four different types of income that Ariel wanted, 
    and calculating the number of persons each household represent in the general population.
    """
        
    df.loc[:, 'net_to_nefesh'] = df.loc[:, 'net'] / df.loc[:, 'nefeshstandartit']
    df.loc[:, 'bruto_to_nefesh'] = df.loc[:, 'i1kaspit'] / df.loc[:, 'nefeshstandartit']
    df.loc[:, 'total_bruto_to_nefesh'] = (df.loc[:, 'i1kaspit'] + df.loc[:, 'iinkind']) / df.loc[:, 'nefeshstandartit']
    df.loc[:, 'total_net_to_nefesh'] = df.loc[:, 'total_net'] / df.loc[:, 'nefeshstandartit']
    df.loc[:, 'weight_nefesh'] = df.loc[:, 'weight'] * df.loc[:, 'nefashot']
        
    """
    Calculating the thresholds for said types of income.
    """
        
    oni_t = {
            'net_to_nefesh': weighted_median(df['net_to_nefesh'], df['weight']) / 2,
            'bruto_to_nefesh': weighted_median(df['bruto_to_nefesh'], df['weight']) / 2,
            'total_bruto_to_nefesh': weighted_median(df['total_bruto_to_nefesh'], df['weight']) / 2,
            'total_net_to_nefesh': weighted_median(df['total_net_to_nefesh'], df['weight']) / 2
            }
        
    """
    A simple loop that save the threshold to the results DataFrame, 
    and calculate each type of poverty ratio of both households and persons.
    """
        
    for t in threshold_list:
        results.loc[str(year), 'oni_threshold_' + t] =  oni_t[t]
        results.loc[str(year), 'oni_hb_' + t] = df[df[t] < oni_t[t]]['weight'].sum() / df['weight'].sum()
        results.loc[str(year), 'oni_nefashot_' + t] = df[df[t] < oni_t[t]]['weight_nefesh'].sum() / df['weight_nefesh'].sum()
    return results.loc[str(year), :]

Creating variables for iterating the different surveys. Switch the base_address variable to your, and DON'T erase the 'r' before the address.

In [20]:
base_address = r'C:\Users\dtsj8\Google Drive (tsadeh@kohelet.org.il)\k_data\CBS Households Expenditures Survey\\'
        
file_names_exp = ['H20141022datamb', 'H20151021datamb', 'h20161023datamb', 'H20171022datamb', 'H20181022datamb']
folders = ['famexp_2014\\', 'famexp_2015\\', 'famexp_2016_new\\', 'famexp_2017_new\\', 'famexp_2018_new\\']

oni_type_list = ['oni_threshold_','oni_hb', 'oni_nefashot']
threshold_list = ['net_to_nefesh','bruto_to_nefesh','total_bruto_to_nefesh','total_net_to_nefesh']

results_lo_haredim = pd.DataFrame(index = map(str, list(np.arange(2014, 2019))))
results_arabs = pd.DataFrame(index = map(str, list(np.arange(2014, 2019))))
results_haredim = pd.DataFrame(index = map(str, list(np.arange(2014, 2019))))

The main loop, which imports the relevant survey and then makes the calculations on the three different populations.

In [21]:
for year, file_name, folder in zip(np.arange(2014, 2019), file_names_exp, folders):
        df = pd.read_csv(base_address + folder  + file_name + '.csv')
        
        #Not Haredi Jews DataFrame.
        df_lo_haredim = df[(df['Nationality'] == 1) & (df['RamatDatiyut'] != 4)]
        
        #Haredi Jews DataFrame.
        df_haredim = df[(df['Nationality'] == 1) & (df['RamatDatiyut'] == 4)]
        
        #Arabs DataFrame.
        df_arabs = df[(df['Nationality'] == 2)]
        
        #
        results_lo_haredim.loc[str(year), :] = calc_poverty(df_lo_haredim, results_lo_haredim, year)
        results_haredim.loc[str(year), :] = calc_poverty(df_haredim, results_haredim, year)
        results_arabs.loc[str(year), :] = calc_poverty(df_arabs, results_arabs, year)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


Exporting the results. switch the address to yours and DON'T delete the 'r' before the address.

In [22]:
results_folder = r'C:\Users\dtsj8\OneDrive\Documents\Work\Poverty Calculations\\'
results_lo_haredim.to_csv(results_folder + 'lo haredim.csv')
results_haredim.to_csv(results_folder + 'haredim.csv')
results_arabs.to_csv(results_folder + 'arabs.csv')