Importing the required libraries.

In [None]:
import pandas as pd
import numpy as np

Creating a function to calculate the weighted median of a given series of values and weights.

In [None]:
def weighted_median(data, weights, interpolate = True):
    """
    Calculate the weighted median of a series of values.

    Parameters
    ----------
    data : Iterable
        The values for which the median is calculated. Multi-dimensional
        input is flattened.
    weights : Iterable
        One weight per value in ``data``, in the same order. Weights are
        expected to be non-negative; the cumulative weights must be
        non-decreasing for the result to be meaningful.
    interpolate : bool, default True
        If True, the median is linearly interpolated: every sorted value is
        placed at the midpoint of its own weight interval on the normalized
        cumulative-weight axis, and the value at 0.5 is read off.
        If False, the first sorted value whose cumulative weight reaches
        half of the total weight is returned as is.

    Returns
    -------
    numpy.float64
        The weighted median. With ``interpolate=False`` the returned element
        comes straight from ``data`` and therefore keeps its dtype.

    Raises
    ------
    ValueError
        If ``data`` and ``weights`` do not hold the same number of elements,
        if they are empty, or if the total weight is not positive.

    Required libraries
    ---------
    Numpy.
    """
    # Viewing the inputs as flat numpy arrays (no copy is needed, nothing is mutated).
    values = np.asarray(data).ravel()
    value_weights = np.asarray(weights).ravel()

    # Without this check, a longer weights sequence would be silently truncated
    # by the fancy indexing below and produce a wrong median.
    if values.size != value_weights.size:
        raise ValueError(
            "data and weights must have the same number of elements "
            "(got %d and %d)" % (values.size, value_weights.size)
        )
    if values.size == 0:
        raise ValueError("cannot calculate the weighted median of empty data")

    # Sorting the data and carrying the weights along in the same order.
    order = np.argsort(values)
    sorted_data = values[order]
    sorted_weights = value_weights[order]

    # Cumulative weight up to and including each sorted value.
    cumulative = np.cumsum(sorted_weights)

    total_weight = sorted_weights.sum()
    if total_weight <= 0:
        raise ValueError("the total weight must be positive")

    # Returning the first value whose cumulative weight equals or exceeds
    # half of the total weight.
    if not interpolate:
        return sorted_data[cumulative >= total_weight / 2][0]

    # Interpolating the median: each value sits at the centre of its weight
    # interval on the normalized cumulative-weight axis.
    return np.interp(0.5, (cumulative - 0.5 * sorted_weights) / total_weight, sorted_data)


Creating the lists of file names to iterate over. Replace the path in the 'base_address' variable with your own. DON'T erase the 'r' before the opening quote.

In [None]:
# Root folder holding the survey files. Replace it with your own location
# and keep the leading r (raw string) so the backslashes are taken literally.
base_address = r'C:\Users\User\Google Drive\k_data\CBS Households Expenditures Survey'

# One sub-folder per survey year, in year order.
folder_names = [
    '\\famexp_2010',
    '\\famexp_2011',
    '\\famexp_2012',
    '\\famexp_2013',
]

# Household ("mb") files; only two are listed.
mb_file_names = [
    '\\H20101022datamb.csv',
    '\\H20111021datamb.csv',
]

# Individual ("prat") files, one per entry in folder_names.
prat_file_names = [
    '\\H20101022dataprat.csv',
    '\\H20111021dataprat.csv',
    '\\H20121022dataprat.csv',
    '\\H20131021dataprat.csv',
]

# Empty results table indexed by survey year; columns are added later.
results = pd.DataFrame(index = [2010, 2011, 2012, 2013])


The loop that calculates the weighted mean and median for each year.

In [None]:
for year, folder, prat_file in zip(results.index, folder_names, prat_file_names):
    # Individual-level ("prat") records of the current survey year.
    df_prat = pd.read_csv(base_address + folder + prat_file)

    # For 2010 and 2011 the weight is taken from the 'mishkal' column of the
    # household ("mb") file. For the other years the code below reads the
    # 'weight' column straight from the prat file.
    if year in [2010, 2011]:
        df_mb = pd.read_csv(base_address + folder + mb_file_names[year - 2010], index_col = 'misparmb')
        if 'misparmb' in df_prat.columns:
            # Using the household's weight for every individual of that household.
            # NOTE(review): assumes 'misparmb' in the prat file is the household id
            # matching the index of the mb file, and that it is unique there - confirm.
            # Individuals without a matching household keep weight 0.
            df_prat['weight'] = df_prat['misparmb'].map(df_mb['mishkal']).fillna(0)
        else:
            # Fallback: pair households and individuals by row position.
            # Rows beyond the shorter of the two files keep weight 0.
            # NOTE(review): positional pairing is only right if both files hold
            # one row per household in the same order - confirm.
            n_paired = min(len(df_mb), len(df_prat))
            paired_weights = np.zeros(len(df_prat))
            paired_weights[:n_paired] = df_mb['mishkal'].to_numpy()[:n_paired]
            df_prat['weight'] = paired_weights

    yeshiva = df_prat[df_prat['l_school'] == 10] #Counting only individuals that their last school was a Kolel or Yeshiva.
    men_yeshiva = yeshiva[yeshiva['min'] == 1] #Counting only men.
    # The age filter must start from men_yeshiva; filtering from yeshiva would
    # silently drop the men-only restriction.
    men_yeshiva_25_64 = men_yeshiva[(men_yeshiva['gil'] >= 25) & (men_yeshiva['gil'] < 65)] #Counting only ages 25-64.
    men_yeshiva_25_64 = men_yeshiva_25_64[men_yeshiva_25_64['i111prat'] > 0] #Counting only those with wages.

    results.loc[year, 'Average'] = np.average(men_yeshiva_25_64['i111prat'], weights = men_yeshiva_25_64['weight']) #Calculating the mean.
    results.loc[year, 'Median'] = weighted_median(men_yeshiva_25_64['i111prat'], men_yeshiva_25_64['weight']) #Calculating the median.

Saving the results. Insert your address here to save the file.

In [None]:
# Destination folder of the results file; put your own path between the
# quotes and keep the leading r (raw string).
address = r''

# The file name is appended to the folder with a Windows path separator.
output_path = address + '\\reuslts haredim men.csv'
results.to_csv(output_path)