In [1]:
import pandas as pd
import numpy as np

In [13]:
import pandas as pd

def coarsen_data(data, coarsening_rule):
  """Coarsens the data according to the given coarsening rule.

  Args:
    data: The data to be coarsened.
    coarsening_rule: A dictionary that maps variable names to lists of values that should be grouped together.

  Returns:
    The coarsened data.
  """


  for variable, values in coarsening_rule.items():
    data[variable] = data[variable].replace(values, 1)

  return data

def match_data(data, treated_index, control_index):
  """Matches the data according to the given treatment and control indexes.

  Args:
    data: The data to be matched.
    treated_index: The index of the treated observations.
    control_index: The index of the control observations.

  Returns:
    A list of matched pairs.
  """

  matches = []
  for treated_id in treated_index:
    closest_control_id = min(control_index, key=lambda control_id: distance(data.loc[treated_id], data.loc[control_id]))
    matches.append((treated_id, closest_control_id))

  return matches

def distance(x, y):
  """Computes the distance between two data points.

  Args:
    x: The first data point.
    y: The second data point.

  Returns:
    The distance between x and y.
  """

  distance = 0
  for variable in x.columns:
    distance += (x[variable] - y[variable])**2

  return np.sqrt(distance)


def get_strata_and_weights(data, treatment, control, coarsening_rule):
  """Gets the strata number and weights for each matched datapoint.

  Args:
    data: The data to be matched.
    treatment: The name of the treatment variable.
    control: The name of the control variable.
    coarsening_rule: A dictionary that maps variable names to lists of values that should be grouped together.

  Returns:
    A pandas DataFrame with the following columns:
      * strata_number: The strata number for each matched datapoint.
      * weight: The weight for each matched datapoint.
  """

  # Coarsen the data.
  coarsened_data = coarsen_data(data, coarsening_rule)

  # Get the indexes of the treated and control observations.
  treated_index = coarsened_data[treatment] == 1
  control_index = coarsened_data[control] == 0

  # Match the data.
  matches = match_data(coarsened_data, treated_index, control_index)

  # Compute the strata number and weight for each matched datapoint.
  strata_numbers = []
  weights = []
  for treated_id, control_id in matches:
    strata_number = coarsened_data.loc[treated_id].name
    weight = 1 / distance(coarsened_data.loc[treated_id], coarsened_data.loc[control_id])
    strata_numbers.append(strata_number)
    weights.append(weight)

  # Create a pandas DataFrame with the strata number and weight for each matched datapoint.
  df = pd.DataFrame({"strata_number": strata_numbers, "weight": weights})

  return df


def cem(data, treatment, control, coarsening_rule):
  """Performs Coarsened Exact Matching on the given data.

  Args:
    data: The data to be matched.
    treatment: The name of the treatment variable.
    control: The name of the control variable.
    coarsening_rule: A dictionary that maps variable names to lists of values that should be grouped together.

  Returns:
    A list of matched pairs.
  """

  # Coarsen the data.
  coarsened_data = coarsen_data(data, coarsening_rule)

  # Get the indexes of the treated and control observations.
  treated_index = coarsened_data[treatment] == 1
  control_index = coarsened_data[control] == 0

  # Match the data.
  matches = match_data(coarsened_data, treated_index, control_index)

  return matches



KeyError: 'True: boolean label can not be used without a boolean index'

In [16]:
def coarsen_data_cont(df, thresholds,column):
    thresholds.sort()
  
    for i in range(len(thresholds)):
        threshold = thresholds[i]
        label = i + 1
        df.loc[(df[column] <= threshold) & (df[column] > thresholds[i - 1] if i > 0 else True), column] = label
        if i == len(thresholds) - 1:
            df.loc[df[column] > threshold, column] = len(thresholds) + 1
        
    return df

data = pd.DataFrame()
data['age'] = [10,12,15,18,20,30,35]
data['name'] = ['s1','s2','s3','s1','s1','s1','s2']
new = coarsen_data_cont(data,[10,20,30],'age')

new['name'] = new['name'].astype('category')

print(new)
print(new.loc[5].name)

   age name
0    1   s1
1    2   s2
2    2   s3
3    2   s1
4    2   s1
5    3   s1
6    4   s2
5


In [None]:
import pandas as pd


def coarsen_data_cont(df, thresholds,column):
    thresholds.sort()
  
    for i in range(len(thresholds)):
        threshold = thresholds[i]
        label = i + 1
        df.loc[(df[column] <= threshold) & (df[column] > thresholds[i - 1] if i > 0 else True), column] = label
        if i == len(thresholds) - 1:
            df.loc[df[column] > threshold, column] = len(thresholds) + 1
        
    return df

def coarsen_data(data, coarsening_rule):
  """Coarsens the data according to the given coarsening rule.

  Args:
    data: The data to be coarsened.
    coarsening_rule: A dictionary that maps variable names to lists of values that should be grouped together.

  Returns:
    The coarsened data.
  """


  for variable, values in coarsening_rule.items():
    data[variable] = data[variable].replace(values, 1)

  return data

def match_data(data, treated_index, control_index):
  """Matches the data according to the given treatment and control indexes.

  Args:
    data: The data to be matched.
    treated_index: The index of the treated observations.
    control_index: The index of the control observations.

  Returns:
    A list of matched pairs.
  """

  matches = []
  for treated_id in treated_index:
    closest_control_id = min(control_index, key=lambda control_id: distance(data.loc[treated_id], data.loc[control_id]))
    matches.append((treated_id, closest_control_id))

  return matches

def distance(x, y):
  """Computes the distance between two data points.

  Args:
    x: The first data point.
    y: The second data point.

  Returns:
    The distance between x and y.
  """

  distance = 0
  for variable in x.columns:
    distance += (x[variable] - y[variable])**2

  return np.sqrt(distance)


def get_strata_and_weights(data, treatment, coarsening_rule):
  """Gets the strata number and weights for each matched datapoint.

  Args:
    data: The data to be matched.
    treatment: The name of the treatment variable.
    control: The name of the control variable.
    coarsening_rule: A dictionary that maps variable names to lists of values that should be grouped together.

  Returns:
    A pandas DataFrame with the following columns:
      * strata_number: The strata number for each matched datapoint.
      * weight: The weight for each matched datapoint.
  """

  # Coarsen the data.
  coarsened_data = coarsen_data(data, coarsening_rule)

  # Get the indexes of the treated and control observations.
  treated_index = coarsened_data[treatment] == 1
  control_index = coarsened_data[treatment] == 0

  # Match the data.
  matches = match_data(coarsened_data, treated_index, control_index)

  # Compute the strata number and weight for each matched datapoint.
  strata_numbers = []
  weights = []
  for treated_id, control_id in matches:
    strata_number = coarsened_data.loc[treated_id].name
    weight = 1 / distance(coarsened_data.loc[treated_id], coarsened_data.loc[control_id])
    strata_numbers.append(strata_number)
    weights.append(weight)

  # Create a pandas DataFrame with the strata number and weight for each matched datapoint.
  df = pd.DataFrame({"strata_number": strata_numbers, "weight": weights})

  return df

In [3]:
import pandas as pd


def coarsen_data_cont(df, thresholds,column):
    thresholds.sort()
  
    for i in range(len(thresholds)):
        threshold = thresholds[i]
        label = i + 1
        df.loc[(df[column] <= threshold) & (df[column] > thresholds[i - 1] if i > 0 else True), column] = label
        if i == len(thresholds) - 1:
            df.loc[df[column] > threshold, column] = len(thresholds) + 1
        
    return df

def match_data(data, treated_index, control_index, minval):
  """Matches the data according to the given treatment and control indexes.

  Args:
    data: The data to be matched.
    treated_index: The index of the treated observations.
    control_index: The index of the control observations.

  Returns:
    A list of matched pairs.
  """

  matches = []
  for treated_id in treated_index:
    closest_control_id = min(control_index, key=lambda control_id: distance(data.loc[treated_id], data.loc[control_id]))
    if distance(data.loc[treated_id],data.loc[closest_control_id]) < minval:
        matches.append((treated_id, closest_control_id))

  return matches

def distance(x, y): 
  """Computes the distance between two data points.

  Args:
    x: The first data point.
    y: The second data point.

  Returns:
    The distance between x and y.
  """

  distance = 0
  for variable in x.columns:
    distance += (x[variable] - y[variable])**2

  return np.sqrt(distance)


def get_strata_and_weights(data, treatment):
    
  """Gets the strata number and weights for each matched datapoint.

  Args:
    data: The data to be matched.
    treatment: The name of the treatment variable.
    control: The name of the control variable.
    coarsening_rule: A dictionary that maps variable names to lists of values that should be grouped together.

  Returns:
    A pandas DataFrame with the following columns:
      * strata_number: The strata number for each matched datapoint.
      * weight: The weight for each matched datapoint.
  """

  # Coarsen the data.
   coarsened_data = coarsen_data_cont(data, [10,20,30],'age')

  # Get the indexes of the treated and control observations.
   treated_index = coarsened_data[treatment] == 1
 
   control_index = coarsened_data[treatment] == 0

  # Match the data.
   matches = match_data(coarsened_data, treated_index, control_index, minval = 1)

  # Compute the strata number and weight for each matched datapoint.
   strata_numbers = []
   weights = []
   for treated_id, control_id in matches:
     strata_number = coarsened_data.loc[treated_id].name
    weight = 1 / distance(coarsened_data.loc[treated_id], coarsened_data.loc[control_id])
    strata_numbers.append(strata_number)
    weights.append(weight)

  # Create a pandas DataFrame with the strata number and weight for each matched datapoint.
  df = pd.DataFrame({"strata_number": strata_numbers, "weight": weights})

  return df


data = pd.DataFrame()
data['age'] = [10,12,15,18,20,30,35]
data['name'] = [0,1,2,0,0,0,1]
data['treatment'] = [0,1,1,0,1,1,1]

d = get_strata_and_weights(data, 'treatment')

KeyError: 'False: boolean label can not be used without a boolean index'

In [4]:
a = pd.DataFrame()
a['name'] = [1,2,1,4,5,6]
a['age'] = [10,12,10,14,15,16]
a['class'] = [1,2,1,1,5,6]
print(a)

   name  age  class
0     1   10      1
1     2   12      2
2     1   10      1
3     4   14      1
4     5   15      5
5     6   16      6


In [11]:
b = a.drop_duplicates()
b

Unnamed: 0,name,age,class
0,1,10,1
1,2,12,2
3,4,14,1
4,5,15,5
5,6,16,6


In [17]:
from collections import defaultdict
mydict = defaultdict(list)
cnt = 1
for i, tup in enumerate(b.iterrows()):
    _, x_row = tup
    #print(x_row)
    mydict[tuple(x_row)] = cnt
    cnt += 1

print(mydict)

defaultdict(<class 'list'>, {(1, 10, 1): 1, (2, 12, 2): 2, (4, 14, 1): 3, (5, 15, 5): 4, (6, 16, 6): 5})


In [22]:
c = pd.DataFrame()
c['name'] = [1,2,1,4,5,6,4]
c['age'] = [10,12,10,14,15,16,19]
c['class'] = [1,2,1,1,5,6,12]
#c['gender'] = [1,0,0,1,1,1]
#print(c)
for i, tup in enumerate(c.iterrows()):
    _, x_row = tup
    if tuple(x_row) in mydict:
        print(mydict[tuple(x_row)])
    else:
        print("none")

1
2
1
3
4
5
none


In [29]:
c = pd.DataFrame()
c['name'] = [1,2,1,4,5,6,4]
c['age'] = [10,12,10,14,15,16,19]
c['class'] = [1,2,1,1,5,6,12]
c['gender'] = [1,0,0,1,1,1,0]
print(c)
d = c[['name','age']]
print(d)

   name  age  class  gender
0     1   10      1       1
1     2   12      2       0
2     1   10      1       0
3     4   14      1       1
4     5   15      5       1
5     6   16      6       1
6     4   19     12       0
   name  age
0     1   10
1     2   12
2     1   10
3     4   14
4     5   15
5     6   16
6     4   19


In [35]:
def coarsen_data_cont(df, thresholds,column):
    thresholds.sort()
  
    for i in range(len(thresholds)):
        threshold = thresholds[i]
        label = i + 1
        df.loc[(df[column] <= threshold) & (df[column] > thresholds[i - 1] if i > 0 else True), column] = label
        if i == len(thresholds) - 1:
            df.loc[df[column] > threshold, column] = len(thresholds) + 1
        
    return df

def match_data(data,treatment,control_var,thresholds1,column1):
    newdf = coarsen_data_cont(data, thresholds1,column1) ##coarsen variable
    
    ##get unique control tuples
    con_data = data[control_var] #control_var is a list of control variables, other than treatment,doi and gender
    un_con = con_data.drop_duplicates()
    
    ##create map of strata for unique control tuples
    mydict = defaultdict(list)
    cnt = 1
    for i, tup in enumerate(un_con.iterrows()):
        _, x_row = tup
        #print(x_row)
        mydict[tuple(x_row)] = cnt
        cnt += 1
    
    
    new_data = data[control_var]
    strata = []
    
    for i, tup in enumerate(new_data.iterrows()):
        _, x_row = tup
        if tuple(x_row) in mydict:
            strata.append(mydict[tuple(x_row)])
        else:
            strata.append(-1)
    data['strata'] = strata
    
    s = data['strata'].unique()
    
    new = data.copy()
    
    for i in s:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == 1]
        t2 = x[x[treatment] == 0]
        
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]
    
    return new


c = pd.DataFrame()
c['name'] = [1,2,1,4,5,6,4]
c['age'] = [10,15,10,24,35,41,8]
c['class'] = [1,2,1,1,5,6,12]
c['gender'] = [1,0,0,1,1,1,0]

new = match_data(data = c,treatment="gender",control_var=['name','age','class'],thresholds1=[10,20,30],column1='age')

In [36]:
print(new)

   name  age  class  gender  strata
0     1    1      1       1       1
2     1    1      1       0       1


In [6]:
from collections import defaultdict
def coarsen_data_cont(df, thresholds,column, flag):
    if flag == 1:
        return df
    thresholds.sort()
  
    for i in range(len(thresholds)):
        threshold = thresholds[i]
        label = i + 1
        df.loc[(df[column] <= threshold) & (df[column] > thresholds[i - 1] if i > 0 else True), column] = label
        if i == len(thresholds) - 1:
            df.loc[df[column] > threshold, column] = len(thresholds) + 1
        
    return df

def match_data(data,treatment,control_var,thresholds1,column1):
    newdf = coarsen_data_cont(data, thresholds1,column1, flag = 1) ##coarsen variable
    
    ##get unique control tuples
    con_data = data[control_var] #control_var is a list of control variables, other than treatment,doi and gender
    un_con = con_data.drop_duplicates()
    
    ##create map of strata for unique control tuples
    mydict = defaultdict(list)
    cnt = 1
    for i, tup in enumerate(un_con.iterrows()):
        _, x_row = tup
        #print(x_row)
        mydict[tuple(x_row)] = cnt
        cnt += 1
    
    
    new_data = data[control_var]
    strata = []
    
    for i, tup in enumerate(new_data.iterrows()):
        _, x_row = tup
        if tuple(x_row) in mydict:
            strata.append(mydict[tuple(x_row)])
        else:
            strata.append(-1)
    data['strata'] = strata
    
    s = data['strata'].unique()
    
    new = data.copy()
    
    '''for i in s:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == 1]
        t2 = x[x[treatment] == 0]
        
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]'''
    
    for ind in new.index:
        st = new['strata'][ind]
        x = new[new['strata'] == st]
        t1 = x[x[treatment] == 1]
        t2 = x[x[treatment] == 0]
        
        if len(t1) == 0 or len(t2) == 0:
            new['strata'][ind] = -1
    
    weight = []
    for ind in new.index:
        s = new['strata'][ind]
        if new['strata'][ind] == -1:
            weight.append(0)
        else:
            if new[treatment][ind] == 1:
                weight.append(1)
            else:
                ns = new[new['strata'] == s]
                mts = ns[ns[treatment] == 1]
                mcs = ns[ns[treatment] == 0]
                
                t = new[new['strata'] != -1]
                
                mt = t[t[treatment] == 1]
                mc = t[t[treatment] == 0]
                
                weight.append((len(mc)*len(mts))/(len(mt)*len(mcs)))
    new['weight'] = weight
    return new


tst = pd.DataFrame()
tst['age'] = [5,5,5,10,10,10,15,20,25,30,100,200,5,5]
tst['dummy'] = [0,0,1,1,1,1,0,1,0,1,0,1,0,1]
tst['name'] = ['s','s','s','m','s','m','a','a','b','b','e','f','x','x']
#tst['name'] = [9,9,9,10,9,10,11,11,12,12,14,15]
tst['class'] = [1,2,3,4,5,6,7,8,9,10,40, 80,90,100]

new = match_data(data = tst,treatment="dummy",control_var=['name','age'],thresholds1=[10,20,30],column1='age')
print(new)

    age  dummy name  class  strata  weight
0     5      0    s      1       1    0.75
1     5      0    s      2       1    0.75
2     5      1    s      3       1    1.00
3    10      1    m      4      -1    0.00
4    10      1    s      5      -1    0.00
5    10      1    m      6      -1    0.00
6    15      0    a      7      -1    0.00
7    20      1    a      8      -1    0.00
8    25      0    b      9      -1    0.00
9    30      1    b     10      -1    0.00
10  100      0    e     40      -1    0.00
11  200      1    f     80      -1    0.00
12    5      0    x     90      10    1.50
13    5      1    x    100      10    1.00


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new['strata'][ind] = -1


In [3]:
from collections import defaultdict


def match_data(data,treatment,control_var):

    
    ##get unique control tuples
    con_data = data[control_var] #control_var is a list of control variables, other than treatment,doi and gender
    un_con = con_data.drop_duplicates()
    
    ##create map of strata for unique control tuples
    mydict = defaultdict(list)
    cnt = 1
    for i, tup in enumerate(un_con.iterrows()):
        _, x_row = tup
        #print(x_row)
        mydict[tuple(x_row)] = cnt
        cnt += 1
    
    
    new_data = data[control_var]
    strata = []
    
    for i, tup in enumerate(new_data.iterrows()):
        _, x_row = tup
        if tuple(x_row) in mydict:
            strata.append(mydict[tuple(x_row)])
        else:
            strata.append(-1)
            print("oh no")
    data['strata'] = strata
    
    s = data['strata'].unique()
    
    new = data.copy()
    
    '''for i in s:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == 1]
        t2 = x[x[treatment] == 0]
        
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]'''
    
    
    unique_strata = new['strata'].unique()
    for i in unique_strata:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == '1']
        t2 = x[x[treatment] == '0']
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]
        
    
    
    '''for ind in new.index:
        st = new['strata'][ind]
        x = new[new['strata'] == st]
        t1 = x[x[treatment] == '1']
        t2 = x[x[treatment] == '0']
        
        if len(t1) == 0 or len(t2) == 0:
            new['strata'][ind] = -1
    new = new[new['strata'] != -1]
    print("len without -1 ",len(new))'''
    
    weight = []
    for ind in new.index:
        s = new['strata'][ind]
        
        if new[treatment][ind] == '1':
            weight.append(1)
        else:
            ns = new[new['strata'] == s]
            mts = ns[ns[treatment] == '1']
            mcs = ns[ns[treatment] == '0']
                
            #t = new[new['strata'] != -1]
                
            mt = new[new[treatment] == '1']
            mc = new[new[treatment] == '0']
                
            weight.append((len(mc)*len(mts))/(len(mt)*len(mcs)))
    new['weight'] = weight
    return new
tst = pd.DataFrame()
tst['age'] = [5,5,5,10,10,10,15,20,25,30,100,200,5,5]
tst['dummy'] = ['0','0','1','1','1','1','0','1','0','1','0','1','0','1']
tst['name'] = ['s','s','s','m','s','m','a','a','b','b','e','f','x','x']
#tst['name'] = [9,9,9,10,9,10,11,11,12,12,14,15]
tst['class'] = [1,2,3,4,5,6,7,8,9,10,40, 80,90,100]

new = match_data(data = tst,treatment="dummy",control_var=['name','age'])
print(new)

    age dummy name  class  strata  weight
0     5     0    s      1       1    0.75
1     5     0    s      2       1    0.75
2     5     1    s      3       1    1.00
12    5     0    x     90      10    1.50
13    5     1    x    100      10    1.00


In [34]:
from collections import defaultdict


def match_data(data,treatment,control_var):

    
    ##get unique control tuples
    con_data = data[control_var] #control_var is a list of control variables, other than treatment,doi and gender
    un_con = con_data.drop_duplicates()
    
    ##create map of strata for unique control tuples
    mydict = defaultdict(list)
    cnt = 1
    '''for i, tup in enumerate(un_con.iterrows()):
        _, x_row = tup
        #print(x_row)
        mydict[tuple(x_row)] = cnt
        cnt += 1'''
    
    #print(mydict)
    new_data = data[control_var]
    strata = []
    
    for i, tup in enumerate(new_data.iterrows()):
        _, x_row = tup
        if tuple(x_row) in mydict:
            strata.append(mydict[tuple(x_row)])
        else:
            mydict[tuple(x_row)] = cnt
            cnt += 1
            strata.append(mydict[tuple(x_row)])
    data['strata'] = strata
    
    
    
    
    value_counts = data["strata"].value_counts()

# Create a list of the values that have only one instance
    single_instances = value_counts[value_counts == 1].index

# Filter the DataFrame to only include rows where the "dog" value is not in the list of single_instances
    data = data[~data["strata"].isin(single_instances)]

    print(data)
    
    s = data['strata'].unique()
    
    new = data.copy()
    
    '''for i in s:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == 1]
        t2 = x[x[treatment] == 0]
        
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]'''
    
    st_tr = dict()
    st_con = dict()
    
    unique_strata = new['strata'].unique()
    for i in unique_strata:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == '1']
        t2 = x[x[treatment] == '0']
        
        st_tr[i] = len(t1)
        st_con[i] = len(t2)
        
        ##i can save this value for further calculation
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]
        
    
    
    '''for ind in new.index:
        st = new['strata'][ind]
        x = new[new['strata'] == st]
        t1 = x[x[treatment] == '1']
        t2 = x[x[treatment] == '0']
        
        if len(t1) == 0 or len(t2) == 0:
            new['strata'][ind] = -1
    new = new[new['strata'] != -1]
    print("len without -1 ",len(new))'''
    
    
    
    mt = new[new[treatment] == '1']
    mc = new[new[treatment] == '0']

    new['weight'] = new['strata'].apply(lambda x: (len(mc)*st_tr[x]) / (len(mt)*st_con[x]))
    #new['weight'] = new[treatment].apply(lambda x: 1.00 if x == '1')
    row_index = new[new[treatment] == '1'].index

    # Replace all the values in the age column where the row index is in row_index with 1
    new.loc[row_index, "weight"] = 1.00
    
    
    
    
    '''weight = []
    for ind in new.index:
        s = new['strata'][ind]
        
        if new[treatment][ind] == '1':
            weight.append(1)
        else:
            ns = new[new['strata'] == s]
            mts = ns[ns[treatment] == '1']
            mcs = ns[ns[treatment] == '0']
                
            #t = new[new['strata'] != -1]
                
            
                
            weight.append((len(mc)*len(mts))/(len(mt)*len(mcs)))
    new['weight'] = weight'''
    return new
tst = pd.DataFrame()
tst['age'] = [5,5,5,10,10,10,15,20,25,30,100,200,5,5]
tst['dummy'] = ['0','0','1','1','1','1','0','1','0','1','0','1','0','1']
tst['name'] = ['s','s','s','m','s','m','a','a','b','b','e','f','x','x']
#tst['name'] = [9,9,9,10,9,10,11,11,12,12,14,15]
tst['class'] = [1,2,3,4,5,6,7,8,9,10,40, 80,90,100]

new = match_data(data = tst,treatment="dummy",control_var=['name','age'])
print(new)

    age dummy name  class  strata
0     5     0    s      1       1
1     5     0    s      2       1
2     5     1    s      3       1
3    10     1    m      4       2
5    10     1    m      6       2
12    5     0    x     90      10
13    5     1    x    100      10
    age dummy name  class  strata  weight
0     5     0    s      1       1    0.75
1     5     0    s      2       1    0.75
2     5     1    s      3       1    1.00
12    5     0    x     90      10    1.50
13    5     1    x    100      10    1.00


In [17]:
from collections import defaultdict


def match_data(data,treatment,control_var):

    
    ##get unique control tuples
    con_data = data[control_var] #control_var is a list of control variables, other than treatment,doi and gender
    un_con = con_data.drop_duplicates()
    
    ##create map of strata for unique control tuples
    mydict = defaultdict(list)
    cnt = 1
    '''for i, tup in enumerate(un_con.iterrows()):
        _, x_row = tup
        #print(x_row)
        mydict[tuple(x_row)] = cnt
        cnt += 1'''
    
    #print(mydict)
    new_data = data[control_var]
    strata = []
    
    for i, tup in enumerate(new_data.iterrows()):
        _, x_row = tup
        if tuple(x_row) in mydict:
            strata.append(mydict[tuple(x_row)])
        else:
            mydict[tuple(x_row)] = cnt
            cnt += 1
            strata.append(mydict[tuple(x_row)])
    data['strata'] = strata
    
    
    
    
    value_counts = data["strata"].value_counts()

# Create a list of the values that have only one instance
    single_instances = value_counts[value_counts == 1].index

# Filter the DataFrame to only include rows where the "dog" value is not in the list of single_instances
    data = data[~data["strata"].isin(single_instances)]
    
    
    data["st_tr"] = data.groupby("strata")[treatment].transform(lambda x: len(x[x == '1']))

    data["st_con"] = data.groupby("strata")[treatment].transform(lambda x: len(x[x == '0']))
    
    data = data.loc[(data["st_tr"] != 0) & (data["st_con"] != 0)]
    
    st_tr = dict(zip(data['strata'], data['st_tr']))
    st_con = dict(zip(data['strata'], data['st_con']))
    
    new = data.copy()
    
    
    

    '''print(data)
    
    s = data['strata'].unique()
    
    new = data.copy()
    
    
    st_tr = dict()
    st_con = dict()
    
    unique_strata = new['strata'].unique()
    for i in unique_strata:
        x = new[new['strata'] == i]
        t1 = x[x[treatment] == '1']
        t2 = x[x[treatment] == '0']
        
        st_tr[i] = len(t1)
        st_con[i] = len(t2)
        
        ##i can save this value for further calculation
        if len(t1) == 0 or len(t2) == 0:
            new = new[new['strata'] != i]'''
        
    
    
    '''for ind in new.index:
        st = new['strata'][ind]
        x = new[new['strata'] == st]
        t1 = x[x[treatment] == '1']
        t2 = x[x[treatment] == '0']
        
        if len(t1) == 0 or len(t2) == 0:
            new['strata'][ind] = -1
    new = new[new['strata'] != -1]
    print("len without -1 ",len(new))'''
    
    
    
    mt = new[new[treatment] == '1']
    mc = new[new[treatment] == '0']

    new['weight'] = new['strata'].apply(lambda x: (len(mc)*st_tr[x]) / (len(mt)*st_con[x]))
    #new['weight'] = new[treatment].apply(lambda x: 1.00 if x == '1')
    row_index = new[new[treatment] == '1'].index

    # Replace all the values in the age column where the row index is in row_index with 1
    new.loc[row_index, "weight"] = 1.00
    
    
    
    
    '''weight = []
    for ind in new.index:
        s = new['strata'][ind]
        
        if new[treatment][ind] == '1':
            weight.append(1)
        else:
            ns = new[new['strata'] == s]
            mts = ns[ns[treatment] == '1']
            mcs = ns[ns[treatment] == '0']
                
            #t = new[new['strata'] != -1]
                
            
                
            weight.append((len(mc)*len(mts))/(len(mt)*len(mcs)))
    new['weight'] = weight'''
    return new
tst = pd.DataFrame()
tst['age'] = [5,5,5,10,10,10,15,20,25,30,100,200,5,5]
tst['dummy'] = ['0','0','1','1','1','1','0','1','0','1','0','1','0','1']
tst['name'] = ['s','s','s','m','s','m','a','a','b','b','e','f','x','x']
#tst['name'] = [9,9,9,10,9,10,11,11,12,12,14,15]
tst['class'] = [1,2,3,4,5,6,7,8,9,10,40, 80,90,100]

new = match_data(data = tst,treatment="dummy",control_var=['name','age'])
print(new)

    age dummy name  class  strata  st_tr  st_con  weight
0     5     0    s      1       1      1       2    0.75
1     5     0    s      2       1      1       2    0.75
2     5     1    s      3       1      1       2    1.00
12    5     0    x     90      10      1       1    1.50
13    5     1    x    100      10      1       1    1.00


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["st_tr"] = data.groupby("strata")[treatment].transform(lambda x: len(x[x == '1']))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data["st_con"] = data.groupby("strata")[treatment].transform(lambda x: len(x[x == '0']))


In [10]:
# Create a DataFrame
df = pd.DataFrame({"name": ["Alice", "Bob", "Chris", "Alice", "Ferri", "Bob2", "Dave2", "Ferri","Tom","Ron"], "dog": ["husky", "husky", "pom", "malamute", "malamute", "corgi", "corgi", "pom","harry","harry"]})

# Check the number of instances of each value in the "dog" column
value_counts = df["dog"].value_counts()

# Create a list of the values that have only one instance
single_instances = value_counts[value_counts == 1].index

# Filter the DataFrame to only include rows where the "dog" value is not in the list of single_instances
filtered_df = df[~df["dog"].isin(single_instances)]

# Print the filtered DataFrame
print(filtered_df)

    name       dog
0  Alice     husky
1    Bob     husky
2  Chris       pom
3  Alice  malamute
4  Ferri  malamute
5   Bob2     corgi
6  Dave2     corgi
7  Ferri       pom
8    Tom     harry
9    Ron     harry


In [22]:
import pandas as pd

df = pd.DataFrame()
df['treatment'] = [0,1,0,1,0,1]
df['name'] = ['a','b','c','a','b','c']
df['strata'] = [1,2,1,2,1,2]
df['weight'] = df['treatment'].apply(lambda x: 1 if x == 1 else 0)

df['mts'] = df['strata'].apply(lambda x: df[df.strata == x])

print(df)

   treatment name  strata  val
0          0    a       1    0
1          1    b       2    1
2          0    c       1    0
3          1    a       2    1
4          0    b       1    0
5          1    c       2    1


In [32]:
# Create a DataFrame
df = pd.DataFrame({"student": ["Alice", "Bob", "Chris", "Alice", "Ferri", "Bob2", "Dave2", "Ferri"], "age": [5, 10, 15, 5, 10, 15, 15, 10]})

# Extract the row index where age == 10
row_index = df[df["age"] == 10].index

# Replace all the values in the age column where the row index is in row_index with 1
df.loc[row_index, "age"] = 1

# Print the DataFrame
print(df)

  student  age
0   Alice    5
1     Bob    1
2   Chris   15
3   Alice    5
4   Ferri    1
5    Bob2   15
6   Dave2   15
7   Ferri    1


In [15]:
import numpy as np
import pandas as pd


df = pd.DataFrame()
df['strata'] = [1,1,1,1,1,2,2,3,3,3,4,5]
df['treatment'] = ['0','1','1','0','0','1','1','1','1','0','0','0']

# Create a new column named "count" that counts the number of rows having the strata for that row and treatment = 1
#df["count"] = df.groupby("strata")["treatment"].transform("sum")

df["count"] = df.groupby("strata")["treatment"].transform(lambda x: len(x[x == '1']))

df["count_0"] = df.groupby("strata")["treatment"].transform(lambda x: len(x[x == '0']))

df = df.loc[(df["count"] != 0) & (df["count_0"] != 0)]

strata_count_dict = dict(zip(df['strata'], df['count_0']))

# Print the DataFrame
print(df)

print(strata_count_dict)

   strata treatment  count  count_0
0       1         0      2        3
1       1         1      2        3
2       1         1      2        3
3       1         0      2        3
4       1         0      2        3
7       3         1      2        1
8       3         1      2        1
9       3         0      2        1
{1: 3, 3: 1}
