# **Tree of Life Plan Team B**

## Load Data

# Dataset 2 Cleaning

In [None]:
import pandas as pd
import numpy as np
import scipy.stats as stat

# Location of the raw dataset-2 CSV, served from raw.githubusercontent.com.
url = "https://raw.githubusercontent.com/Rose-Petals/TOL-1B/main/TOLCC%20Break%20Through%20Tech%20Dataset_2.csv"

# Read the CSV into a DataFrame; this needs network access when the cell runs.
df2 = pd.read_csv(url)

# Print the first five rows as plain text to inspect the raw column headers.
print(df2.head())

  INTAKE METHOD        Unnamed: 1  \
0          Call  Bariatric Doctor   
1          Call     Family/Friend   
2     Boom Form     Family/Friend   
3     Boom Form     Family/Friend   
4     Boom Form     Family/Friend   

  Please be specific on who sent you our way, we'd like to thank them.   \
0                                   Bariatric Doctor                      
1                                                Mom                      
2                                             Friend                      
3                                                NaN                      
4                                                NaN                      

     AGE       TOWN  INSURANCE CARRIER APPOINTMENT LOCATION APPOINTMENT TYPE  \
0  25-30  Elizabeth               BCBS              Virtual     Talk Therapy   
1  10-15     Summit  United Healthcare              Virtual     Talk Therapy   
2  20-25     Monroe               BCBS             Freehold     Talk Therapy   
3  25-

### Reducing number of categories

##### Group some of the categories for columns like age and town that have a large number of unique categories.

In [None]:
# Normalise headers: trim whitespace, lower-case, and swap spaces for underscores.
df2.columns = df2.columns.str.strip().str.lower().str.replace(" ", "_")

# Give the two awkward headers short names, then discard the free-text
# referral column and the two unnamed columns (10 and 11).
df2 = (
    df2
    .rename(columns={
        'if_talk_therapy,_specifically_what_type?': 'talk_therapy_type',
        'unnamed:_1': 'referer',
    })
    .drop(columns=[
        "please_be_specific_on_who_sent_you_our_way,_we'd_like_to_thank_them.",
        "unnamed:_10",
        "unnamed:_11",
    ])
)

In [None]:
# Frequency of each talk-therapy sub-type; value_counts() leaves out missing
# values by default, so only labelled rows are counted here.
df2['talk_therapy_type'].value_counts()

Unnamed: 0_level_0,count
talk_therapy_type,Unnamed: 1_level_1
Individual (adult),593
Individual (minor),248
Couples,57
​,18
Family,8
Bariatric Evaluation,6


In [None]:
# One pass over the column: a lone zero-width space (U+200B) is treated as
# missing, and the adult/minor variants are merged into a single 'Individual'.
df2['talk_therapy_type'] = df2['talk_therapy_type'].replace({
    '\u200b': np.nan,
    'Individual (minor)': 'Individual',
    'Individual (adult)': 'Individual',
})
df2['talk_therapy_type'].value_counts()

Unnamed: 0_level_0,count
talk_therapy_type,Unnamed: 1_level_1
Individual,841
Couples,57
Family,8
Bariatric Evaluation,6


In [None]:
def average_age(cell):
    """Return the mean of the numbers found in an age cell such as '25-30'.

    The cell is converted to text, hyphens are treated like commas, and every
    comma-separated piece that parses as a float contributes to the mean.
    Pieces that do not parse are skipped.

    Returns np.nan when the cell is missing or contains no parseable number.
    """
    if pd.isna(cell):
        return np.nan

    pieces = str(cell).replace('-', ',').split(',')
    parsed = []
    for piece in pieces:
        try:
            number = float(piece.strip())
        except ValueError:
            # Non-numeric fragment (e.g. empty text or a word): ignore it.
            continue
        parsed.append(number)

    return sum(parsed) / len(parsed) if parsed else np.nan
# Replace each age-range string with its numeric midpoint (NaN when nothing parses).
df2['age'] = df2['age'].map(average_age)
print(df2['age'])

0       27.5
1       12.5
2       22.5
3       27.5
4       32.5
        ... 
1030     7.5
1031    12.5
1032    17.5
1033    42.5
1034    22.5
Name: age, Length: 1035, dtype: float64


In [None]:
# Preview the first five rows after the cleaning steps applied so far.
df2.head()

Unnamed: 0,intake_method,referer,age,town,insurance_carrier,appointment_location,appointment_type,talk_therapy_type,appointment_time
0,Call,Bariatric Doctor,27.5,Elizabeth,BCBS,Virtual,Talk Therapy,Bariatric Evaluation,Evening (4-8)
1,Call,Family/Friend,12.5,Summit,United Healthcare,Virtual,Talk Therapy,Individual,Afternoon (12-4)
2,Boom Form,Family/Friend,22.5,Monroe,BCBS,Freehold,Talk Therapy,Individual,Afternoon (12-4)
3,Boom Form,Family/Friend,27.5,Ocean,BCBS,Freehold,Talk Therapy,Individual,Afternoon (12-4)
4,Boom Form,Family/Friend,32.5,Jackson,Aetna,Freehold,Talk Therapy,Individual,Afternoon (12-4)


In [None]:
# Strip the hour ranges so the labels are just Morning / Afternoon / Evening.
# A single mapping replaces the three separate replace() calls.
df2['appointment_time'] = df2['appointment_time'].replace({
    'Evening (4-8)': 'Evening',
    'Afternoon (12-4)': 'Afternoon',
    'Morning (9-12)': 'Morning',
})
df2['appointment_time'].value_counts()


Unnamed: 0_level_0,count
appointment_time,Unnamed: 1_level_1
Evening,397
Afternoon,397
Morning,240


In [None]:
# Distinct insurance carriers present in the data (NaN is listed too if any value is missing).
df2['insurance_carrier'].unique()


array(['BCBS', 'United Healthcare', 'Aetna', 'Cigna', 'Meritain',
       'Highmark', 'MagnaCare', 'Other', 'UMR', 'Medicare', 'Quest',
       'Amerihealth', 'Optum', 'Oxford', nan, 'Emblem Health',
       'Surest United Healthcare'], dtype=object)

In [None]:
# Number of rows per insurance carrier, most frequent first.
df2['insurance_carrier'].value_counts()


Unnamed: 0_level_0,count
insurance_carrier,Unnamed: 1_level_1
BCBS,474
Aetna,182
United Healthcare,138
Cigna,117
Other,32
Medicare,25
Meritain,19
Optum,11
UMR,9
MagnaCare,7


In [None]:
# Count the rows with no insurance carrier recorded.
df2['insurance_carrier'].isnull().sum()

2

In [None]:
# Missing-value count for every column.
df2.isnull().sum()

Unnamed: 0,0
intake_method,0
referer,2
age,6
town,4
insurance_carrier,2
appointment_location,1
appointment_type,1
talk_therapy_type,123
appointment_time,1


In [None]:
# appointment_time is the variable under study, so rows without it are removed.
df2 = df2.dropna(subset=['appointment_time'])

In [None]:
# Re-check missing values after dropping rows without an appointment time.
df2.isnull().sum()

Unnamed: 0,0
intake_method,0
referer,2
age,6
town,4
insurance_carrier,2
appointment_location,0
appointment_type,0
talk_therapy_type,122
appointment_time,0


In [None]:
# Impute missing ages with the column mean and label missing referer/town
# values as 'Unknown'. Columns not listed here are left as they are.
df2 = df2.fillna({
    'age': df2['age'].mean(),
    'referer': 'Unknown',
    'town': 'Unknown',
})


In [None]:
# Re-check missing values after filling age, referer and town.
df2.isnull().sum()

Unnamed: 0,0
intake_method,0
referer,0
age,0
town,0
insurance_carrier,2
appointment_location,0
appointment_type,0
talk_therapy_type,122
appointment_time,0


In [None]:
# Distinct talk-therapy types left after cleaning (NaN is listed too if any value is missing).
df2['talk_therapy_type'].unique()

array(['Bariatric Evaluation', 'Individual', nan, 'Couples', 'Family'],
      dtype=object)

In [None]:
# Indicator columns expected downstream, in display order.
therapy_columns = ['therapy_Individual', 'therapy_Family', 'therapy_Couples', 'therapy_Bariatric Evaluation']

# One 0/1 column per talk-therapy type. get_dummies() emits no column for
# missing values, so rows with no therapy type are 0 in every indicator.
# reindex(columns=...) fixes the column set and order, and supplies an
# all-zero column if a category happens to be absent from the data instead
# of raising a KeyError.
therapy_dummies = (
    pd.get_dummies(df2['talk_therapy_type'], prefix='therapy')
    .reindex(columns=therapy_columns, fill_value=0)
    .astype(int)
)

# 1 when the appointment is a medication-management visit, else 0.
df2['Medication Management'] = (df2['appointment_type'] == 'Medication Management').astype(int)

# Attach the indicators and drop the two source columns they replace.
df2 = pd.concat([df2, therapy_dummies], axis=1)
df2 = df2.drop(columns=['appointment_type', 'talk_therapy_type'])

In [None]:
# Preview the frame with the new 0/1 indicator columns.
df2.head()

Unnamed: 0,intake_method,referer,age,town,insurance_carrier,appointment_location,appointment_time,Medication Management,therapy_Individual,therapy_Family,therapy_Couples,therapy_Bariatric Evaluation
0,Call,Bariatric Doctor,27.5,Elizabeth,BCBS,Virtual,Evening,0,0,0,0,1
1,Call,Family/Friend,12.5,Summit,United Healthcare,Virtual,Afternoon,0,1,0,0,0
2,Boom Form,Family/Friend,22.5,Monroe,BCBS,Freehold,Afternoon,0,1,0,0,0
3,Boom Form,Family/Friend,27.5,Ocean,BCBS,Freehold,Afternoon,0,1,0,0,0
4,Boom Form,Family/Friend,32.5,Jackson,Aetna,Freehold,Afternoon,0,1,0,0,0


In [None]:
# Same preview as the previous cell.
df2.head()

Unnamed: 0,intake_method,referer,age,town,insurance_carrier,appointment_location,appointment_time,Medication Management,therapy_Individual,therapy_Family,therapy_Couples,therapy_Bariatric Evaluation
0,Call,Bariatric Doctor,27.5,Elizabeth,BCBS,Virtual,Evening,0,0,0,0,1
1,Call,Family/Friend,12.5,Summit,United Healthcare,Virtual,Afternoon,0,1,0,0,0
2,Boom Form,Family/Friend,22.5,Monroe,BCBS,Freehold,Afternoon,0,1,0,0,0
3,Boom Form,Family/Friend,27.5,Ocean,BCBS,Freehold,Afternoon,0,1,0,0,0
4,Boom Form,Family/Friend,32.5,Jackson,Aetna,Freehold,Afternoon,0,1,0,0,0


## Statistical Analysis

In [None]:
from scipy.stats import chi2_contingency

### Chi-Squared Test on all categorical columns and the appointment time

In [None]:
# Columns tested for association with the three-level appointment_time.
categorical_cols = [
    'intake_method', 'referer', 'town', 'insurance_carrier',
    'appointment_location', 'Medication Management',
    'therapy_Individual', 'therapy_Family', 'therapy_Couples',
    'therapy_Bariatric Evaluation',
]

# Filled with every column whose test gives p <= 0.05.
sig_columns = []

for col in categorical_cols:
    # Chi-squared test of independence on the column-by-appointment_time table.
    contingency_all = pd.crosstab(df2[col], df2['appointment_time'])
    chi2, p, dof, expected = chi2_contingency(contingency_all)

    print(col)
    print("Chi-square Statistic:", chi2)
    print("p value", p)

    # p <= 0.05: unlikely to happen by chance, so the null hypothesis is
    # rejected and the column is recorded as associated with appointment time.
    # Otherwise there is no evidence that the column depends on appointment time.
    if p <= 0.05:
        sig_columns.append(col)
    print(
        'There is a significant association between both groups'
        if p <= 0.05
        else 'There is no significance association between groups'
    )

    print('\n')

intake_method
Chi-square Statistic: 6.94120358034161
p value 0.32631893018453484
There is no significance association between groups


referer
Chi-square Statistic: 29.323627350360788
p value 0.20818861633267147
There is no significance association between groups


town
Chi-square Statistic: 425.1888663123823
p value 0.05454280709567423
There is no significance association between groups


insurance_carrier
Chi-square Statistic: 41.053706773168415
p value 0.08601582380676545
There is no significance association between groups


appointment_location
Chi-square Statistic: 14.801728437687359
p value 0.005130615558483234
There is a significant association between both groups


Medication Management
Chi-square Statistic: 41.220728563642524
p value 1.119521716348284e-09
There is a significant association between both groups


therapy_Individual
Chi-square Statistic: 26.114981417898083
p value 2.1340463000557865e-06
There is a significant association between both groups


therapy_Family
Chi-s

In [None]:
# Columns whose chi-squared p-value against appointment_time was at most 0.05.
sig_columns

The columns above show a statistically significant association with appointment time, indicating that they may influence when clients book their appointments.

## Chi-squared test of the significant categorical columns against a binary appointment-time column (Morning = 1, Not morning = 0)

In [None]:
# Binary version of appointment time: 1 for 'Morning', 0 for any other slot.
df2['binary_time'] = (df2['appointment_time'] == 'Morning').astype(int)

In [None]:
for col in sig_columns:
    # Chi-squared test of independence between the column and the
    # morning / not-morning flag.
    contingency_sig = pd.crosstab(df2[col], df2['binary_time'])
    chi2, p, dof, expected = chi2_contingency(contingency_sig)

    print(col)
    print("Chi-square Statistic:", chi2)
    print("p value", p)

    # p <= 0.05: unlikely to happen by chance, so the null hypothesis is
    # rejected in favour of an association with morning time. Otherwise there
    # is no evidence that the column depends on appointment time.
    print(
        'There is a significant association with Morning time'
        if p <= 0.05
        else 'There is no significance association with Morning time'
    )

    print('\n')

appointment_location
Chi-square Statistic: 9.56429326380132
p value 0.008377995178936868
There is a significant association with Morning time


Medication Management
Chi-square Statistic: 23.8940292324009
p value 1.0178696986230436e-06
There is a significant association with Morning time


therapy_Individual
Chi-square Statistic: 16.835262844137993
p value 4.076867683488183e-05
There is a significant association with Morning time




All three columns found significant above also appear to be associated with whether the client books a morning appointment or not.

In [None]:
from scipy.stats import f_oneway

# One-way ANOVA comparing age between the two binary_time groups.
# groupby() yields the groups in sorted key order, so groups[0] holds the ages
# for binary_time == 0 (not morning) and groups[1] those for binary_time == 1.
groups = [group['age'] for _, group in df2.groupby('binary_time')]

f, p_val = f_oneway(groups[0], groups[1])
print("F statistic: ", f)
print("p value", p_val)

# Decide on this test's own p-value (p_val), not the `p` variable that is
# still set from the chi-squared loop in an earlier cell.
if p_val <= 0.05:
    print('There is a significant difference with Morning time')
else:
    print('There is no significance difference Morning time')
    # No evidence that age differs between morning and non-morning clients.


F statistic:  42.26203943434918
p value 1.2408167855511682e-10
There is a significant difference with Morning time


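Age also appears to be associated with whether a client books a morning appointment: mean age differs significantly between the morning and non-morning groups.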

## Graphically see the relationships of the variables with Morning Time (To be Continued..)

In [None]:
import matplotlib.pyplot as plt

# Time-slot labels, in the order used for every per-location count below.
times = ['Afternoon', 'Evening', 'Morning']
print(times)

# One sub-frame per appointment location.
virtual = df2[df2['appointment_location'] == 'Virtual']

freehold = df2[df2['appointment_location'] == 'Freehold']
# This filter used to repeat 'Freehold', which made the Princeton subset a
# copy of the Freehold one.
# NOTE(review): assumes the location label is exactly 'Princeton' - confirm
# against df2['appointment_location'].unique().
princeton = df2[df2['appointment_location'] == 'Princeton']

# Appointment counts per time slot, aligned to `times`; a slot with no
# appointments at a location is reported as 0 rather than being left out.
virtual_segment = virtual['appointment_time'].value_counts().reindex(times, fill_value=0)
freehold_segment = freehold['appointment_time'].value_counts().reindex(times, fill_value=0)
princeton_segment = princeton['appointment_time'].value_counts().reindex(times, fill_value=0)
