# **Tree of Life Plan Team B**

## Load Data

# **DATASET 2 CLEANING**

In [148]:
import pandas as pd
import numpy as np

# Raw CSV export of dataset 2, fetched from the Rose-Petals/TOL-1B GitHub repository.
url = "https://raw.githubusercontent.com/Rose-Petals/TOL-1B/main/TOLCC%20Break%20Through%20Tech%20Dataset_2.csv"

# Read the remote file straight into a DataFrame.
df2 = pd.read_csv(filepath_or_buffer=url)

# Plain-text preview of the first five rows.
print(df2.head(n=5))

  INTAKE METHOD        Unnamed: 1  \
0          Call  Bariatric Doctor   
1          Call     Family/Friend   
2     Boom Form     Family/Friend   
3     Boom Form     Family/Friend   
4     Boom Form     Family/Friend   

  Please be specific on who sent you our way, we'd like to thank them.   \
0                                   Bariatric Doctor                      
1                                                Mom                      
2                                             Friend                      
3                                                NaN                      
4                                                NaN                      

     AGE       TOWN  INSURANCE CARRIER APPOINTMENT LOCATION APPOINTMENT TYPE  \
0  25-30  Elizabeth               BCBS              Virtual     Talk Therapy   
1  10-15     Summit  United Healthcare              Virtual     Talk Therapy   
2  20-25     Monroe               BCBS             Freehold     Talk Therapy   
3  25-

### Reducing number of categories

##### Group some of the categories for columns like age and town that have a large number of unique categories.

In [149]:
# Normalise every header to snake_case: trim surrounding whitespace, lower-case,
# and turn spaces into underscores.
df2.columns = (
    df2.columns
    .str.strip()
    .str.lower()
    .str.replace(" ", "_")
)

# Give the two awkwardly named columns short, descriptive names.
df2.rename(
    columns={
        'unnamed:_1': 'referer',
        'if_talk_therapy,_specifically_what_type?': 'talk_therapy_type',
    },
    inplace=True,
)

# Discard the free-text "who sent you" column and the two unnamed trailing columns.
df2.drop(
    columns=[
        "please_be_specific_on_who_sent_you_our_way,_we'd_like_to_thank_them.",
        "unnamed:_10",
        "unnamed:_11",
    ],
    inplace=True,
)

In [150]:
# Frequency of each talk-therapy sub-type before cleaning.
# value_counts() leaves out missing values by default, so NaN rows are not listed.
df2['talk_therapy_type'].value_counts()

talk_therapy_type
Individual (adult)      593
Individual (minor)      248
Couples                  57
​                        18
Family                    8
Bariatric Evaluation      6
Name: count, dtype: int64

In [151]:
# Cells that hold only a zero-width space (U+200B) are treated as missing, and the
# adult/minor variants are collapsed into a single 'Individual' category.
df2['talk_therapy_type'] = (
    df2['talk_therapy_type']
    .replace('\u200b', np.nan)
    .replace(['Individual (minor)', 'Individual (adult)'], 'Individual')
)

# Show the category frequencies after the clean-up.
df2['talk_therapy_type'].value_counts()

talk_therapy_type
Individual              841
Couples                  57
Family                    8
Bariatric Evaluation      6
Name: count, dtype: int64

In [152]:
def average_age(cell):
    """Convert an age-bracket cell into a single number.

    The cell is converted to a string, hyphens are treated like commas, and the
    comma-separated pieces that parse as floats are averaged, so "25-30"
    becomes 27.5. Pieces that do not parse as a float are skipped.

    Returns NaN when the cell is missing or contains no numeric piece.
    """
    if pd.isna(cell):
        return np.nan

    parsed = []
    for token in str(cell).replace('-', ',').split(','):
        try:
            parsed.append(float(token.strip()))
        except ValueError:
            # Non-numeric fragment: leave it out of the average.
            pass

    return sum(parsed) / len(parsed) if parsed else np.nan
# Replace each age bracket with the average of its bounds, then print the column.
df2['age'] = df2['age'].map(average_age)
print(df2['age'])

0       27.5
1       12.5
2       22.5
3       27.5
4       32.5
        ... 
1030     7.5
1031    12.5
1032    17.5
1033    42.5
1034    22.5
Name: age, Length: 1035, dtype: float64


In [153]:
# Preview the first five rows now that age is numeric.
df2.head(n=5)

Unnamed: 0,intake_method,referer,age,town,insurance_carrier,appointment_location,appointment_type,talk_therapy_type,appointment_time
0,Call,Bariatric Doctor,27.5,Elizabeth,BCBS,Virtual,Talk Therapy,Bariatric Evaluation,Evening (4-8)
1,Call,Family/Friend,12.5,Summit,United Healthcare,Virtual,Talk Therapy,Individual,Afternoon (12-4)
2,Boom Form,Family/Friend,22.5,Monroe,BCBS,Freehold,Talk Therapy,Individual,Afternoon (12-4)
3,Boom Form,Family/Friend,27.5,Ocean,BCBS,Freehold,Talk Therapy,Individual,Afternoon (12-4)
4,Boom Form,Family/Friend,32.5,Jackson,Aetna,Freehold,Talk Therapy,Individual,Afternoon (12-4)


In [154]:
# Strip the hour ranges from the appointment-time labels so only the part of day remains.
df2['appointment_time'] = df2['appointment_time'].replace({
    'Evening (4-8)': 'Evening',
    'Afternoon (12-4)': 'Afternoon',
    'Morning (9-12)': 'Morning',
})

# Category frequencies after relabelling.
df2['appointment_time'].value_counts()


appointment_time
Evening      397
Afternoon    397
Morning      240
Name: count, dtype: int64

In [155]:
# List the distinct insurance carriers (NaN included) in order of first appearance.
pd.unique(df2['insurance_carrier'])


array(['BCBS', 'United Healthcare', 'Aetna', 'Cigna', 'Meritain',
       'Highmark', 'MagnaCare', 'Other', 'UMR', 'Medicare', 'Quest',
       'Amerihealth', 'Optum', 'Oxford', nan, 'Emblem Health',
       'Surest United Healthcare'], dtype=object)

In [156]:
# Number of rows per insurance carrier, most common first; missing values are excluded.
df2['insurance_carrier'].value_counts(dropna=True)


insurance_carrier
BCBS                        474
Aetna                       182
United Healthcare           138
Cigna                       117
Other                        32
Medicare                     25
Meritain                     19
Optum                        11
UMR                           9
MagnaCare                     7
Oxford                        7
Amerihealth                   7
Quest                         2
Highmark                      1
Emblem Health                 1
Surest United Healthcare      1
Name: count, dtype: int64

In [157]:
# Count rows with no insurance carrier. The vectorised Series sum avoids iterating
# the column in Python; int() keeps the displayed result a plain Python integer.
int(df2['insurance_carrier'].isna().sum())

2

In [158]:
# Missing-value count for every column.
df2.isna().sum()

intake_method             0
referer                   2
age                       6
town                      4
insurance_carrier         2
appointment_location      1
appointment_type          1
talk_therapy_type       123
appointment_time          1
dtype: int64

In [159]:
# Remove rows that have no appointment_time; the row index is not reset afterwards.
df2.dropna(axis='index', subset=['appointment_time'], inplace=True)

In [160]:
# Re-check the per-column missing-value counts after dropping rows.
df2.isna().sum()

intake_method             0
referer                   2
age                       6
town                      4
insurance_carrier         2
appointment_location      0
appointment_type          0
talk_therapy_type       122
appointment_time          0
dtype: int64

In [161]:
# Impute missing ages with the column mean and label missing referer/town values
# as 'Unknown'. Columns not listed here keep their NaN values.
df2.fillna(
    value={
        'age': df2['age'].mean(),
        'referer': 'Unknown',
        'town': 'Unknown',
    },
    inplace=True,
)


In [162]:
# Confirm which columns still contain missing values after imputation.
df2.isna().sum()

intake_method             0
referer                   0
age                       0
town                      0
insurance_carrier         2
appointment_location      0
appointment_type          0
talk_therapy_type       122
appointment_time          0
dtype: int64

In [163]:
# Distinct talk-therapy sub-types still present (NaN included) before encoding.
pd.unique(df2['talk_therapy_type'])

array(['Bariatric Evaluation', 'Individual', nan, 'Couples', 'Family'],
      dtype=object)

In [164]:
# Indicator columns this cell adds for the talk-therapy sub-types.
therapy_columns = ['therapy_Individual', 'therapy_Family', 'therapy_Couples', 'therapy_Bariatric Evaluation']

# One-hot encode the sub-type. get_dummies skips NaN by default, so rows without a
# sub-type get 0 in every indicator and no placeholder category is needed.
# reindex(columns=...) fixes the column set and order, and supplies an all-zero
# column when a category is absent from the data instead of raising KeyError.
therapy_dummies = (
    pd.get_dummies(df2['talk_therapy_type'], prefix='therapy')
    .reindex(columns=therapy_columns, fill_value=0)
)

# 1 when the appointment type is 'Medication Management', otherwise 0.
df2['Medication Management'] = (df2['appointment_type'] == 'Medication Management').astype(int)

# Attach the indicators (aligned on the row index) and store them as integers.
df2 = pd.concat([df2, therapy_dummies], axis=1)
df2[therapy_columns] = df2[therapy_columns].astype(int)

# Drop the source columns now that the indicator columns exist.
df2.drop(['appointment_type', 'talk_therapy_type'], axis=1, inplace=True)

In [165]:
# Preview the first five rows with the new indicator columns.
df2.head(n=5)

Unnamed: 0,intake_method,referer,age,town,insurance_carrier,appointment_location,appointment_time,Medication Management,therapy_Individual,therapy_Family,therapy_Couples,therapy_Bariatric Evaluation
0,Call,Bariatric Doctor,27.5,Elizabeth,BCBS,Virtual,Evening,0,0,0,0,1
1,Call,Family/Friend,12.5,Summit,United Healthcare,Virtual,Afternoon,0,1,0,0,0
2,Boom Form,Family/Friend,22.5,Monroe,BCBS,Freehold,Afternoon,0,1,0,0,0
3,Boom Form,Family/Friend,27.5,Ocean,BCBS,Freehold,Afternoon,0,1,0,0,0
4,Boom Form,Family/Friend,32.5,Jackson,Aetna,Freehold,Afternoon,0,1,0,0,0


In [166]:
# Same five-row preview as the preceding cell; nothing is transformed in between.
df2.head(n=5)

Unnamed: 0,intake_method,referer,age,town,insurance_carrier,appointment_location,appointment_time,Medication Management,therapy_Individual,therapy_Family,therapy_Couples,therapy_Bariatric Evaluation
0,Call,Bariatric Doctor,27.5,Elizabeth,BCBS,Virtual,Evening,0,0,0,0,1
1,Call,Family/Friend,12.5,Summit,United Healthcare,Virtual,Afternoon,0,1,0,0,0
2,Boom Form,Family/Friend,22.5,Monroe,BCBS,Freehold,Afternoon,0,1,0,0,0
3,Boom Form,Family/Friend,27.5,Ocean,BCBS,Freehold,Afternoon,0,1,0,0,0
4,Boom Form,Family/Friend,32.5,Jackson,Aetna,Freehold,Afternoon,0,1,0,0,0
