# **Tree of Life Plan Team B**

## Load Data

In [None]:
import pandas as pd

# Raw GitHub URL of the dataset CSV; spaces in the file name are percent-encoded (%20).
url = "https://raw.githubusercontent.com/Rose-Petals/TOL-1B/main/TOLCC%20Break%20Through%20Tech%20Dataset%20(1).csv"

# Read the CSV straight from the URL into a DataFrame (needs network access when the cell runs).
df = pd.read_csv(url)

# Plain-text preview of the first rows to sanity-check the load.
print(df.head())

  INTAKE METHOD        REFERENCES  \
0          Call  Bariatric Doctor   
1          Call     Family/Friend   
2     Boom Form     Family/Friend   
3     Boom Form     Family/Friend   
4     Boom Form     Family/Friend   

  Please be specific on who sent you our way, we'd like to thank them.   \
0                                   Bariatric Doctor                      
1                                                Mom                      
2                                             Friend                      
3                                                NaN                      
4                                                NaN                      

     AGE       TOWN  INSURANCE CARRIER APPOINTMENT LOCATION APPOINTMENT TYPE  \
0  25-30  Elizabeth               BCBS              Virtual     Talk Therapy   
1  10-15     Summit  United Healthcare              Virtual     Talk Therapy   
2  20-25     Monroe               BCBS             Freehold     Talk Therapy   
3  25-

## Data Cleaning

### Removing and Renaming Columns

In [None]:
# Columns whose header contains "Unnamed": pandas assigns names of the form
# "Unnamed: N" to CSV columns that have no header, so these are presumably
# spreadsheet export artefacts rather than real fields.
unnamed_cols = [col for col in df.columns if "Unnamed" in col]

# Free-text referral follow-up question. The raw header carries trailing
# whitespace, so match on the stripped text instead of an exact label. This also
# keeps the cell re-runnable: an exact-label drop raises KeyError once the
# column has already been removed.
REFERRAL_DETAIL_HEADER = "Please be specific on who sent you our way, we'd like to thank them."
referral_detail_cols = [col for col in df.columns if col.strip() == REFERRAL_DETAIL_HEADER]

# Drop everything in one step; an empty list is a no-op on a second run.
df = df.drop(columns=unnamed_cols + referral_detail_cols)
print(df.columns)



Index(['INTAKE METHOD', ' REFERENCES', 'AGE', 'TOWN', 'INSURANCE CARRIER',
       'APPOINTMENT LOCATION', 'APPOINTMENT TYPE',
       'If Talk Therapy, specifically what type?', 'Appointment Time'],
      dtype='object')


In [None]:
# Normalise every header to snake_case, one transformation per line:
df.columns = (
    df.columns
    .str.strip()            # remove leading/trailing whitespace (e.g. ' REFERENCES')
    .str.lower()            # lower-case everything
    .str.replace(" ", "_")  # spaces -> underscores
)

print(df.columns)

Index(['intake_method', 'references', 'age', 'town', 'insurance_carrier',
       'appointment_location', 'appointment_type',
       'if_talk_therapy,_specifically_what_type?', 'appointment_time'],
      dtype='object')


In [None]:
# Replace the unwieldy survey-question header with a short identifier.
df.rename(
    columns={'if_talk_therapy,_specifically_what_type?': 'talk_therapy_type'},
    inplace=True,
)
# Show the resulting headers as the cell output.
df.columns

Index(['intake_method', 'references', 'age', 'town', 'insurance_carrier',
       'appointment_location', 'appointment_type', 'talk_therapy_type',
       'appointment_time'],
      dtype='object')

### Handling Missing Values

In [None]:
# Count missing (NaN) entries per column.
print(df.isna().sum())

intake_method             0
references                2
age                       6
town                      4
insurance_carrier         2
appointment_location      1
appointment_type          1
talk_therapy_type       105
appointment_time          1
dtype: int64


In [None]:
# A missing talk-therapy type is labelled with an explicit "not_talk_therapy" category.
# NOTE(review): this also labels rows whose appointment_type itself is missing,
# not only non-talk-therapy appointments - confirm that is intended.
df['talk_therapy_type'] = df['talk_therapy_type'].fillna(value="not_talk_therapy")

# Re-check the per-column missing counts after the fill.
print(df.isna().sum())

intake_method           0
references              2
age                     6
town                    4
insurance_carrier       2
appointment_location    1
appointment_type        1
talk_therapy_type       0
appointment_time        1
dtype: int64


In [None]:
# Column types: show the dtype pandas holds for each column
# (displayed as the cell's last expression).
df.dtypes

Unnamed: 0,0
intake_method,object
references,object
age,object
town,object
insurance_carrier,object
appointment_location,object
appointment_type,object
talk_therapy_type,object
appointment_time,object


In [None]:
# 'age' holds age-range labels (strings) and still contains NaN at this point.
# nunique() leaves NaN out of its count by default, while unique() lists it,
# so the printed array has one more entry than the reported count.
age_values = df['age']

print("Number of unique categories for 'age' column : ", age_values.nunique())
print()
print(age_values.unique())

Number of unique categories for 'age' column :  44

['25-30' '10-15' '20-25' '30-35' nan '15-20' '60-65' '40-45' '1-5' '5-10'
 '50-55' '55-60' '35-40' '65-70' '35-40; 35-40' '50-55; 45-50'
 '25-30; 30-35' '50-55; 50-55' '30-35; 40-45' '30-35; 30-35'
 '45-50; 40-45' '55--60' '75-80' '70-75' '80-85' '85-90' '35-40; 40-45'
 '65-70; 65-70' '40-45; 50-55' '55-60; 55-60' '25-20' '45-50; 45-50'
 '60-65; 55-60' '55-60; 60-65' '45-50' '30-35; 25-30' '40-45; 35-40'
 '25-30; 25-30' '40-45; 40-45' '55-60; 50-55' '35-40; 30-35'
 '60-65; 60-65' '25-30; 20-25' '30-35; 35-40' '45-50; 50-55']


###### The 'appointment_time' column is our column of interest, so remove the row with a missing value for 'appointment_time'.

In [None]:
# Boolean mask of rows with no appointment_time, computed once and reused below.
missing_time_mask = df['appointment_time'].isna()
to_remove_index = df.loc[missing_time_mask].index

print("Number of rows in df before drop : ", len(df))
print()
# Show the affected row(s) before removing them.
print(df.loc[missing_time_mask])

# Remove the rows by index label, mutating df in place.
df.drop(index=to_remove_index, inplace=True)
print()
print("Number of rows in df after drop: ", len(df))

Number of rows in df before drop :  1035

     intake_method     references    age         town insurance_carrier  \
1034     Boom Form  Family/Friend  20-25  Union Beach             Aetna   

     appointment_location appointment_type talk_therapy_type appointment_time  
1034                  NaN              NaN  not_talk_therapy              NaN  

Number of rows in df after drop:  1034


In [None]:
# Remaining categorical columns: give missing entries an explicit 'Unknown' label
# so that they form their own category instead of staying NaN.
categorical_cols = ['references', 'age', 'town', 'insurance_carrier', 'appointment_location', 'appointment_type']

# Fill all listed columns in a single assignment instead of looping column by column.
df[categorical_cols] = df[categorical_cols].fillna("Unknown")

# Every column should now report zero missing values.
print(df.isna().sum())

intake_method           0
references              0
age                     0
town                    0
insurance_carrier       0
appointment_location    0
appointment_type        0
talk_therapy_type       0
appointment_time        0
dtype: int64


In [None]:
# Number of unique values in each column.
# Filler labels added during cleaning (e.g. "Unknown") count as categories here,
# which is why 'age' now reports one more category than before the fill.
df.nunique()

Unnamed: 0,0
intake_method,4
references,13
age,45
town,191
insurance_carrier,17
appointment_location,3
appointment_type,2
talk_therapy_type,7
appointment_time,3


### Reducing number of categories

##### Group some of the categories for columns like age and town that have a large number of unique categories.