# **Tree of Life Plan Team B**

## Load Data

# *********************DATASET 2 CLEANING*********

In [17]:
import pandas as pd
import numpy as np

url = "https://raw.githubusercontent.com/Rose-Petals/TOL-1B/main/TOLCC%20Break%20Through%20Tech%20Dataset%20(2).csv"

df2 = pd.read_csv(url)


df2.columns

Index(['Intake Method', 'LEAD SOURCE CATEGORY',
       'Please be specific on who sent you our way, we'd like to thank them.',
       'Age', 'Appointment Location', 'APPOINTMENT TYPE', 'Appointment Time'],
      dtype='object')

### Reducing number of categories

##### Group some of the categories for columns like age and town that have a large number of unique categories.

In [18]:
df2.columns = df2.columns.str.strip().str.lower().str.replace(" ", "_")
df2.rename(columns={'lead_source_category': 'referer'}, inplace=True)
df2.drop(columns= {"please_be_specific_on_who_sent_you_our_way,_we'd_like_to_thank_them."}, inplace=True)
print(df2.columns)

Index(['intake_method', 'referer', 'age', 'appointment_location',
       'appointment_type', 'appointment_time'],
      dtype='object')


In [19]:
#print(df2.isnull().sum())
#df2['talk_therapy_type'] = df2['talk_therapy_type'].fillna("not_talk_therapy")
#print(df2.isnull().sum())
df2['appointment_type'].unique()
df2['appointment_type'].value_counts()

appointment_type
Talk Therapy                                 443
Medication Management                        132
Updating Paperwork                            39
Talk Therapy\nMedication Management           22
Talk Therapy & Medication Management          21
SELF-PAY Talk Therapy                         10
Both Talk Therapy & Medication Management      7
Updating Insurance                             3
SELF-PAY Med Management                        1
Name: count, dtype: int64

In [21]:
df2['appointment_type'] = df2['appointment_type'].str.lower()

def clean_appointment_type(x):
    if pd.isnull(x):
        return np.nan
    if 'talk' in x:
        return 'Talk Therapy'
    elif 'med' in x:
        return 'Medication Management'
    else:
        return np.nan

df2['appointment_type'] = df2['appointment_type'].apply(clean_appointment_type)
df2['appointment_type'].value_counts()

appointment_type
Talk Therapy             503
Medication Management    133
Name: count, dtype: int64

In [22]:
sum(df2['appointment_type'].isnull())

60

In [25]:
def average_age(cell):
    if pd.isna(cell):
        return np.nan
    values = str(cell).replace('-', ',').split(',')
    nums = []
    for val in values:
        val = val.strip()
        try:
            nums.append(float(val))
        except ValueError:
            continue
    if not nums:
        return np.nan
    return sum(nums) / len(nums)
df2['age'] = df2['age'].apply(average_age)
df2['age'].fillna(df2['age'].mean(), inplace=True)
df2['age'] = df2['age'].round(1)
print(df2['age'])

0      22.5
1      38.2
2      38.2
3      38.2
4      38.2
       ... 
691    38.2
692    38.2
693    38.2
694    38.2
695    38.2
Name: age, Length: 696, dtype: float64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df2['age'].fillna(df2['age'].mean(), inplace=True)


In [27]:
df2.isnull().sum()

intake_method           194
referer                 110
age                       0
appointment_location    181
appointment_type         60
appointment_time        181
dtype: int64

In [33]:
df2.dropna(subset=['appointment_time'], inplace=True)
df2['intake_method'] = df2['intake_method'].fillna('Unknown')
df2['referer'] = df2['referer'].fillna("Unknown")
df2.isnull().sum()

intake_method            0
referer                  0
age                      0
appointment_location     1
appointment_type        41
appointment_time         0
dtype: int64

In [37]:
df2.dropna(subset=['appointment_type'], inplace=True)
df2['appointment_location'] = df2['appointment_location'].fillna('Unknown')
df2.isnull().sum()

intake_method           0
referer                 0
age                     0
appointment_location    0
appointment_type        0
appointment_time        0
dtype: int64

In [None]:
def time_of_day(t):
    hour = t.hour
    if hour < 12:
        return 'Morning'
    elif 12 <= hour < 17:
        return 'Afternoon'
    else:
        return 'Evening'
df2['appointment_time'] = pd.to_datetime(df2['appointment_time'], format='%I:%M %p')    
df2['appointment_time'] = df2['appointment_time'].apply(time_of_day)




ValueError: time data "Evening" doesn't match format "%I:%M %p", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.

In [43]:
df2['appointment_time'].unique()


array(['Evening', 'Afternoon', 'Morning'], dtype=object)

In [None]:
def 

intake_method           0
referer                 0
age                     0
appointment_location    0
appointment_type        0
appointment_time        0
time_category           0
dtype: int64

In [None]:
df2['Medication_Management'] = (df2['appointment_type'] == 'Medication Management').astype(int)

df2['Talk_Therapy'] = (df2['appointment_type'] != 'Medication Management').astype(int)

In [None]:
df2.drop(columns={'appointment_type'})

Unnamed: 0,intake_method,referer,age,appointment_location,appointment_time,time_category,Medication_Management,Talk_Therapy
0,Waitlist,Other,22.5,Princeton,Evening,Evening,0,1
2,Unknown,Unknown,38.2,Freehold,Evening,Evening,0,1
3,Unknown,Unknown,38.2,Virtual,Afternoon,Afternoon,0,1
4,Unknown,Following Clinician,38.2,Virtual,Evening,Evening,0,1
5,Unknown,Unknown,38.2,Freehold,Afternoon,Afternoon,0,1
...,...,...,...,...,...,...,...,...
689,Other,Other,38.2,Virtual,Afternoon,Afternoon,0,1
690,Boom,Family/Friend,38.2,Princeton,Morning,Morning,0,1
691,Call,Other,38.2,Virtual,Morning,Morning,1,0
692,Boom,Google or another search engine,38.2,Princeton,Afternoon,Afternoon,0,1
