In [1]:
import pandas as pd
import dtale
import plotly.express as px 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import re
import time
from datetime import datetime
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
from urllib.request import urlopen
from bs4 import BeautifulSoup
import requests

# Imported Clean CSV

In [47]:
df = pd.read_csv("../data/outcomes.csv")

unique_ids = df['CallerNum'].nunique()
df.head()

Unnamed: 0.1,Unnamed: 0,CallerNum,Treatment Plan - Is the treatment plan completed?,"Treatment Plan - Was the enrollee able to be restored to function (can chew) and ""social six"" esthetics (top front six teeth are present and disease free)",Birth_Year,Presumptive Eligibility - Do they pre-qualify for SMILE ON 60+?,Base-line Oral and Conditions Questions - Are you limited in what you can eat?,Treatment Plan - Was a treatment plan developed?
0,0,3638169,,,1960,Yes,Yes,
1,1,4431194,,,1959,Yes,Yes,
2,2,2941341,No,No,1942,Yes,No,Yes
3,3,3823933,No,No,1960,Yes,Yes,Yes
4,4,3348256,No,No,1955,Yes,Yes,Yes


# Value Counts for two main metrics: Completed Plan and Restorated Function (with duplicate patient ids)

In [3]:
print(df['Treatment Plan - Is the treatment plan completed?'].value_counts())
print(df['Treatment Plan - Was the enrollee able to be restored to function (can chew) and "social six" esthetics (top front six teeth are present and disease free)'].value_counts())

No     19382
Yes     7823
Name: Treatment Plan - Is the treatment plan completed?, dtype: int64
No     12260
Yes     9794
Name: Treatment Plan - Was the enrollee able to be restored to function (can chew) and "social six" esthetics (top front six teeth are present and disease free), dtype: int64


### Drop columns that are not used

In [4]:
df.drop(columns=['Presumptive Eligibility - Do they pre-qualify for SMILE ON 60+?','Birth_Year'], inplace=True)
del df['Unnamed: 0']

# Rename Columns for counting purposes

In [5]:
new_names = {'CallerNum':'Patient_id',
             'Treatment Plan - Is the treatment plan completed?':'Plan_Completed',
             'Treatment Plan - Was the enrollee able to be restored to function (can chew) and "social six" esthetics (top front six teeth are present and disease free)':'Restored_Function',
             'Treatment Plan - Was a treatment plan developed?':'Plan_developed',
             'Base-line Oral and Conditions Questions - Are you limited in what you can eat?':'Limited'}

In [6]:
df.rename(columns=new_names, inplace=True)
df.head()

Unnamed: 0,Patient_id,Plan_Completed,Restored_Function,Limited,Plan_developed
0,3638169,,,Yes,
1,4431194,,,Yes,
2,2941341,No,No,No,Yes
3,3823933,No,No,Yes,Yes
4,3348256,No,No,Yes,Yes


# Sorted and dropped duplicates of Patient ids

In [7]:
sorted = df.sort_values(['Patient_id',
                         'Plan_Completed',
                         'Restored_Function',
                         'Plan_developed',
                         'Limited']).drop_duplicates(['Patient_id'], keep='first')
sorted.head()

Unnamed: 0,Patient_id,Plan_Completed,Restored_Function,Limited,Plan_developed
47543,-2,,,No,Yes
609,-1,No,No,,Yes
34169,2384106,No,No,Yes,Yes
3701,2384744,No,No,No,Yes
54215,2384833,Yes,Yes,No,Yes


In [8]:
print(sorted['Plan_Completed'].value_counts())
print(sorted['Restored_Function'].value_counts())
print(sorted['Plan_developed'].value_counts())
print(sorted['Limited'].value_counts())

No     5522
Yes     883
Name: Plan_Completed, dtype: int64
No     4179
Yes    1411
Name: Restored_Function, dtype: int64
Yes    5745
No      698
Name: Plan_developed, dtype: int64
Yes    5089
No     3946
Name: Limited, dtype: int64


In [9]:
l_no_na = sorted[sorted['Limited'].notna()]
pd_no_na = sorted[sorted['Plan_developed'].notna()]
pc_no_na = sorted[sorted['Plan_Completed'].notna()]
rf_no_na = sorted[sorted['Restored_Function'].notna()]


# Sorted by Submetric: Limited(chew capability), Plan Developed, Plan Completed, and Restored Function.

## Level 1
#### Documented that all submetrics were answered as 'Yes'

In [10]:
All = sorted[(sorted['Limited'] == 'Yes') & 
                (sorted['Plan_developed'] == 'Yes') & 
                (sorted['Plan_Completed'] == 'Yes') & 
                (sorted['Restored_Function'] == 'Yes') ]
All.head(5)

Unnamed: 0,Patient_id,Plan_Completed,Restored_Function,Limited,Plan_developed
52758,2386054,Yes,Yes,Yes,Yes
48392,2386075,Yes,Yes,Yes,Yes
54439,2387558,Yes,Yes,Yes,Yes
42557,2391110,Yes,Yes,Yes,Yes
53674,2392454,Yes,Yes,Yes,Yes


### Percentages  
##### .mul(100).round(1).astype(str) + '%'

In [11]:
level_1= (All['Restored_Function'].value_counts()/unique_ids).mul(100).round(1).astype(str) + '%'
level_1

Yes    2.2%
Name: Restored_Function, dtype: object

## Level 2
#### Documented that Plan was developed, Plan was completed and Returned to Function were answered as 'Yes'

In [12]:
completed_2 = sorted[ (sorted['Plan_developed'] == 'Yes') & 
                      (sorted['Plan_Completed'] == 'Yes') & 
                      (sorted['Restored_Function'] == 'Yes') ]
completed_2.head(5)

Unnamed: 0,Patient_id,Plan_Completed,Restored_Function,Limited,Plan_developed
54215,2384833,Yes,Yes,No,Yes
27851,2384932,Yes,Yes,,Yes
53228,2384971,Yes,Yes,No,Yes
56333,2385168,Yes,Yes,No,Yes
48129,2385321,Yes,Yes,,Yes


In [25]:
level_2= (completed_2['Restored_Function'].value_counts()/unique_ids).mul(100).round(1).astype(str) + '%'
level_2

Yes    6.2%
Name: Restored_Function, dtype: object

## Level 3
#### Documented that a Plan was completed and Returned to Function were answered as 'Yes'

In [15]:
completed_both = sorted[ (sorted['Plan_Completed'] == 'Yes') & 
                      (sorted['Restored_Function'] == 'Yes') ]
completed_both.head(5)

Unnamed: 0,Patient_id,Plan_Completed,Restored_Function,Limited,Plan_developed
54215,2384833,Yes,Yes,No,Yes
27851,2384932,Yes,Yes,,Yes
53228,2384971,Yes,Yes,No,Yes
56333,2385168,Yes,Yes,No,Yes
48129,2385321,Yes,Yes,,Yes


In [26]:
level_3= (completed_both['Restored_Function'].value_counts()/unique_ids).mul(100).round(1).astype(str) + '%'
level_3

Yes    6.9%
Name: Restored_Function, dtype: object

## Level 4
#### Documented that Returned to Function were answered as 'Yes'

In [17]:
completed_rf = sorted[
                      (sorted['Restored_Function'] == 'Yes') ]
completed_rf.head(5)

Unnamed: 0,Patient_id,Plan_Completed,Restored_Function,Limited,Plan_developed
54215,2384833,Yes,Yes,No,Yes
27851,2384932,Yes,Yes,,Yes
55200,2384963,No,Yes,,Yes
53228,2384971,Yes,Yes,No,Yes
53360,2385046,,Yes,No,Yes


In [18]:
level_4= (completed_rf['Restored_Function'].value_counts()/unique_ids).mul(100).round(1).astype(str) + '%'
level_4

Yes    13.2%
Name: Restored_Function, dtype: object

# DISCLAIMER: 'df' refers to the data you passed in when calling 'dtale.show'

duplicates = sorted.duplicated(['Patient_id'], keep='first')
sorted= sorted[~duplicates]
if isinstance(sorted, (pd.DatetimeIndex, pd.MultiIndex)):
    sorted = sorted.to_frame(index=False)

# remove any pre-existing indices for ease of use in the D-Tale code, but this is not required
sorted = sorted.reset_index().drop('index', axis=1, errors='ignore')
sorted.columns = [str(c) for c in df.columns]  # update columns to strings in case they are numbers

sorted.head()

print(sorted['Plan_Completed'].value_counts())
print(sorted['Restored_Function'].value_counts())

In [19]:
print(completed_both['Plan_Completed'].value_counts())

Yes    743
Name: Plan_Completed, dtype: int64


In [20]:
print(completed_both['Patient_id'].value_counts())

2830336    1
2924853    1
2393764    1
2641164    1
2893859    1
          ..
3089764    1
2560356    1
3056404    1
2872678    1
2721448    1
Name: Patient_id, Length: 743, dtype: int64


In [21]:
#dental_df[dental_df['Are you limited to what you eat?'] == 'Yes']['Age'].describe
#dental_df[dental_df['Are you limited to what you eat?'] == 'Yes']['Age'].value_counts(normalize = True)


In [None]:
d = dtale.show(completed_both)

#open it in a new window in browser

d.open_browser()

In [48]:
fig = px.pie(sorted, values='Restored_Function', names='Patient_id')
fig.show()

In [44]:
# Pie chart
fig = px.pie(cont_df, values='gdp', names='continent')
fig.show()
# Bar chart
fig = px.bar(cont_df, color='continent', x='continent', y='gdp')
fig.show()
# Horizontal bar chart - stacked
fig = px.bar(cont_df, color='continent', x='gdp', orientation='h')
fig.show()
# Bubble chart
fig = px.scatter(cont_df.assign(dataType='GDP'), color='continent', x='continent', y='dataType', size='gdp', size_max=50)
fig.show()

ValueError: Value of 'y' is not the name of a column in 'data_frame'. Expected one of ['Patient_id', 'Plan_Completed', 'Restored_Function', 'Limited', 'Plan_developed'] but received: limited