In [None]:
#import dependancies
import pandas as pd
import pyspark
from pyspark.sql import SparkSession, functions as F
import re
import numpy as np
import us
from census import Census
import us
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import joblib
import pyspark
from pyspark.sql import SparkSession, functions as F
import pyspark.pandas as ps
from pyspark.sql.functions import sum as spark_sum, lit, col
from sklearn.preprocessing import StandardScaler
import os


In [None]:
#starting variables
startpath ="_Star_Ratings_and_Display_Measures/" #start of star rating path
fallpath ="_Star_Ratings_Fall_Release/"#path for fall ratings
cpath="_Part_C"#path for part c data
dpath="_Part_D"#path for part d data
finalpath ="_Report_Card_Master_Table.xlsx"#last part of star rating path
firstfive = ["Contract Number","Organization Type","Contract Name","Organization Marketing Name","Parent Organization"]#name of first five columns
lasttwo =["Year","Overall"] #name of what will be final 2 columns
apikey = ""
fields = ['NAME', 'B01001_007E', 'B01001_008E']#fields used for census data


## **Clean Star Ratings**

In [None]:
#function to get measure star information
#takes in path of the file
#returns dataframe of the star information
def get_measure_stars(path):
    dfms = pd.read_excel(path,"Measure_Stars",header=2)#df of the raw information
    dfms = dfms.iloc[1:]#remove first line of data
    #loop through the columns after the first five
    for x in range(5,len(dfms.keys())):
        #remove letter number information from column name
        dfms =dfms.rename(columns= {dfms.keys()[x] : re.split(r'\d+:',dfms.keys()[x])[0]+re.split(r'\d+:',dfms.keys()[x])[1]})
    #loop through the first 5 columns to give them proper names
    for x in range(5):
        #rename the columns
        dfms = dfms.rename(columns={f"Unnamed: {x}":firstfive[x]})
    #return the data frame
    return dfms

In [None]:
#function to get Domain star information
#takes in path of the file
#returns dataframe of the star information
def get_domain_stars(path):
    dfds = pd.read_excel(path,"Domain_Stars",header=1)
    return dfds

In [None]:
#function to get summary star information
#takes in path of the file
#returns dataframe of the star information
def get_summary_rating(path):
    #read in data frame
    dfsr = pd.read_excel(path,"Summary_Rating",header=1)
    #remove the sanction deduction column
    dfsr = dfsr.drop(columns="Sanction Deduction",errors='ignore')
    #retrun dataframe
    return dfsr

In [None]:
#function to combine the 3 dataframes and do basic cleanup on them used for early years
#takes year of the function
#returns the cleaned dataframe
def get_early(y):
    #build path for files
    fullpath = f"./Data/{y}{startpath}{y}{fallpath}{y}{finalpath}"
    #get measure stars info
    dfmsf = get_measure_stars(fullpath)
    #get domain star info
    dfdsf = get_domain_stars(fullpath)
    #get summary star info
    dfsrf = get_summary_rating(fullpath)
    #merge measure and domain stars
    dff = pd.merge(dfmsf,dfdsf,on=firstfive,how='left')
    #merge measure, domain, and summary star dataframes
    dff = pd.merge(dff,dfsrf,on=firstfive,how='left')
    #add year column
    dff["Year"] = f"{y}"
    #rename columns
    dff = dff.rename(columns={f"{y} Part C Summary":"Part C Summary",f"{y} Overall":"Overall",f"{y} Part D Summary":"Part D Summary"})
    #drop unneeded columns
    dff = dff.drop(columns = "2017 Disaster %",errors='ignore')
    #return dataframe
    return dff

In [None]:
#function to combine the 3 dataframes and do basic cleanup on them used for later years
#takes year of the function
#returns the cleaned dataframe
def get_late(y):
    #build path for files
    fullpath = f"./Data/{y}{startpath}{y}{finalpath}"
    #get measure stars info
    dfmsc = get_measure_stars(fullpath)
    #get domain star info
    dfdsc = get_domain_stars(fullpath)
    #get summary star info
    dfsrc = get_summary_rating(fullpath)
    #drop disaster columns
    dfsrc = dfsrc.drop(columns=dfsrc.columns[[6,7]],axis = 1)
    #merge measure and domain stars
    df = pd.merge(dfmsc,dfdsc,on=firstfive,how='left')
    #merge measure, domain, and summary star dataframes
    df = pd.merge(df,dfsrc,on=firstfive,how='left')
    #rename columns
    df = df.rename(columns={f"{y} Part C Summary":"Part C Summary",f"{y} Overall":"Overall",f"{y} Part D Summary":"Part D Summary"})
    #add year column
    df["Year"] = f"{y}"
    #return dataframe
    return    df

In [None]:
#function to combine the dataframse for all of the years
#no input
#returns data frame with data from all years
def standardize_data():
    #set first year
    year = 2014
    #set years of data
    years = [x for x in range(2015,2026)]
    #set path for part C 2014        
    fullpath = f"./Data/{year}{startpath}{year}{fallpath}{year}{cpath}{finalpath}"
    #get measure star dataframe for part C 2014
    dfmsc = get_measure_stars(fullpath)
    #get domain star dataframe for part C 2014
    dfdsc = get_domain_stars(fullpath)
    #get summary star dataframe for part C 2014
    dfsrc = get_summary_rating(fullpath)
    #merge measure and domain star dataframes for part C 2014
    df = pd.merge(dfmsc,dfdsc,on=firstfive,how='left')
    #merge measure, domain, and summary star dataframes for part C 2014
    df = pd.merge(df,dfsrc,on=firstfive,how='left')
    #set path for part D 2014    
    fullpath = f"./Data/{year}{startpath}{year}{fallpath}{year}{dpath}{finalpath}"
    #get measure star dataframe for part D 2014
    dfmsd = get_measure_stars(fullpath)
    #get domain star dataframe for part D 2014
    dfdsd = get_domain_stars(fullpath)
    #get summary star dataframe for part D 2014
    dfsrd = get_summary_rating(fullpath)
    #merge measure and domain star dataframes for part D 2014
    dfd = pd.merge(dfmsd,dfdsd,on=['Contract Number'],how='left')
    #merge measure, domain, and summary star dataframes for part D 2014
    dfd = pd.merge(dfd,dfsrd,on=['Contract Number'],how='left')
    #drop duplicate columns
    dfd = dfd.drop(columns=["Organization Type_x","Contract Name_x","Organization Marketing Name_x","Parent Organization_x","Organization Type_y","Contract Name_y","Organization Marketing Name_y","Parent Organization_y"])
    #merge part C and D data
    df = pd.merge(df,dfd,on=["Contract Number","Organization Type","Contract Name","Organization Marketing Name","Parent Organization","SNP"],how='left')
    #rename columns
    df = df.rename(columns={f"{year} Part C Summary Rating":"Part C Summary",f"{year} Overall Rating":"Overall",f"{year} Part D Summary Rating":"Part D Summary"})
    #add year to  dataframes
    df["Year"] = "2014"
    finaldf = df
    #loop through years
    for year in years:
        #choose which merger to call if early call early otherwise call late
        if (year <2020):
            df = get_early(year)
        else:
            df = get_late(year)
        #concatinate the dataframes into one big data frame
        newdf = pd.concat([finaldf,df],axis=0,join='outer')
        #change the dataframe so it can be reused
        finaldf=newdf
    #reset index of dataframe    
    finaldf = finaldf.reset_index(drop=True)
    #return final dataframe
    return finaldf

In [None]:
#function to clean the dataframe
#takes in a dataframe
#returns cleaned dataframe
def clean_data(combodf1):
    #loop through columns in dataframe
    for x in range(len(combodf1.keys())):
        #change datatypes to string
        combodf1[combodf1.keys()[x]] = combodf1[combodf1.keys()[x]].astype(str)
    #trim various typs of data to remove excess spaces
    combodf1 = combodf1.replace('\\s*Plan\\s*too\\s*new\\s*to\\s*be\\s*measured\\s*','Plan too new to be measured',regex=True)
    combodf1 = combodf1.replace('\\s*Plan\\s*too\\s*small\\s*to\\s*be\\s*measured\\s*','Plan too small to be measured',regex=True)
    combodf1 = combodf1.replace('\\s*Plan\\s*not\\s*required\\s*to\\s*report\\s*measure\\s*','Plan not required to report measure',regex=True)
    combodf1 = combodf1.replace('\\s*No\\s*data\\s*available\\s*','No data available',regex=True)
    combodf1 = combodf1.replace('Nodata available','No data available',regex=True)
    combodf1 = combodf1.replace('\\s*Not\\s*enough\\s*data\\s*available\\s*','Not enough data available',regex=True)
    combodf1 = combodf1.replace('\\s*Benefit\\s*not\\s*offered\\s*by\\s*plan\\s*','Benefit not offered by plan',regex=True)
    #change yes and not to true and false
    combodf1 = combodf1.replace('\\s*Yes\\s*',True,regex=True)    
    combodf1 = combodf1.replace('\\s+No\\s+',False,regex=True)
    combodf1 = combodf1.replace('\\s+No',False,regex=True)
    combodf1 = combodf1.replace('No\\s+',False,regex=True)
    #replace nans that were created by adding columns with 0s
    combodf1 = combodf1.replace('nan','0')
    #return the dataframe
    return combodf1

In [None]:
#function to be used to create non numeric columns
#takes in a dataframe item
#returns either that item or a string
def add_non_numeric_cols(item):
    #attempt to set type to decimal
    try:
        #if it was able to make a decimal return numeric
        val = float(item)
        return "Numeric"
    except:
        #otherwise return the string
        return item

In [None]:
#function to clean column of non numeric data
#takes in dataframe item
#returns 0 or the decimal version of item.
def change_to_dec(item):
    #attemp to set type to decimal
    try:
        #if successful return item
        val = float(item)
        return val
    except:
        #otherwise return 0
        return 0

In [None]:
#function to bucket standardized enrollment
#takes in dataframe item
#returns bucket of the data
def change_enrolled(item):
    #first bucket
    if item <.05:
        return 0
    #second bucket
    elif item < .3:
        return 1
    #third bucket
    elif item < .7:
        return 2
    #final bucket
    else:
        return 3

In [None]:
#function to add enrollment data to a dataframe
#takes in a dataframe
#returns dataframe combined with enrollment data
def combine_enrollment(newdf1):
    #read in enrollment data
    enrollmentcsv = pd.read_excel("./Data/Enrollment.xlsx")
    #read in state population
    statepop = pd.read_csv("./CSVs/StatePopulations.csv")
    #map state fips and abbreviation
    abfips = us.states.mapping('fips','abbr')
    #change mapping into dataframe
    abfips = pd.DataFrame.from_dict([abfips])
    abfips = abfips.transpose()
    abfips = abfips.reset_index()
    abfips = abfips.rename(columns={"index":"fips",0:"abbr"})
    #merge enrollment and state info
    enrollab = pd.merge(enrollmentcsv,abfips,left_on='State',right_on='abbr')
    enrollab = enrollab.drop(columns="abbr")
    enrollab['fips'] = enrollab['fips'].astype(int)
    #merge enrollement with state population
    enrollabpop = pd.merge(enrollab,statepop,left_on=['fips','Year'],right_on=['state','Year'])
    enrollabpop = enrollabpop.drop(columns=['state'])
    #find average enrollment in a year
    testgroup = pd.DataFrame(enrollabpop.groupby(['Contract Number','State','Year'])['Total Enrollments by State'].mean())
    testgroup = testgroup.reset_index()
    testgroup['Total Enrollments by State'] = testgroup['Total Enrollments by State'].round(0)
    #rename columns
    testgroup = testgroup.rename(columns={"Total Enrollments by State":"Avg Enrollment"})
    #merge average enrollment into enrollment
    enrollwavg = pd.merge(testgroup,enrollabpop,on=['Contract Number', 'State','Year'])
    #standardize enrollment for state population
    enrollwavg["Standardized Enrollment"] = enrollwavg['Avg Enrollment']/enrollwavg['Population Over 65']
    #remove unneccesary columns and drop duplicates that removing columns caused
    enrollwavg1 = enrollwavg.copy()
    enrollwavg1 = enrollwavg1.drop(columns=['Month','Total Enrollments by State'])
    enrollwavg1 = enrollwavg1.drop_duplicates()
    datatomerge = enrollwavg1.groupby(["Contract Number","Year"])['Standardized Enrollment'].mean()
    datatomerge = datatomerge.reset_index()
    #merge enrollment data and star rating dataframe
    finaldf = pd.merge(newdf1,datatomerge,on=['Contract Number','Year'])
    #bucket the standardized enrollment data
    finaldf["Standardized Enrollment"] = finaldf["Standardized Enrollment"].apply(change_enrolled)
    #change enrollment column to int
    finaldf["Standardized Enrollment"] = finaldf["Standardized Enrollment"].astype(int)
    #return new dataframe
    return finaldf


In [None]:
#Funtion to split numeric and non-numeric columns
#takes in a dataframe
#returns a dataframe with numeric and non-numeric columns
def split_numeric_nonnumeric_data(combodf4):
    #get list of columns
    cols = combodf4.keys().to_list()
    cols.append("Overall")
    #loop through first five columns
    for col in firstfive:
        cols.remove(col)
    #loop through last 2 columns
    for col in lasttwo:
        cols.remove(col)
    #loops througy columns in dataframe
    for col in cols:
        #add non-numeric columns
        combodf4[col+" Non-numeric"] = combodf4[col].apply(add_non_numeric_cols)
        #change old columns to numeric
        combodf4[col]=combodf4[col].apply(change_to_dec)
    newcols = []#placeholder for new column order
    #loop through columns
    for col in cols:
        #add numeric column
        newcols.append(col)
        #add non-numeric column
        newcols.append(col+" Non-numeric")
    finalcols =[]#placeholder for final column order
    #add columns to final columns
    for col in firstfive:
        finalcols.append(col)
    finalcols.append("Year")
    for col in newcols:
        finalcols.append(col)
    #change order of columns in dataframe
    newdf1 = combodf4[finalcols]
    return newdf1

In [None]:
combodf = standardize_data()
combodf = clean_data(combodf)

In [None]:
combodf = standardize_data()
combodf.to_csv("./CSVs/CombinedData.csv",header=True,index=False)

In [None]:
combodf = clean_data(combodf)
combodf.to_csv("./CSVs/CombinedDataCleaned.csv",header=True,index=False)

  combodf1 = combodf1.replace('\\s*No\\s*',False,regex=True)


In [None]:
newdf1=split_numeric_nonnumeric_data(combodf)
newdf1.to_csv("./CSVs/CombinedDataNoEnrollment.csv",header=True,index=False)

## **Pull Census Data**

In [1]:
#function to create df of state populations
#takes a list of population data and the year
#returns a data fram with state populations and the year of the population
def create_year_pop_df(ls,year):
    #create the df
    df = pd.DataFrame(ls)
    #create population column
    df['Population Over 65'] = df['B01001_007E']+df['B01001_008E']
    #create year column
    df['Year'] = year
    #drop unneeded "states"
    df.drop(df.index[df['NAME'] == 'District of Columbia'], inplace=True)
    df.drop(df.index[df['NAME'] == 'Puerto Rico'], inplace=True)
    #order columns and drop unneeded ones
    dfother = df[['NAME','state','Population Over 65', 'Year']]
    return dfother

In [2]:
#funtion tocombine all of the year data frames into one dataframe
#takes a list of population information
#returns a combined dataframe
def create_dfs(poy):
    #set years of state data
    years = [y for y in range(2014,2024)]
    #create starting df
    df1 = create_year_pop_df(poy[0],2014)
    #loop through the years of dataframes
    for x in range(1,len(years)):
        #create dataframe
        df2 = create_year_pop_df(poy[x],years[x])
        #concatinate dataframes
        df1 = pd.concat([df1,df2])
    return(df1)

In [3]:
#create census instance
c = Census(apikey)
#create mapping of fips and abbreviation
abfips = us.states.mapping('fips','abbr')
popsoveryear = []#placeholder for data
tracker = 0#keep track of iterations
#loop through years for data
for y in range(2014,2024):
    #pull data from census
    popsoveryear.append(c.acs5.get((fields),
          {'for': 'state:*'},year = y))
    #increment tracker
    tracker+=1
    #create the dataframe
df8 = create_dfs(popsoveryear)
#save dataframe
df8.to_csv('./CSVs/StatePopulations.csv',header=True,index=False)

NameError: name 'Census' is not defined

## Enrollment creation

In [None]:
# Create a Spark session
spark = SparkSession.builder \
    .appName("Enrollment_CSV_Merge") \
    .getOrCreate()

In [None]:
# Read in Enrollment files
enrollment_2013_01_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_01.csv", header=True, inferSchema=True)
enrollment_2013_02_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_02.csv", header=True, inferSchema=True)
enrollment_2013_03_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_03.csv", header=True, inferSchema=True)
enrollment_2013_04_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_04.csv", header=True, inferSchema=True)
enrollment_2013_05_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_05.csv", header=True, inferSchema=True)
enrollment_2013_06_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_06.csv", header=True, inferSchema=True)
enrollment_2013_07_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_07.csv", header=True, inferSchema=True)
enrollment_2013_08_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_08.csv", header=True, inferSchema=True)
enrollment_2013_09_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_09.csv", header=True, inferSchema=True)
enrollment_2013_10_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_10.csv", header=True, inferSchema=True)
enrollment_2013_11_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_11.csv", header=True, inferSchema=True)
enrollment_2013_12_df = spark.read.csv("./2013/CPSC_Enrollment_Info_2013_12.csv", header=True, inferSchema=True)
enrollment_2014_01_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_01.csv", header=True, inferSchema=True)
enrollment_2014_02_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_02.csv", header=True, inferSchema=True)
enrollment_2014_03_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_03.csv", header=True, inferSchema=True)
enrollment_2014_04_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_04.csv", header=True, inferSchema=True)
enrollment_2014_05_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_05.csv", header=True, inferSchema=True)
enrollment_2014_06_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_06.csv", header=True, inferSchema=True)
enrollment_2014_07_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_07.csv", header=True, inferSchema=True)
enrollment_2014_08_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_08.csv", header=True, inferSchema=True)
enrollment_2014_09_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_09.csv", header=True, inferSchema=True)
enrollment_2014_10_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_10.csv", header=True, inferSchema=True)
enrollment_2014_11_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_11.csv", header=True, inferSchema=True)
enrollment_2014_12_df = spark.read.csv("./2014/CPSC_Enrollment_Info_2014_12.csv", header=True, inferSchema=True)
enrollment_2015_01_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_01.csv", header=True, inferSchema=True)
enrollment_2015_02_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_02.csv", header=True, inferSchema=True)
enrollment_2015_03_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_03.csv", header=True, inferSchema=True)
enrollment_2015_04_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_04.csv", header=True, inferSchema=True)
enrollment_2015_05_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_05.csv", header=True, inferSchema=True)
enrollment_2015_06_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_06.csv", header=True, inferSchema=True)
enrollment_2015_07_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_07.csv", header=True, inferSchema=True)
enrollment_2015_08_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_08.csv", header=True, inferSchema=True)
enrollment_2015_09_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_09.csv", header=True, inferSchema=True)
enrollment_2015_10_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_10.csv", header=True, inferSchema=True)
enrollment_2015_11_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_11.csv", header=True, inferSchema=True)
enrollment_2015_12_df = spark.read.csv("./2015/CPSC_Enrollment_Info_2015_12.csv", header=True, inferSchema=True)
enrollment_2016_01_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_01.csv", header=True, inferSchema=True)
enrollment_2016_02_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_02.csv", header=True, inferSchema=True)
enrollment_2016_03_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_03.csv", header=True, inferSchema=True)
enrollment_2016_04_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_04.csv", header=True, inferSchema=True)
enrollment_2016_05_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_05.csv", header=True, inferSchema=True)
enrollment_2016_06_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_06.csv", header=True, inferSchema=True)
enrollment_2016_07_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_07.csv", header=True, inferSchema=True)
enrollment_2016_08_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_08.csv", header=True, inferSchema=True)
enrollment_2016_09_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_09.csv", header=True, inferSchema=True)
enrollment_2016_10_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_10.csv", header=True, inferSchema=True)
enrollment_2016_11_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_11.csv", header=True, inferSchema=True)
enrollment_2016_12_df = spark.read.csv("./2016/CPSC_Enrollment_Info_2016_12.csv", header=True, inferSchema=True)
enrollment_2017_01_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_01.csv", header=True, inferSchema=True)
enrollment_2017_02_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_02.csv", header=True, inferSchema=True)
enrollment_2017_07_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_07.csv", header=True, inferSchema=True)
enrollment_2017_08_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_08.csv", header=True, inferSchema=True)
enrollment_2017_09_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_09.csv", header=True, inferSchema=True)
enrollment_2017_10_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_10.csv", header=True, inferSchema=True)
enrollment_2017_12_df = spark.read.csv("./2017/CPSC_Enrollment_Info_2017_12.csv", header=True, inferSchema=True)
enrollment_2018_01_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_01.csv", header=True, inferSchema=True)
enrollment_2018_02_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_02.csv", header=True, inferSchema=True)
enrollment_2018_03_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_03.csv", header=True, inferSchema=True)
enrollment_2018_04_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_04.csv", header=True, inferSchema=True)
enrollment_2018_05_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_05.csv", header=True, inferSchema=True)
enrollment_2018_06_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_06.csv", header=True, inferSchema=True)
enrollment_2018_07_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_07.csv", header=True, inferSchema=True)
#enrollment_2018_08_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_08.csv", header=True, inferSchema=True)
enrollment_2018_09_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_09.csv", header=True, inferSchema=True)
enrollment_2018_10_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_10.csv", header=True, inferSchema=True)
enrollment_2018_11_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_11.csv", header=True, inferSchema=True)
enrollment_2018_12_df = spark.read.csv("./2018/CPSC_Enrollment_Info_2018_12.csv", header=True, inferSchema=True)
enrollment_2019_01_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_01.csv", header=True, inferSchema=True)
enrollment_2019_02_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_02.csv", header=True, inferSchema=True)
enrollment_2019_03_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_03.csv", header=True, inferSchema=True)
enrollment_2019_04_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_04.csv", header=True, inferSchema=True)
enrollment_2019_05_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_05.csv", header=True, inferSchema=True)
enrollment_2019_06_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_06.csv", header=True, inferSchema=True)
enrollment_2019_07_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_07.csv", header=True, inferSchema=True)
enrollment_2019_08_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_08.csv", header=True, inferSchema=True)
enrollment_2019_09_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_09.csv", header=True, inferSchema=True)
enrollment_2019_10_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_10.csv", header=True, inferSchema=True)
enrollment_2019_11_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_11.csv", header=True, inferSchema=True)
enrollment_2019_12_df = spark.read.csv("./2019/CPSC_Enrollment_Info_2019_12.csv", header=True, inferSchema=True)
enrollment_2020_01_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_01.csv", header=True, inferSchema=True)
enrollment_2020_02_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_02.csv", header=True, inferSchema=True)
enrollment_2020_03_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_03.csv", header=True, inferSchema=True)
enrollment_2020_04_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_04.csv", header=True, inferSchema=True)
enrollment_2020_05_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_05.csv", header=True, inferSchema=True)
enrollment_2020_06_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_06.csv", header=True, inferSchema=True)
enrollment_2020_07_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_07.csv", header=True, inferSchema=True)
enrollment_2020_08_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_08.csv", header=True, inferSchema=True)
enrollment_2020_09_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_09.csv", header=True, inferSchema=True)
enrollment_2020_10_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_10.csv", header=True, inferSchema=True)
enrollment_2020_11_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_11.csv", header=True, inferSchema=True)
enrollment_2020_12_df = spark.read.csv("./2020/CPSC_Enrollment_Info_2020_12.csv", header=True, inferSchema=True)
enrollment_2021_01_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_01.csv", header=True, inferSchema=True)
enrollment_2021_02_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_02.csv", header=True, inferSchema=True)
enrollment_2021_03_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_03.csv", header=True, inferSchema=True)
enrollment_2021_04_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_04.csv", header=True, inferSchema=True)
enrollment_2021_05_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_05.csv", header=True, inferSchema=True)
enrollment_2021_06_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_06.csv", header=True, inferSchema=True)
enrollment_2021_07_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_07.csv", header=True, inferSchema=True)
enrollment_2021_08_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_08.csv", header=True, inferSchema=True)
enrollment_2021_09_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_09.csv", header=True, inferSchema=True)
enrollment_2021_10_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_10.csv", header=True, inferSchema=True)
enrollment_2021_11_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_11.csv", header=True, inferSchema=True)
enrollment_2021_12_df = spark.read.csv("./2021/CPSC_Enrollment_Info_2021_12.csv", header=True, inferSchema=True)
enrollment_2022_01_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_01.csv", header=True, inferSchema=True)
enrollment_2022_02_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_02.csv", header=True, inferSchema=True)
enrollment_2022_03_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_03.csv", header=True, inferSchema=True)
enrollment_2022_04_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_04.csv", header=True, inferSchema=True)
enrollment_2022_05_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_05.csv", header=True, inferSchema=True)
enrollment_2022_06_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_06.csv", header=True, inferSchema=True)
enrollment_2022_07_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_07.csv", header=True, inferSchema=True)
enrollment_2022_08_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_08.csv", header=True, inferSchema=True)
enrollment_2022_09_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_09.csv", header=True, inferSchema=True)
enrollment_2022_10_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_10.csv", header=True, inferSchema=True)
enrollment_2022_11_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_11.csv", header=True, inferSchema=True)
enrollment_2022_12_df = spark.read.csv("./2022/CPSC_Enrollment_Info_2022_12.csv", header=True, inferSchema=True)
enrollment_2023_01_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_01.csv", header=True, inferSchema=True)
enrollment_2023_02_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_02.csv", header=True, inferSchema=True)
enrollment_2023_03_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_03.csv", header=True, inferSchema=True)
enrollment_2023_04_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_04.csv", header=True, inferSchema=True)
enrollment_2023_05_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_05.csv", header=True, inferSchema=True)
enrollment_2023_06_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_06.csv", header=True, inferSchema=True)
enrollment_2023_07_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_07.csv", header=True, inferSchema=True)
enrollment_2023_08_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_08.csv", header=True, inferSchema=True)
enrollment_2023_09_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_09.csv", header=True, inferSchema=True)
enrollment_2023_10_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_10.csv", header=True, inferSchema=True)
enrollment_2023_11_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_11.csv", header=True, inferSchema=True)
enrollment_2023_12_df = spark.read.csv("./2023/CPSC_Enrollment_Info_2023_12.csv", header=True, inferSchema=True)


In [None]:
# FINAL Code to combine enrollment data

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lit, sum
import os

# Start a Spark session
spark = SparkSession.builder.appName("EnrollmentProcessing").getOrCreate()

enrollment_dfs = {
    "enrollment_2013_01_df": enrollment_2013_01_df,
    "enrollment_2013_02_df": enrollment_2013_02_df,
    "enrollment_2013_03_df": enrollment_2013_03_df,
    "enrollment_2013_04_df": enrollment_2013_04_df,
    "enrollment_2013_05_df": enrollment_2013_05_df,
    "enrollment_2013_06_df": enrollment_2013_06_df,
    "enrollment_2013_07_df": enrollment_2013_07_df,
    "enrollment_2013_08_df": enrollment_2013_08_df,
    "enrollment_2013_09_df": enrollment_2013_09_df,
    "enrollment_2013_10_df": enrollment_2013_10_df,
    "enrollment_2013_11_df": enrollment_2013_11_df,
    "enrollment_2013_12_df": enrollment_2013_12_df,
    "enrollment_2014_01_df": enrollment_2014_01_df,
    "enrollment_2014_02_df": enrollment_2014_02_df,
    "enrollment_2014_03_df": enrollment_2014_03_df,
    "enrollment_2014_04_df": enrollment_2014_04_df,
    "enrollment_2014_05_df": enrollment_2014_05_df,
    "enrollment_2014_06_df": enrollment_2014_06_df,
    "enrollment_2014_07_df": enrollment_2014_07_df,
    "enrollment_2014_08_df": enrollment_2014_08_df,
    "enrollment_2014_09_df": enrollment_2014_09_df,
    "enrollment_2014_10_df": enrollment_2014_10_df,
    "enrollment_2014_11_df": enrollment_2014_11_df,
    "enrollment_2014_12_df": enrollment_2014_12_df,
    "enrollment_2015_01_df": enrollment_2015_01_df,
    "enrollment_2015_02_df": enrollment_2015_02_df,
    "enrollment_2015_03_df": enrollment_2015_03_df,
    "enrollment_2015_04_df": enrollment_2015_04_df,
    "enrollment_2015_05_df": enrollment_2015_05_df,
    "enrollment_2015_06_df": enrollment_2015_06_df,
    "enrollment_2015_07_df": enrollment_2015_07_df,
    "enrollment_2015_08_df": enrollment_2015_08_df,
    "enrollment_2015_09_df": enrollment_2015_09_df,
    "enrollment_2015_10_df": enrollment_2015_10_df,
    "enrollment_2015_11_df": enrollment_2015_11_df,
    "enrollment_2015_12_df": enrollment_2015_12_df,
    "enrollment_2016_01_df": enrollment_2016_01_df,
    "enrollment_2016_02_df": enrollment_2016_02_df,
    "enrollment_2016_03_df": enrollment_2016_03_df,
    "enrollment_2016_04_df": enrollment_2016_04_df,
    "enrollment_2016_05_df": enrollment_2016_05_df,
    "enrollment_2016_06_df": enrollment_2016_06_df,
    "enrollment_2016_07_df": enrollment_2016_07_df,
    "enrollment_2016_08_df": enrollment_2016_08_df,
    "enrollment_2016_09_df": enrollment_2016_09_df,
    "enrollment_2016_10_df": enrollment_2016_10_df,
    "enrollment_2016_11_df": enrollment_2016_11_df,
    "enrollment_2016_12_df": enrollment_2016_12_df,
    "enrollment_2017_01_df": enrollment_2017_01_df,
    "enrollment_2017_02_df": enrollment_2017_02_df,
    "enrollment_2017_07_df": enrollment_2017_07_df,
    "enrollment_2017_08_df": enrollment_2017_08_df,
    "enrollment_2017_09_df": enrollment_2017_09_df,
    "enrollment_2017_10_df": enrollment_2017_10_df,
    "enrollment_2017_12_df": enrollment_2017_12_df,
    "enrollment_2018_01_df": enrollment_2018_01_df,
    "enrollment_2018_02_df": enrollment_2018_02_df,
    "enrollment_2018_03_df": enrollment_2018_03_df,
    "enrollment_2018_04_df": enrollment_2018_04_df,
    "enrollment_2018_05_df": enrollment_2018_05_df,
    "enrollment_2018_06_df": enrollment_2018_06_df,
    "enrollment_2018_07_df": enrollment_2018_07_df,
    "enrollment_2018_09_df": enrollment_2018_09_df,
    "enrollment_2018_10_df": enrollment_2018_10_df,
    "enrollment_2018_11_df": enrollment_2018_11_df,
    "enrollment_2018_12_df": enrollment_2018_12_df,
    "enrollment_2019_01_df": enrollment_2019_01_df,
    "enrollment_2019_02_df": enrollment_2019_02_df,
    "enrollment_2019_03_df": enrollment_2019_03_df,
    "enrollment_2019_04_df": enrollment_2019_04_df,
    "enrollment_2019_05_df": enrollment_2019_05_df,
    "enrollment_2019_06_df": enrollment_2019_06_df,
    "enrollment_2019_07_df": enrollment_2019_07_df,
    "enrollment_2019_08_df": enrollment_2019_08_df,
    "enrollment_2019_09_df": enrollment_2019_09_df,
    "enrollment_2019_10_df": enrollment_2019_10_df,
    "enrollment_2019_11_df": enrollment_2019_11_df,
    "enrollment_2019_12_df": enrollment_2019_12_df,
    "enrollment_2020_01_df": enrollment_2020_01_df,
    "enrollment_2020_02_df": enrollment_2020_02_df,
    "enrollment_2020_03_df": enrollment_2020_03_df,
    "enrollment_2020_04_df": enrollment_2020_04_df,
    "enrollment_2020_05_df": enrollment_2020_05_df,
    "enrollment_2020_06_df": enrollment_2020_06_df,
    "enrollment_2020_07_df": enrollment_2020_07_df,
    "enrollment_2020_08_df": enrollment_2020_08_df,
    "enrollment_2020_09_df": enrollment_2020_09_df,
    "enrollment_2020_10_df": enrollment_2020_10_df,
    "enrollment_2020_11_df": enrollment_2020_11_df,
    "enrollment_2020_12_df": enrollment_2020_12_df,
    "enrollment_2021_01_df": enrollment_2021_01_df,
    "enrollment_2021_02_df": enrollment_2021_02_df,
    "enrollment_2021_03_df": enrollment_2021_03_df,
    "enrollment_2021_04_df": enrollment_2021_04_df,
    "enrollment_2021_05_df": enrollment_2021_05_df,
    "enrollment_2021_06_df": enrollment_2021_06_df,
    "enrollment_2021_07_df": enrollment_2021_07_df,
    "enrollment_2021_08_df": enrollment_2021_08_df,
    "enrollment_2021_09_df": enrollment_2021_09_df,
    "enrollment_2021_10_df": enrollment_2021_10_df,
    "enrollment_2021_11_df": enrollment_2021_11_df,
    "enrollment_2021_12_df": enrollment_2021_12_df,
    "enrollment_2022_01_df": enrollment_2022_01_df,
    "enrollment_2022_02_df": enrollment_2022_02_df,
    "enrollment_2022_03_df": enrollment_2022_03_df,
    "enrollment_2022_04_df": enrollment_2022_04_df,
    "enrollment_2022_05_df": enrollment_2022_05_df,
    "enrollment_2022_06_df": enrollment_2022_06_df,
    "enrollment_2022_07_df": enrollment_2022_07_df,
    "enrollment_2022_08_df": enrollment_2022_08_df,
    "enrollment_2022_09_df": enrollment_2022_09_df,
    "enrollment_2022_10_df": enrollment_2022_10_df,
    "enrollment_2022_11_df": enrollment_2022_11_df,
    "enrollment_2022_12_df": enrollment_2022_12_df,
    "enrollment_2023_01_df": enrollment_2023_01_df,
    "enrollment_2023_02_df": enrollment_2023_02_df,
    "enrollment_2023_03_df": enrollment_2023_03_df,
    "enrollment_2023_04_df": enrollment_2023_04_df,
    "enrollment_2023_05_df": enrollment_2023_05_df,
    "enrollment_2023_06_df": enrollment_2023_06_df,
    "enrollment_2023_07_df": enrollment_2023_07_df,
    "enrollment_2023_08_df": enrollment_2023_08_df,
    "enrollment_2023_09_df": enrollment_2023_09_df,
    "enrollment_2023_10_df": enrollment_2023_10_df,
    "enrollment_2023_11_df": enrollment_2023_11_df,
    "enrollment_2023_12_df": enrollment_2023_12_df,
}

# List of columns to drop
col_drop = ['SSA State County Code', 'FIPS State County Code', 'County']

# Define my output directory to confirm it exists
output_dir = "./PySpark_Combined_Enrollment_v3"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Create empty DataFrame for combining all results by month/year/contract number
final_combined_df = None

# Loop through each DataFrame and drop the columns highlighted above
for file, df in enrollment_dfs.items():
    year_month = file.split("_")
    year_value, month_value = year_month[1], year_month[2]

    # Make sure 'Enrollment' is a number (not an object)
    df = df.withColumn("Enrollment", col("Enrollment").cast("double"))

    # Add columns for Year and Month so we know what data we're looking at once combined
    df = df.withColumn("Year", lit(year_value)).withColumn("Month", lit(month_value))

    # Drop defined columns
    df = df.drop(*col_drop)

    # Group by and sum enrollments
    df = df.groupby("Contract Number", "State", "Year", "Month") \
           .agg(sum("Enrollment").alias("Total Enrollments by State"))

    # Remove null values to cut back on space 
    df = df.filter(col("Total Enrollments by State").isNotNull())

    # Combine DataFrames
    final_combined_df = df if final_combined_df is None else final_combined_df.union(df)

# Write the final combined DataFrame to CSV
if final_combined_df is not None:
    final_combined_df.coalesce(1).write.mode("overwrite").option("header", "true").csv(output_dir + "/combined_enrollment")
    print(f"Saved all combined data to {output_dir}/combined_enrollment")
else:
    print("No data to save.")

Saved all combined data to ./PySpark_Combined_Enrollment_v3/combined_enrollment


In [None]:
# Stop Spark session
spark.stop()

In [None]:
finaldf = combine_enrollment(newdf1)
finaldf.to_csv("./CSVs/FinalData.csv",header=True,index=False)

NameError: name 'combine_enrollment' is not defined

## Models

### Keras Tuner

In [None]:
# Import our dependencies
import pandas as pd
import matplotlib.pyplot as plt
import sklearn as skl
import tensorflow as tf

#read in data for tuner
finaldf = pd.read_csv("./CSVs/FinalData.csv")
#drop catagorical columns
finaldf1 = finaldf.copy()
finaldf1 = finaldf1.drop(columns=["Contract Number","Organization Type","Contract Name","Organization Marketing Name","Parent Organization"])
#add dummies
finaldf1 = pd.get_dummies(finaldf1)

# Separate the y variable
y = finaldf1["Standardized Enrollment"]
# Separate the X variable, the features
X = finaldf1.drop(columns="Standardized Enrollment")

# Use sklearn to split dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

# Create scaler instance
X_scaler = skl.preprocessing.StandardScaler()

# Fit the scaler
X_scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [13]:
# Create a method that creates a new Sequential model with hyperparameter options
def create_model(hp):
    nn_model = tf.keras.models.Sequential()

    # Allow kerastuner to decide which activation function to use in hidden layers
    activation = hp.Choice('activation',['relu','tanh','sigmoid','leaky_relu'])

    # Allow kerastuner to decide number of neurons in first layer
    nn_model.add(tf.keras.layers.Dense(units=hp.Int('first_units',
        min_value=1,
        max_value=10,
        step=2), activation=activation, input_dim=417))

    # Allow kerastuner to decide number of hidden layers and neurons in hidden layers
    for i in range(hp.Int('num_layers', 1, 6)):
        nn_model.add(tf.keras.layers.Dense(units=hp.Int('units_' + str(i),
            min_value=1,
            max_value=10,
            step=2),
            activation=activation))

    nn_model.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

    # Compile the model
    nn_model.compile(loss="binary_crossentropy", optimizer='adam', metrics=["accuracy"])

    return nn_model

In [14]:
# Import the keras-tuner library
import keras_tuner as kt
tuner = kt.Hyperband(
    create_model, 
    seed=42,
    objective="val_accuracy",
    max_epochs=20,
    hyperband_iterations=2,
    directory="tune_dir",
    project_name="tensorworld",)

Reloading Tuner from tune_dir\tensorworld\tuner0.json


In [15]:
# Run the kerastuner search for best hyperparameters
tuner.search(X_train_scaled,y_train,epochs=20,validation_data=(X_test_scaled,y_test))

In [16]:
# Get best model hyperparameters
best_hyper = tuner.get_best_hyperparameters(1)[0]
best_hyper.values

{'activation': 'tanh',
 'first_units': 1,
 'num_layers': 4,
 'units_0': 1,
 'units_1': 9,
 'units_2': 9,
 'units_3': 3,
 'units_4': 7,
 'units_5': 7,
 'tuner/epochs': 20,
 'tuner/initial_epoch': 0,
 'tuner/bracket': 0,
 'tuner/round': 0}

In [17]:
# Evaluate best model against full test data
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

best_model = tuner.get_best_models(1)[0]
model_loss, model_accuracy = best_model.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

35/35 - 0s - 8ms/step - accuracy: 0.7265 - loss: 0.2175
Loss: 0.21748434007167816, Accuracy: 0.7265343070030212


In [18]:
#save model
best_model.save("./Models/BestKerasModel.h5")



### Random Forest

In [None]:
#read in data to run model against
finaldf = pd.read_csv("./CSVs/FinalData.csv")

In [None]:
#drop catagorical columns
finaldf1 = finaldf.copy()
finaldf1 = finaldf1.drop(columns=["Contract Number","Organization Type","Contract Name","Organization Marketing Name","Parent Organization"])
finaldf1 = pd.get_dummies(finaldf1)

In [None]:
# Separate the y variable
y = finaldf1["Standardized Enrollment"]
# Separate the X variable, the features
X = finaldf1.drop(columns="Standardized Enrollment")

In [None]:
# Split the data using train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=1)

In [None]:
# Create the random forest classifier instance
rf_model = RandomForestClassifier(n_estimators=500, random_state=78)

In [None]:
# Fit the model
rf_model = rf_model.fit(X_train, y_train.ravel())

  rf_model = rf_model.fit(X_train, y_train.ravel())


In [None]:
#create predictions
predictions = rf_model.predict(X_test)

In [None]:
#test accuracy of the model
acc_score = accuracy_score(y_test, predictions)
print(f"Accuracy Score : {acc_score}")

Accuracy Score : 0.7102888086642599


In [None]:
#find importance of features
importances = rf_model.feature_importances_
#order features based on importance
importances_sorted = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)
importances_sorted[:10]

[(0.027039949114824356,
  'C Osteoporosis Management in Women who had a Fracture'),
 (0.02527450134872629, 'C Rheumatoid Arthritis Management'),
 (0.02231341374282398, 'D Members Choosing to Leave the Plan'),
 (0.018856959507036974, 'D Appeals Upheld'),
 (0.0185580470284319, 'C Reviewing Appeals Decisions'),
 (0.017216119381393977,
  'C Osteoporosis Management in Women who had a Fracture Non-numeric_Numeric'),
 (0.015732643930078432, 'D Getting Needed Prescription Drugs'),
 (0.01521858061400955, 'DD3: Member Experience with the Drug Plan'),
 (0.015171489728994788, 'HD5: Health Plan Customer Service'),
 (0.01516747825831993, 'C Care Coordination')]

In [None]:
#create dataframe of importances
importancedf = pd.DataFrame(importances_sorted)
#label columns
importancedf = importancedf.rename(columns={0:"Weight",1:"Measurement"})
importancedf.head(10)

Unnamed: 0,Weight,Measurement
0,0.02704,C Osteoporosis Management in Women who had a F...
1,0.025275,C Rheumatoid Arthritis Management
2,0.022313,D Members Choosing to Leave the Plan
3,0.018857,D Appeals Upheld
4,0.018558,C Reviewing Appeals Decisions
5,0.017216,C Osteoporosis Management in Women who had a F...
6,0.015733,D Getting Needed Prescription Drugs
7,0.015219,DD3: Member Experience with the Drug Plan
8,0.015171,HD5: Health Plan Customer Service
9,0.015167,C Care Coordination


In [None]:
joblib.dump(rf_model,"./Models/RandomForestEnrollment.joblib")

['./Models/RandomForestEnrollment.joblib']

In [None]:
#export wieght of columns
importancedf.to_csv("./CSVs/RandomforestWeights.csv",header=True,index=False)

### Deep Learning Models

In [None]:
# Read the csv in for data set
combined_data_set = pd.read_csv('./CSVs/FinalData.csv')
combined_data_set.head(26)

In [None]:
# Dropping columns: 'Contract Number', 'Organization Type', 'Contract Name', 'Organization Marketing Name', 'Parent Organization'.
combined_data_cleaned_df = combined_data_set.drop(columns = ['Contract Number', 'Organization Type', 'Contract Name', 'Organization Marketing Name', 'Parent Organization'])
combined_data_cleaned_df

Unnamed: 0,Year,C Breast Cancer Screening,C Breast Cancer Screening Non-numeric,C Colorectal Cancer Screening,C Colorectal Cancer Screening Non-numeric,C Cardiovascular Care – Cholesterol Screening,C Cardiovascular Care – Cholesterol Screening Non-numeric,C Diabetes Care – Cholesterol Screening,C Diabetes Care – Cholesterol Screening Non-numeric,C Glaucoma Testing,...,D Statin Use in Persons with Diabetes (SUPD) Non-numeric,C Controlling High Blood Pressure,C Controlling High Blood Pressure Non-numeric,C Transitions of Care,C Transitions of Care Non-numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,Overall,Overall Non-numeric,Standardized Enrollment
0,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
1,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
2,2014,2.0,Numeric,2.0,Numeric,4.0,Numeric,4.0,Numeric,2.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,1
3,2014,3.0,Numeric,4.0,Numeric,5.0,Numeric,4.0,Numeric,2.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,2
4,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2
4426,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4427,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Plan too new to be measured,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4428,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2


In [None]:
# Finding the number of unique values in each column.
combined_data_cleaned_df.nunique()

Unnamed: 0,0
Year,7
C Breast Cancer Screening,6
C Breast Cancer Screening Non-numeric,5
C Colorectal Cancer Screening,6
C Colorectal Cancer Screening Non-numeric,5
...,...
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,1
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,1
Overall,8
Overall Non-numeric,4


In [None]:
# Checking and object(s) in the data sets
categorical_data = combined_data_cleaned_df.dtypes[combined_data_cleaned_df.dtypes == 'object'].index.tolist()
categorical_data

['C Breast Cancer Screening Non-numeric',
 'C Colorectal Cancer Screening Non-numeric',
 'C Cardiovascular Care – Cholesterol Screening Non-numeric',
 'C Diabetes Care – Cholesterol Screening Non-numeric',
 'C Glaucoma Testing Non-numeric',
 'C Annual Flu Vaccine Non-numeric',
 'C Improving or Maintaining Physical Health Non-numeric',
 'C Improving or Maintaining Mental Health Non-numeric',
 'C Monitoring Physical Activity Non-numeric',
 'C Adult BMI Assessment Non-numeric',
 'C Care for Older Adults – Medication Review Non-numeric',
 'C Care for Older Adults – Functional Status Assessment Non-numeric',
 'C Care for Older Adults – Pain Screening Non-numeric',
 'C Osteoporosis Management in Women who had a Fracture Non-numeric',
 'C Diabetes Care – Eye Exam Non-numeric',
 'C Diabetes Care – Kidney Disease Monitoring Non-numeric',
 'C Diabetes Care – Blood Sugar Controlled Non-numeric',
 'C Diabetes Care – Cholesterol Controlled Non-numeric',
 'C Controlling Blood Pressure Non-numeric',


In [None]:
# Confirming the dtypes
combined_data_cleaned_df.dtypes

Unnamed: 0,0
Year,int64
C Breast Cancer Screening,float64
C Breast Cancer Screening Non-numeric,object
C Colorectal Cancer Screening,float64
C Colorectal Cancer Screening Non-numeric,object
...,...
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,float64
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,object
Overall,float64
Overall Non-numeric,object


In [None]:
# Subtracting 1 value from the 'Standardized Enrollment'
combined_data_cleaned_df['Standardized Enrollment'] = combined_data_cleaned_df['Standardized Enrollment'] -1
combined_data_cleaned_df

Unnamed: 0,Year,C Breast Cancer Screening,C Breast Cancer Screening Non-numeric,C Colorectal Cancer Screening,C Colorectal Cancer Screening Non-numeric,C Cardiovascular Care – Cholesterol Screening,C Cardiovascular Care – Cholesterol Screening Non-numeric,C Diabetes Care – Cholesterol Screening,C Diabetes Care – Cholesterol Screening Non-numeric,C Glaucoma Testing,...,D Statin Use in Persons with Diabetes (SUPD) Non-numeric,C Controlling High Blood Pressure,C Controlling High Blood Pressure Non-numeric,C Transitions of Care,C Transitions of Care Non-numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,Overall,Overall Non-numeric,Standardized Enrollment
0,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,0
1,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,0
2,2014,2.0,Numeric,2.0,Numeric,4.0,Numeric,4.0,Numeric,2.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,0
3,2014,3.0,Numeric,4.0,Numeric,5.0,Numeric,4.0,Numeric,2.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,1
4,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4426,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,0
4427,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Plan too new to be measured,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,0
4428,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,0.0,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1


In [None]:
# Convert categorical data to numeric with `pd.get_dummies`dd
cleaned_dummies_df = pd.get_dummies(combined_data_cleaned_df)
cleaned_dummies_df

Unnamed: 0,Year,C Breast Cancer Screening,C Colorectal Cancer Screening,C Cardiovascular Care – Cholesterol Screening,C Diabetes Care – Cholesterol Screening,C Glaucoma Testing,C Annual Flu Vaccine,C Improving or Maintaining Physical Health,C Improving or Maintaining Mental Health,C Monitoring Physical Activity,...,D Statin Use in Persons with Diabetes (SUPD) Non-numeric_Numeric,D Statin Use in Persons with Diabetes (SUPD) Non-numeric_Plan not required to report measure,D Statin Use in Persons with Diabetes (SUPD) Non-numeric_Plan too new to be measured,C Controlling High Blood Pressure Non-numeric_Numeric,C Transitions of Care Non-numeric_Numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric_Numeric,Overall Non-numeric_Not Applicable,Overall Non-numeric_Not enough data available,Overall Non-numeric_Numeric,Overall Non-numeric_Plan too new to be measured
0,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,True,False,False,False,True
1,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,True,False,False,False,True
2,2014,2.0,2.0,4.0,4.0,2.0,2.0,0.0,0.0,2.0,...,True,False,False,True,True,True,False,False,True,False
3,2014,3.0,4.0,5.0,4.0,2.0,3.0,4.0,2.0,2.0,...,True,False,False,True,True,True,False,False,True,False
4,2014,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,True,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,True,True,False,False,False
4426,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,True,True,False,False,False
4427,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,False,False,True,True,True,True,True,False,False,False
4428,2020,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,True,False,False,True,True,True,True,False,False,False


In [None]:
# Seeing each column after get.dummies
cleaned_dummies_df.columns

Index(['Year', 'C Breast Cancer Screening', 'C Colorectal Cancer Screening',
       'C Cardiovascular Care – Cholesterol Screening',
       'C Diabetes Care – Cholesterol Screening', 'C Glaucoma Testing',
       'C Annual Flu Vaccine', 'C Improving or Maintaining Physical Health',
       'C Improving or Maintaining Mental Health',
       'C Monitoring Physical Activity',
       ...
       'D Statin Use in Persons with Diabetes (SUPD) Non-numeric_Numeric',
       'D Statin Use in Persons with Diabetes (SUPD) Non-numeric_Plan not required to report measure',
       'D Statin Use in Persons with Diabetes (SUPD) Non-numeric_Plan too new to be measured',
       'C Controlling High Blood Pressure Non-numeric_Numeric',
       'C Transitions of Care Non-numeric_Numeric',
       'C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric_Numeric',
       'Overall Non-numeric_Not Applicable',
       'Overall Non-numeric_Not enough data availabl

In [None]:
# Split our preprocessed data into our features and target arrays
X = cleaned_dummies_df.drop(['Standardized Enrollment'], axis = 'columns').values
y = cleaned_dummies_df['Standardized Enrollment'].values

# Split the preprocessed data into a training and testing dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 15)

In [None]:
# Checking X_train
X_train

array([[2018, 4.0, 4.0, ..., False, True, False],
       [2019, 0.0, 0.0, ..., False, False, True],
       [2020, 3.0, 3.0, ..., False, True, False],
       ...,
       [2017, 5.0, 4.0, ..., False, False, False],
       [2018, 3.0, 3.0, ..., False, True, False],
       [2020, 4.0, 4.0, ..., False, True, False]], dtype=object)

In [None]:
# Checking X_test
X_test

array([[2018, 3.0, 3.0, ..., False, True, False],
       [2017, 0.0, 3.0, ..., False, True, False],
       [2020, 4.0, 3.0, ..., False, True, False],
       ...,
       [2018, 0.0, 0.0, ..., False, False, True],
       [2020, 3.0, 3.0, ..., False, True, False],
       [2020, 5.0, 4.0, ..., False, True, False]], dtype=object)

In [None]:
# Making sure of y_train dtypes
y_train.dtype

dtype('int64')

In [None]:
# Making sure of y_test dtypes
y_test.dtype

dtype('int64')

Training and Executing Model

In [None]:
# Create a StandardScaler instances
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Looking to see the input values for X_train
input_values = len(X_train[0])
input_values

417

In [None]:
# Attempt No. 1: Accuracy 72%
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_values = len(X_train[0])
nodes_hidden_layer1 = 75
nodes_hidden_layer2 = 50
nodes_hidden_layer3 = 25
nodes_hidden_layer4 = 10

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer1, input_dim = input_values, activation = 'relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer2, activation = 'relu'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer3, activation = 'tanh'))

# Fourth hidden layer
#nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer4, activation = 'tanh'))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))


# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train the model
training_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6626 - loss: 0.4213
Epoch 2/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7155 - loss: 0.1619
Epoch 3/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7119 - loss: 0.0233
Epoch 4/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7042 - loss: 0.0226
Epoch 5/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7287 - loss: -0.0315
Epoch 6/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7425 - loss: -0.1879
Epoch 7/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7101 - loss: -0.2531
Epoch 8/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7262 - loss: -0.3561
Epoch 9/100
[1m104/104[0m 

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

35/35 - 0s - 7ms/step - accuracy: 0.7229 - loss: -5.8462e+00
Loss: -5.846236228942871, Accuracy: 0.7229241728782654


In [None]:
# Attempt No. 2: Accuracy 72%
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_values = len(X_train[0])
nodes_hidden_layer1 = 80
nodes_hidden_layer2 = 45
nodes_hidden_layer3 = 20
nodes_hidden_layer4 = 5

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer1, input_dim = input_values, activation = 'relu'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer2, activation = 'tanh'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer3, activation = 'tanh'))

# Fourth hidden layer
#nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer4, activation = 'tanh'))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))


# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train the model
training_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6609 - loss: 0.3546
Epoch 2/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7231 - loss: 0.2062
Epoch 3/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7281 - loss: 0.1042
Epoch 4/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7143 - loss: 0.0245
Epoch 5/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7208 - loss: -0.0384
Epoch 6/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7358 - loss: -0.0503
Epoch 7/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7166 - loss: -0.1709
Epoch 8/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7265 - loss: -0.2383
Epoch 9/100
[1m104/104[0m 

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

35/35 - 0s - 7ms/step - accuracy: 0.7202 - loss: -4.2634e+00
Loss: -4.263357639312744, Accuracy: 0.7202166318893433


In [None]:
# Attempt No. 3: Accuracy 72%
# Define the model - deep neural net, i.e., the number of input features and hidden nodes for each layer.
input_values = len(X_train[0])
nodes_hidden_layer1 = 90
nodes_hidden_layer2 = 65
nodes_hidden_layer3 = 30
nodes_hidden_layer4 = 13

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer1, input_dim = input_values, activation = 'sigmoid'))

# Second hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer2, activation = 'tanh'))

# Third hidden layer
nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer3, activation = 'tanh'))

# Fourth hidden layer
#nn.add(tf.keras.layers.Dense(units = nodes_hidden_layer4, activation = 'tanh'))

# Output layer
nn.add(tf.keras.layers.Dense(units = 1, activation = 'sigmoid'))


# Check the structure of the model
nn.summary()

In [None]:
# Compile the model
nn.compile(loss = 'binary_crossentropy', optimizer = 'adam', metrics = ['accuracy'])

In [None]:
# Train the model
training_model = nn.fit(X_train_scaled, y_train, epochs=100)

Epoch 1/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 2ms/step - accuracy: 0.6452 - loss: 0.3930
Epoch 2/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7172 - loss: 0.1343
Epoch 3/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7110 - loss: -0.0136
Epoch 4/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.7020 - loss: -0.0469
Epoch 5/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7185 - loss: -0.0936
Epoch 6/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step - accuracy: 0.7020 - loss: -0.1890
Epoch 7/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7239 - loss: -0.3624
Epoch 8/100
[1m104/104[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7287 - loss: -0.6120
Epoch 9/100
[1m104/104[0

In [None]:
# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

35/35 - 0s - 7ms/step - accuracy: 0.7301 - loss: -6.2224e+00
Loss: -6.2224016189575195, Accuracy: 0.7301443815231323


In [None]:
# Export our model to HDF5 file
nn.save('Project4_medical_stars_colab.keras')

#### Run 2 Linear Regression Models:
#### The first using 'Overall' as the target variable in order to determine accuracy of predicting plan quality (RESULTS: 98% Accuracy)
#### The second using 'Standardized Enrollment' as the target variable in order to determine accuracy of predicting enrollment success

In [None]:
# Read csvs
enrollment_df = pd.read_excel("./Data/Enrollment.xlsx")
final_data_df = pd.read_csv("./CSVs/FinalData.csv")
state_pop_df = pd.read_csv("./CSVs/StatePopulations.csv")

In [None]:
final_data_df.head()

Unnamed: 0,Contract Number,Organization Type,Contract Name,Organization Marketing Name,Parent Organization,Year,C Breast Cancer Screening,C Breast Cancer Screening Non-numeric,C Colorectal Cancer Screening,C Colorectal Cancer Screening Non-numeric,...,D Statin Use in Persons with Diabetes (SUPD) Non-numeric,C Controlling High Blood Pressure,C Controlling High Blood Pressure Non-numeric,C Transitions of Care,C Transitions of Care Non-numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,Overall,Overall Non-numeric,Standardized Enrollment
0,H0022,Demo,"BUCKEYE COMMUNITY HEALTH PLAN, INC.",Buckeye Community Health Plan - MyCareOhio,Centene Corporation,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
1,H0028,Local CCP,"CHA HMO, INC.","CHA HMO, Inc.",Humana Inc.,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
2,H0084,Local CCP,CARE IMPROVEMENT PLUS OF TEXAS INSURANCE COMPANY,Care Improvement Plus,"UnitedHealth Group, Inc.",2014,2.0,Numeric,2.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,1
3,H0104,Local CCP,BLUE CROSS AND BLUE SHIELD OF ALABAMA,Blue Advantage (PPO),BlueCross BlueShield of Alabama,2014,3.0,Numeric,4.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,2
4,H0107,Local CCP,"BLUE CROSS AND BLUE SHIELD OF MONTANA, INC.",Blue Cross and Blue Shield of Montana,"BLUE CROSS AND BLUE SHIELD OF MONTANA, INC.",2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1


#### > Build logistic regression model using the 'overall' column as the target varibale in order to measure/predict plan quality
#### > Convert 'Overall' columns to binary variable for the model
#### > Select all other columns as features
#### > Convert the features into a single column
#### > Split data into test and train
#### > Train

In [None]:
# Drop the non-beneficial ID columns
final_data_df = final_data_df.drop(columns= ['Contract Number', 'Organization Type', 'Contract Name', 'Organization Marketing Name'])
final_data_df

Unnamed: 0,Parent Organization,Year,C Breast Cancer Screening,C Breast Cancer Screening Non-numeric,C Colorectal Cancer Screening,C Colorectal Cancer Screening Non-numeric,C Cardiovascular Care – Cholesterol Screening,C Cardiovascular Care – Cholesterol Screening Non-numeric,C Diabetes Care – Cholesterol Screening,C Diabetes Care – Cholesterol Screening Non-numeric,...,D Statin Use in Persons with Diabetes (SUPD) Non-numeric,C Controlling High Blood Pressure,C Controlling High Blood Pressure Non-numeric,C Transitions of Care,C Transitions of Care Non-numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,Overall,Overall Non-numeric,Standardized Enrollment
0,Centene Corporation,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
1,Humana Inc.,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
2,"UnitedHealth Group, Inc.",2014,2.0,Numeric,2.0,Numeric,4.0,Numeric,4.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,1
3,BlueCross BlueShield of Alabama,2014,3.0,Numeric,4.0,Numeric,5.0,Numeric,4.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,2
4,"BLUE CROSS AND BLUE SHIELD OF MONTANA, INC.",2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,Rite Aid Corporation,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2
4426,Capital BlueCross,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4427,Anthem Inc.,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Plan too new to be measured,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4428,"UnitedHealth Group, Inc.",2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2


In [None]:
# Determine the number of unique values in each column.
final_data_df.nunique()

Parent Organization                                                                                               364
Year                                                                                                                7
C Breast Cancer Screening                                                                                           6
C Breast Cancer Screening Non-numeric                                                                               5
C Colorectal Cancer Screening                                                                                       6
                                                                                                                 ... 
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions                  1
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric      1
Overall                                                 

In [None]:
# Convert target variable in the model to binary - 'overall'
final_data_df["Overall_Binary"] = np.where(final_data_df["Overall"] >= 3, 1, 0)  # 1 if overall star rating is 3-5, 0 if rating is 0-2
final_data_df

Unnamed: 0,Parent Organization,Year,C Breast Cancer Screening,C Breast Cancer Screening Non-numeric,C Colorectal Cancer Screening,C Colorectal Cancer Screening Non-numeric,C Cardiovascular Care – Cholesterol Screening,C Cardiovascular Care – Cholesterol Screening Non-numeric,C Diabetes Care – Cholesterol Screening,C Diabetes Care – Cholesterol Screening Non-numeric,...,C Controlling High Blood Pressure,C Controlling High Blood Pressure Non-numeric,C Transitions of Care,C Transitions of Care Non-numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,Overall,Overall Non-numeric,Standardized Enrollment,Overall_Binary
0,Centene Corporation,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1,0
1,Humana Inc.,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1,0
2,"UnitedHealth Group, Inc.",2014,2.0,Numeric,2.0,Numeric,4.0,Numeric,4.0,Numeric,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,1,1
3,BlueCross BlueShield of Alabama,2014,3.0,Numeric,4.0,Numeric,5.0,Numeric,4.0,Numeric,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,2,1
4,"BLUE CROSS AND BLUE SHIELD OF MONTANA, INC.",2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,Rite Aid Corporation,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2,0
4426,Capital BlueCross,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1,0
4427,Anthem Inc.,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1,0
4428,"UnitedHealth Group, Inc.",2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2,0


In [None]:
# Grab numberic columns to set as features list
feature_cols = final_data_df.select_dtypes(include=[np.number]).columns.tolist()
print(feature_cols)

['Year', 'C Breast Cancer Screening', 'C Colorectal Cancer Screening', 'C Cardiovascular Care – Cholesterol Screening', 'C Diabetes Care – Cholesterol Screening', 'C Glaucoma Testing', 'C Annual Flu Vaccine', 'C Improving or Maintaining Physical Health', 'C Improving or Maintaining Mental Health', 'C Monitoring Physical Activity', 'C Adult BMI Assessment', 'C Care for Older Adults – Medication Review', 'C Care for Older Adults – Functional Status Assessment', 'C Care for Older Adults – Pain Screening', 'C Osteoporosis Management in Women who had a Fracture', 'C Diabetes Care – Eye Exam', 'C Diabetes Care – Kidney Disease Monitoring', 'C Diabetes Care – Blood Sugar Controlled', 'C Diabetes Care – Cholesterol Controlled', 'C Controlling Blood Pressure', 'C Rheumatoid Arthritis Management', 'C Improving Bladder Control', 'C Reducing the Risk of Falling', 'C Plan All-Cause Readmissions', 'C Getting Needed Care', 'C Getting Appointments and Care Quickly', 'C Customer Service', 'C Rating of 

In [None]:
# Select featuers
feature_cols = [
'Year', 'C Breast Cancer Screening', 'C Colorectal Cancer Screening', 'C Cardiovascular Care – Cholesterol Screening', 
'C Diabetes Care – Cholesterol Screening', 'C Glaucoma Testing', 'C Annual Flu Vaccine', 'C Improving or Maintaining Physical Health', 
'C Improving or Maintaining Mental Health', 'C Monitoring Physical Activity', 'C Adult BMI Assessment', 
'C Care for Older Adults – Medication Review', 'C Care for Older Adults – Functional Status Assessment', 
'C Care for Older Adults – Pain Screening', 'C Osteoporosis Management in Women who had a Fracture', 'C Diabetes Care – Eye Exam', 
'C Diabetes Care – Kidney Disease Monitoring', 'C Diabetes Care – Blood Sugar Controlled', 'C Diabetes Care – Cholesterol Controlled', 
'C Controlling Blood Pressure', 'C Rheumatoid Arthritis Management', 'C Improving Bladder Control', 'C Reducing the Risk of Falling', 
'C Plan All-Cause Readmissions', 'C Getting Needed Care', 'C Getting Appointments and Care Quickly', 'C Customer Service', 
'C Rating of Health Care Quality', 'C Rating of Health Plan', 'C Care Coordination', 'C Complaints about the Health Plan', 
'C Beneficiary Access and Performance Problems', 'C Members Choosing to Leave the Plan', 'C Health Plan Quality Improvement', 
'C Plan Makes Timely Decisions about Appeals', 'C Reviewing Appeals Decisions', 'C Call Center – Foreign Language Interpreter and TTY Availability', 
'HD1: Staying Healthy: Screenings, Tests and Vaccines', 'HD2: Managing Chronic (Long Term) Conditions', 
'HD3: Member Experience with Health Plan', "HD4: Member Complaints, Problems Getting Services, and Improvement in the Health Plan's Performance", 
'HD5: Health Plan Customer Service', 'SNP', 'Part C Summary', 'D Call Center – Foreign Language Interpreter and TTY Availability', 
'D Appeals Auto–Forward', 'D Appeals Upheld', 'D Complaints about the Drug Plan', 'D Beneficiary Access and Performance Problems', 
'D Members Choosing to Leave the Plan', 'D Drug Plan Quality Improvement', 'D Rating of Drug Plan', 'D Getting Needed Prescription Drugs', 
'D MPF Price Accuracy', 'D High Risk Medication', 'D Diabetes Treatment', 'D Medication Adherence for Diabetes Medications ', 
'D Medication Adherence for Hypertension (RAS antagonists) ', 'D Medication Adherence for Cholesterol (Statins) ', 
'DD1: Drug Plan Customer Service', 'DD2: Member Complaints, Problems Getting Services, and Improvement in the Drug Plan’s Performance ', 
'DD3: Member Experience with the Drug Plan', 'DD4: Patient Safety and Accuracy of Drug Pricing', 'Part D Summary', 
'C Special Needs Plan (SNP) Care Management', 'C Care for Older Adults – Pain Assessment', 
"HD4: Member Complaints and Changes in the Health Plan's Performance", 'DD2: Member Complaints and Changes in the Drug Plan’s Performance', 
'DD4: Drug Safety and Accuracy of Drug Pricing', 'D Medication Adherence for Diabetes Medications', 
'D Medication Adherence for Hypertension (RAS antagonists)', 'D Medication Adherence for Cholesterol (Statins)', 
'D MTM Program Completion Rate for CMR', 'C Medication Reconciliation Post-Discharge', 
'C Statin Therapy for Patients with Cardiovascular Disease', 'D Statin Use in Persons with Diabetes (SUPD)', 
'C Controlling High Blood Pressure', 'C Transitions of Care', 'C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions', 
'Standardized Enrollment'
] 

In [None]:
# Show feature vs target variable
y = final_data_df["Overall_Binary"]
X = final_data_df[feature_cols]

In [None]:
# Use scalar to normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split data in to test and training data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Train the data using the Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

In [None]:
# Make predictions
y_pred = model.predict(X_test)
y_pred_probability = model.predict_proba(X_test)[:, 1]

In [None]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.9876
Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.98      0.99       377
           1       0.99      0.99      0.99       509

    accuracy                           0.99       886
   macro avg       0.99      0.99      0.99       886
weighted avg       0.99      0.99      0.99       886



#### Feature Importance

In [None]:
# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

In [None]:
# Use Random Forest in sklearn to auto calculate feature importance
importances = rf_model.feature_importances_

In [None]:
# Sort features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.13514444300615713, 'Part C Summary'),
 (0.08182285212208291, 'HD3: Member Experience with Health Plan'),
 (0.06941077746723288, 'C Diabetes Care – Blood Sugar Controlled'),
 (0.06369297281772293, 'HD1: Staying Healthy: Screenings, Tests and Vaccines'),
 (0.06000420051406293, 'C Diabetes Care – Eye Exam'),
 (0.044995219701858984, 'C Diabetes Care – Kidney Disease Monitoring'),
 (0.04345462639874311, 'C Plan All-Cause Readmissions'),
 (0.04205252176357173, 'HD2: Managing Chronic (Long Term) Conditions'),
 (0.03682647140925033, 'C Colorectal Cancer Screening'),
 (0.035777468791905805, 'C Rating of Health Plan'),
 (0.034239046791805054, 'C Complaints about the Health Plan'),
 (0.03337503538482737, 'C Rating of Health Care Quality'),
 (0.02885335999890673, 'C Adult BMI Assessment'),
 (0.02752642686606961, 'C Annual Flu Vaccine'),
 (0.02709196358731166, 'C Members Choosing to Leave the Plan'),
 (0.02701977129734807, 'C Getting Appointments and Care Quickly'),
 (0.02556667643834501, 'D Me

# Run model again, but this time using 'Standardized Enrollment' as the target variable

In [None]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt


  from pandas.core import (


In [None]:
# Read csvs
enrollment_df = pd.read_excel("./Data/Enrollment.xlsx")
final_data_df = pd.read_csv("./CSVs/FinalData.csv")

In [None]:
# Drop the non-beneficial ID columns
final_data_df = final_data_df.drop(columns= ['Contract Number', 'Organization Type', 'Contract Name', 'Organization Marketing Name'])
final_data_df

Unnamed: 0,Parent Organization,Year,C Breast Cancer Screening,C Breast Cancer Screening Non-numeric,C Colorectal Cancer Screening,C Colorectal Cancer Screening Non-numeric,C Cardiovascular Care – Cholesterol Screening,C Cardiovascular Care – Cholesterol Screening Non-numeric,C Diabetes Care – Cholesterol Screening,C Diabetes Care – Cholesterol Screening Non-numeric,...,D Statin Use in Persons with Diabetes (SUPD) Non-numeric,C Controlling High Blood Pressure,C Controlling High Blood Pressure Non-numeric,C Transitions of Care,C Transitions of Care Non-numeric,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions,C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric,Overall,Overall Non-numeric,Standardized Enrollment
0,Centene Corporation,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
1,Humana Inc.,2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
2,"UnitedHealth Group, Inc.",2014,2.0,Numeric,2.0,Numeric,4.0,Numeric,4.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,1
3,BlueCross BlueShield of Alabama,2014,3.0,Numeric,4.0,Numeric,5.0,Numeric,4.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,3.0,Numeric,2
4,"BLUE CROSS AND BLUE SHIELD OF MONTANA, INC.",2014,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,0.0,Plan too new to be measured,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Plan too new to be measured,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4425,Rite Aid Corporation,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2
4426,Capital BlueCross,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4427,Anthem Inc.,2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Plan too new to be measured,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,1
4428,"UnitedHealth Group, Inc.",2020,0.0,Plan not required to report measure,0.0,Plan not required to report measure,0.0,Numeric,0.0,Numeric,...,Numeric,0.0,Numeric,0.0,Numeric,0.0,Numeric,0.0,Not Applicable,2


In [None]:
# Determine the number of unique values in each column.
final_data_df.nunique()

Parent Organization                                                                                               364
Year                                                                                                                7
C Breast Cancer Screening                                                                                           6
C Breast Cancer Screening Non-numeric                                                                               5
C Colorectal Cancer Screening                                                                                       6
                                                                                                                 ... 
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions                  1
C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions Non-numeric      1
Overall                                                 

In [None]:
# Select featuers
feature_cols = ['Year', 'C Breast Cancer Screening', 'C Colorectal Cancer Screening', 'C Cardiovascular Care – Cholesterol Screening', 'C Diabetes Care – Cholesterol Screening', 'C Glaucoma Testing', 'C Annual Flu Vaccine',
                 'C Improving or Maintaining Physical Health', 'C Improving or Maintaining Mental Health', 'C Monitoring Physical Activity', 'C Adult BMI Assessment', 'C Care for Older Adults – Medication Review',
                   'C Care for Older Adults – Functional Status Assessment', 'C Care for Older Adults – Pain Screening', 'C Osteoporosis Management in Women who had a Fracture', 'C Diabetes Care – Eye Exam', 
                   'C Diabetes Care – Kidney Disease Monitoring', 'C Diabetes Care – Blood Sugar Controlled', 'C Diabetes Care – Cholesterol Controlled', 'C Controlling Blood Pressure', 'C Rheumatoid Arthritis Management',
                     'C Improving Bladder Control', 'C Reducing the Risk of Falling', 'C Plan All-Cause Readmissions', 'C Getting Needed Care', 'C Getting Appointments and Care Quickly', 'C Customer Service', 
                     'C Rating of Health Care Quality', 'C Rating of Health Plan', 'C Care Coordination', 'C Complaints about the Health Plan', 'C Beneficiary Access and Performance Problems', 'C Members Choosing to Leave the Plan',
                       'C Health Plan Quality Improvement', 'C Plan Makes Timely Decisions about Appeals', 'C Reviewing Appeals Decisions', 'C Call Center – Foreign Language Interpreter and TTY Availability',
                         'HD1: Staying Healthy: Screenings, Tests and Vaccines', 'HD2: Managing Chronic (Long Term) Conditions', 'HD3: Member Experience with Health Plan', 
                         "HD4: Member Complaints, Problems Getting Services, and Improvement in the Health Plan's Performance", 'HD5: Health Plan Customer Service', 'SNP', 'Part C Summary', 
                         'D Call Center – Foreign Language Interpreter and TTY Availability', 'D Appeals Auto–Forward', 'D Appeals Upheld', 'D Complaints about the Drug Plan', 'D Beneficiary Access and Performance Problems',
                           'D Members Choosing to Leave the Plan', 'D Drug Plan Quality Improvement', 'D Rating of Drug Plan', 'D Getting Needed Prescription Drugs', 'D MPF Price Accuracy', 'D High Risk Medication', 'D Diabetes Treatment',
                             'D Medication Adherence for Diabetes Medications ', 'D Medication Adherence for Hypertension (RAS antagonists) ', 'D Medication Adherence for Cholesterol (Statins) ', 'DD1: Drug Plan Customer Service', 
                             'DD2: Member Complaints, Problems Getting Services, and Improvement in the Drug Plan’s Performance ', 'DD3: Member Experience with the Drug Plan', 'DD4: Patient Safety and Accuracy of Drug Pricing', 
                             'Part D Summary', 'C Special Needs Plan (SNP) Care Management', 'C Care for Older Adults – Pain Assessment', "HD4: Member Complaints and Changes in the Health Plan's Performance", 
                             'DD2: Member Complaints and Changes in the Drug Plan’s Performance', 'DD4: Drug Safety and Accuracy of Drug Pricing', 'D Medication Adherence for Diabetes Medications', 
                             'D Medication Adherence for Hypertension (RAS antagonists)', 'D Medication Adherence for Cholesterol (Statins)', 'D MTM Program Completion Rate for CMR', 'C Medication Reconciliation Post-Discharge', 
                             'C Statin Therapy for Patients with Cardiovascular Disease', 'D Statin Use in Persons with Diabetes (SUPD)', 'C Controlling High Blood Pressure', 'C Transitions of Care', 
                             'C Follow-up after Emergency Department Visit for People with Multiple High-Risk Chronic Conditions', 'Overall']

In [None]:
# Show feature vs target variable
y = final_data_df["Standardized Enrollment"]
X = final_data_df[feature_cols]

In [None]:
# Use scalar to normalize features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

In [None]:
# Split data in to test and training data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)

In [None]:
# Train the data using the Logistic Regression Model
model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [None]:
# Make predictions
y_pred = model.predict(X_test)
y_pred_probability = model.predict_proba(X_test)[:, 1]

In [None]:
# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)

print(f"Model Accuracy: {accuracy:.4f}")
print("Classification Report:\n", classification_report(y_test, y_pred))

Model Accuracy: 0.7348
Classification Report:
               precision    recall  f1-score   support

           1       0.79      0.87      0.82       491
           2       0.67      0.65      0.66       336
           3       0.44      0.14      0.22        49
           4       0.20      0.10      0.13        10

    accuracy                           0.73       886
   macro avg       0.52      0.44      0.46       886
weighted avg       0.72      0.73      0.72       886



#### Feature Importance

In [None]:
# Train a Random Forest model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X, y)

In [None]:
# Use Random Forest in sklearn to auto calculate feature importance
importances = rf_model.feature_importances_

In [None]:
# Sort features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.05436427777245295, 'C Rheumatoid Arthritis Management'),
 (0.039749243809387526, 'D Members Choosing to Leave the Plan'),
 (0.03694279536034697,
  'C Osteoporosis Management in Women who had a Fracture'),
 (0.028717758447888805, 'D Appeals Upheld'),
 (0.02431756155809582, 'D Rating of Drug Plan'),
 (0.023910434924680306, 'C Reviewing Appeals Decisions'),
 (0.022813288045809812, 'HD5: Health Plan Customer Service'),
 (0.02157063749542199, 'DD3: Member Experience with the Drug Plan'),
 (0.02153711094165575, 'C Plan Makes Timely Decisions about Appeals'),
 (0.020842207553340337, 'C Reducing the Risk of Falling'),
 (0.019737168763656106, 'D Getting Needed Prescription Drugs'),
 (0.0196416299756576, 'D Complaints about the Drug Plan'),
 (0.019221774099327172, 'C Rating of Health Care Quality'),
 (0.01744775222348211, 'C Diabetes Care – Blood Sugar Controlled'),
 (0.016937631929683065, 'C Improving or Maintaining Physical Health'),
 (0.01692053402204159, 'C Care Coordination'),
 (0.01681