In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import re
pd.set_option('display.max_columns', None)
import tensorflow as tf
import xml.etree.ElementTree as ET

In [2]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder,MinMaxScaler
from sklearn.model_selection import train_test_split

In [32]:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Concatenate, Flatten, Dropout, Embedding
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping

In [3]:
# Root folder for every input CSV. The trailing slash matters: file paths are
# built below by plain string concatenation (data_path + '<file>.csv').
data_path = 'processed_data/'

In [4]:
## ratings ##
# One row per (person, provider): average each score over that pair's rows,
# then blend the two averages 50/50 into a single `ratings` column (2 decimals).
# NOTE(review): if either averaged score is NaN for a pair, its blended rating
# is NaN as well — confirm the inputs are complete.
df_ratings_score = (
    pd.read_csv(data_path + 'Ratings_Reviews.csv')
    .loc[:, ['PERSON_ID', 'SurveyAnswerScore', 'RecommendStar', 'ProviderCode']]
    .groupby(['PERSON_ID', 'ProviderCode'])
    .agg({'SurveyAnswerScore': 'mean', 'RecommendStar': 'mean'})
    .reset_index()
)
df_ratings_score['ratings'] = (
    df_ratings_score['SurveyAnswerScore'] * 0.5 + df_ratings_score['RecommendStar'] * 0.5
).round(2)
df_ratings_score = df_ratings_score.drop(columns=['RecommendStar', 'SurveyAnswerScore'])
df_ratings_score.columns

Index(['PERSON_ID', 'ProviderCode', 'ratings'], dtype='object')

In [5]:
##facility##
df_facility = pd.read_csv(data_path + 'Provider/facility.csv')
def extract_zip_code(address):
    """Return the first standalone 5-digit number found in ``address``.

    Args:
        address: Address text to scan. Non-string values (``None``, or the
            float NaN pandas uses for an empty cell) are treated as
            "no address" instead of raising ``TypeError``.

    Returns:
        The matched 5-digit string (leading zeros preserved), or ``None`` when
        ``address`` is not a string or holds no such number.
    """
    # NOTE(review): the first 5-digit token wins, so any other 5-digit number
    # appearing before the ZIP would be returned instead — confirm against the
    # actual ADDRESSXML layout.
    if not isinstance(address, str):
        return None
    # \b on both sides rejects 5-digit slices of longer digit runs.
    match = re.search(r'\b\d{5}\b', address)
    return match.group() if match else None

# Keep the identifying facility columns, derive `zip_code` from the raw
# address XML, then discard the XML itself.
df_facility = (
    df_facility[['FACILITYCODE', 'FACILITYNAME', 'FACILITYTYPECODE', 'ADDRESSXML']]
    .assign(zip_code=lambda frame: frame['ADDRESSXML'].apply(extract_zip_code))
    .drop(columns=['ADDRESSXML'])
)
df_facility.columns

Index(['FACILITYCODE', 'FACILITYNAME', 'FACILITYTYPECODE', 'zip_code'], dtype='object')

In [6]:
# specialty table: keep the code plus its two description columns
df_speciality = pd.read_csv(data_path + 'Provider/speciality.csv').loc[
    :, ['SpecialtyCode', 'SpecialtyDescription', 'SpecialistDescription']
]
df_speciality.columns

Index(['SpecialtyCode', 'SpecialtyDescription', 'SpecialistDescription'], dtype='object')

In [7]:
# provider -> facility link table (only the two key columns are kept)
df_provider_facility = pd.read_csv(data_path + 'Provider/ProviderToFacility.csv').loc[
    :, ['PROVIDERCODE', 'FACILITYCODE']
]
df_provider_facility.columns

Index(['PROVIDERCODE', 'FACILITYCODE'], dtype='object')

In [8]:
# provider -> specialty link table, including the calculated specialty rank
df_provider_speciality = pd.read_csv(data_path + 'Provider/ProviderToSpecailty.csv').loc[
    :, ['PROVIDERCODE', 'SPECIALTYCODE', 'SPECIALTYRANKCALCULATED']
]
df_provider_speciality.columns

Index(['PROVIDERCODE', 'SPECIALTYCODE', 'SPECIALTYRANKCALCULATED'], dtype='object')

In [9]:
## create segments
# Join provider->facility links with the facility details, then with each
# provider's specialty codes and their descriptions (default inner joins).
df_merged = (
    df_provider_facility
    .merge(df_facility, on='FACILITYCODE')
    .merge(df_provider_speciality, on='PROVIDERCODE')
    .merge(df_speciality, left_on='SPECIALTYCODE', right_on='SpecialtyCode')
)
# Keep one row per distinct (zip, facility, provider, specialty) combination.
df_segments = df_merged.loc[
    :, ['zip_code', 'FACILITYCODE', 'PROVIDERCODE', 'SpecialtyCode', 'SpecialtyDescription']
].drop_duplicates()
df_segments.columns

Index(['zip_code', 'FACILITYCODE', 'PROVIDERCODE', 'SpecialtyCode',
       'SpecialtyDescription'],
      dtype='object')

In [10]:
## load demographics ##
# Keep id, age and gender; gender is reduced to exactly two labels.
# NOTE(review): every value other than 'Female' — including missing values —
# is labelled 'Male'. Confirm that imputing unknowns as 'Male' is intended.
person_df = (
    pd.read_csv(data_path + 'person_demographics.csv')
    .loc[:, ['PERSON_ID', 'age', 'GENDER']]
    .assign(GENDER=lambda demo: np.where(demo['GENDER'] == 'Female', 'Female', 'Male'))
)
person_df.columns

Index(['PERSON_ID', 'age', 'GENDER'], dtype='object')

In [11]:
person_df.GENDER.value_counts()

Male      7791
Female    2209
Name: GENDER, dtype: int64

In [12]:
# Load the raw provider table.
# NOTE(review): `provider_df` is not referenced again in the visible cells —
# confirm it is still needed.
provider_df = pd.read_csv(data_path + 'Provider/provider.csv')
#provider_df.columns

In [13]:
# included ratings
# Persons present in both the ratings table and the demographics table.
eligible_persons = list(set(df_ratings_score.PERSON_ID) & set(person_df.PERSON_ID))
# NOTE(review): the modelling cells alias `df_ratings_score`, not this filtered
# frame, so `df_ratings_fltrd` is currently only displayed here.
df_ratings_fltrd = df_ratings_score.loc[df_ratings_score['PERSON_ID'].isin(eligible_persons)]
df_ratings_fltrd.columns

Index(['PERSON_ID', 'ProviderCode', 'ratings'], dtype='object')

In [14]:
# provider ranking
# Mean rating per provider, rounded to 2 decimals and exposed as `sat_score`.
# NOTE(review): `sat_score` is averaged over all rows of `df_ratings_score`,
# i.e. it includes the very `ratings` values later used as the model target
# (and those of the held-out persons). Consider computing it from training
# rows only to avoid target leakage.
df_provider_rankings = (
    df_ratings_score
    .groupby(['ProviderCode'])
    .agg({'ratings': 'mean'})
    .round(2)
    .reset_index()
    .rename(columns={'ratings': 'sat_score'})
)

In [15]:
df_provider_rankings.columns

Index(['ProviderCode', 'sat_score'], dtype='object')

### Feature engineering starts here ###

In [16]:
## this is person demographics
# Restrict to eligible persons and keep a single row per PERSON_ID. After the
# sort, the retained row is the one with the alphabetically first GENDER and,
# within that gender, the highest age.
person_df = (
    person_df.loc[person_df['PERSON_ID'].isin(eligible_persons)]
    .sort_values(by=['PERSON_ID', 'GENDER', 'age'], ascending=[True, True, False])
    .drop_duplicates(subset='PERSON_ID', keep='first')
)
person_df.head(1)

Unnamed: 0,PERSON_ID,age,GENDER
1190,00123db9a36e9f9277d6ce32c3fdd1e49809845399741b...,39.0,Female


In [17]:
#this is ratings_df for person to provider 
df_ratings_score.head(1)

Unnamed: 0,PERSON_ID,ProviderCode,ratings
0,00034bd1b4be964221e56bfc363834d832d93316030e98...,Y9SFHDZ,4.78


In [18]:
#this is facility 
df_facility.head(1)

Unnamed: 0,FACILITYCODE,FACILITYNAME,FACILITYTYPECODE,zip_code
0,442F0F,Hackensack University Medical Center,STAC,7601


In [19]:
#this is provider to facility relation
df_provider_facility.head(1)

Unnamed: 0,PROVIDERCODE,FACILITYCODE
0,YVMCR,100360


In [20]:
# this is provider to speciality
df_provider_speciality.head(1)

Unnamed: 0,PROVIDERCODE,SPECIALTYCODE,SPECIALTYRANKCALCULATED
0,YRSQT,PS127,1


In [21]:
#this is provider rankings 
df_provider_rankings.head(1)

Unnamed: 0,ProviderCode,sat_score
0,0RZSMOK314,4.58


In [22]:
## this is segments ##
# Align the provider key with the spelling used by the ratings/rankings frames
# so the later merges can join on 'ProviderCode'.
df_segments = df_segments.rename({'PROVIDERCODE': 'ProviderCode'}, axis='columns')
df_segments.head(1)

Unnamed: 0,zip_code,FACILITYCODE,ProviderCode,SpecialtyCode,SpecialtyDescription
0,33328,100360,YVMCR,PS628,Orthopedic Spine Surgery


In [23]:
# Shorter aliases for the modelling section. These are plain name bindings to
# the same DataFrame objects, not copies.
ratings_df, provider_rankings_df, segments_df = (
    df_ratings_score,
    df_provider_rankings,
    df_segments,
)
facility_df, provider_facility_df, provider_specialty_df = (
    df_facility,
    df_provider_facility,
    df_provider_speciality,
)

### Modelling for Features ###

In [24]:
# Build the modelling table: rating rows enriched with person demographics,
# the provider-level sat_score, and the provider's segment rows (inner joins).
# NOTE(review): the segments join repeats each (person, provider) rating once
# per facility/specialty row of that provider, so the same target value can
# appear several times — confirm this weighting is intended.
interaction_df = (
    ratings_df
    .merge(person_df, on='PERSON_ID')
    .merge(provider_rankings_df, on='ProviderCode')
    .merge(segments_df, on='ProviderCode')
)

In [25]:
# Normalize numerical features
# NOTE(review): the scaler is fitted on every row before the train/test split,
# and the target `ratings` shares the scaler with the features — consider
# fitting on training rows only.
scaler = MinMaxScaler()
interaction_df[['age', 'sat_score','ratings']] = scaler.fit_transform(interaction_df[['age', 'sat_score','ratings']])

# One-hot encode gender
# dtype is pinned to uint8: pandas >= 2.0 defaults to bool dummies, which would
# turn the mixed-column `.values` feature matrix built below into an object array.
interaction_df = pd.concat(
    [interaction_df, pd.get_dummies(interaction_df['GENDER'], prefix='GENDER', dtype='uint8')],
    axis=1,
)
interaction_df = interaction_df.drop(columns=['GENDER'])

# Convert specialties to numeric values for embedding
# NOTE(review): `.cat.codes` encodes a missing description as -1, which is not
# a valid embedding index — confirm the column has no nulls.
interaction_df['SpecialtyDescription'] = interaction_df['SpecialtyDescription'].astype('category').cat.codes

In [26]:
# Drop rows that are identical in every column. The index is left as-is (not
# reset); the boolean masks built later are derived from this same frame.
interaction_df= interaction_df.drop_duplicates()

In [27]:
interaction_df

Unnamed: 0,PERSON_ID,ProviderCode,ratings,age,sat_score,zip_code,FACILITYCODE,SpecialtyCode,SpecialtyDescription,GENDER_Female,GENDER_Male
0,00123db9a36e9f9277d6ce32c3fdd1e49809845399741b...,XF6BG,0.705882,0.324074,0.370370,34950,81DB7D,PS305,23,1,0
1,32e8750a2e31a34c71d700df32a773adcab8db3109117b...,XF6BG,0.964706,0.324074,0.370370,34950,81DB7D,PS305,23,0,1
2,4a2bfb67462d6f2d7b70752411114edc1325c8d9efb885...,XF6BG,0.929412,0.324074,0.370370,34950,81DB7D,PS305,23,1,0
3,c5a78302c3051b68f31cf426eff4f1b90b324d44ecdbb2...,XF6BG,0.536471,0.324074,0.370370,34950,81DB7D,PS305,23,0,1
4,df0ce9bc392f26eb5fc52244d195b5f6ab9ce6d14ddaf5...,XF6BG,0.327059,0.324074,0.370370,34950,81DB7D,PS305,23,1,0
...,...,...,...,...,...,...,...,...,...,...,...
8518,fed15a2b14775e348e8f0c8cc28fe7894dd1663905884c...,3LSH3,0.891765,0.324074,0.753086,13001,420826,PS779,68,0,1
8519,fed15a2b14775e348e8f0c8cc28fe7894dd1663905884c...,3LSH3,0.891765,0.324074,0.753086,33462,514CDA,PS780,69,0,1
8520,fed15a2b14775e348e8f0c8cc28fe7894dd1663905884c...,3LSH3,0.891765,0.324074,0.753086,13001,420826,PS780,69,0,1
8521,f6b0c65ce633cc62b20826bc4a89a70e3a819b2dea5810...,YRJXK,0.948235,0.620370,0.954733,11750,9FECE7,PS412,37,0,1


In [28]:
# Feature matrix: columns 0-3 are numeric features, column 4 is the integer
# specialty code. The explicit float64 cast keeps the array numeric even when a
# selected column is bool (the pandas >= 2.0 dummy default), which would
# otherwise make `.values` return dtype=object.
X = interaction_df[['age', 'sat_score', 'GENDER_Female', 'GENDER_Male', 'SpecialtyDescription']].astype('float64').values
# Target: the min-max scaled blended rating.
y = interaction_df['ratings'].values


In [29]:
# Split at the person level (80/20, fixed seed): the row masks built next
# select rows by PERSON_ID membership in these two groups.
unique_users = interaction_df['PERSON_ID'].unique()
train_users, test_users = train_test_split(
    unique_users,
    test_size=0.2,
    random_state=42,
)

In [30]:
# Boolean row masks: True where the row's PERSON_ID belongs to the given group.
train_mask, test_mask = (
    interaction_df['PERSON_ID'].isin(user_group)
    for user_group in (train_users, test_users)
)

In [31]:
# Apply the row masks to the feature matrix and the target vector.
X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

In [None]:
def create_deep_and_cross_model():
    """Build a Keras model with a cross branch and a deep branch over shared encoded inputs.

    Both branches start from the same encoded tensor ``x0``; their outputs are
    concatenated and fed to a softmax layer with ``NUM_CLASSES`` units.

    NOTE(review): this cell has no execution count and relies on names that are
    not defined or imported in the visible part of the notebook:
    ``create_model_inputs``, ``encode_inputs``, ``hidden_units``,
    ``dropout_rate``, ``NUM_CLASSES``, ``layers`` and ``keras``. Confirm they
    exist before running it.
    NOTE(review): the head is a softmax classifier, whereas the target ``y``
    prepared in this notebook is a continuous scaled rating — confirm this
    model is meant to be used here.

    Returns:
        The (uncompiled) ``keras.Model`` mapping ``inputs`` to the softmax output.
    """
    inputs = create_model_inputs()
    x0 = encode_inputs(inputs, use_embedding=True)

    # Cross branch: one cross layer per entry of hidden_units (the entry's
    # value itself is ignored; only the count matters).
    cross = x0
    for _ in hidden_units:
        # Keep the width equal to the current tensor so the element-wise
        # product with x0 and the residual add line up.
        units = cross.shape[-1]
        x = layers.Dense(units)(cross)
        cross = x0 * x + cross
    cross = layers.BatchNormalization()(cross)

    # Deep branch: Dense -> BatchNorm -> ReLU -> Dropout for each hidden width.
    deep = x0
    for units in hidden_units:
        deep = layers.Dense(units)(deep)
        deep = layers.BatchNormalization()(deep)
        deep = layers.ReLU()(deep)
        deep = layers.Dropout(dropout_rate)(deep)

    # Combine both branches and classify.
    merged = layers.concatenate([cross, deep])
    outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged)
    model = keras.Model(inputs=inputs, outputs=outputs)
    return model

In [None]:
def create_dcn_model(numeric_features_dim, embedding_input_dim, embedding_output_dim,
                     num_cross_layers=2, deep_units=(128, 64), dropout_rate=0.3):
    """Build and compile a stacked Deep & Cross regression model.

    The numeric features and one embedded categorical id are concatenated into
    ``x0``, passed through ``num_cross_layers`` cross layers, then through a
    ReLU MLP, and finally through a single linear output unit.

    Args:
        numeric_features_dim: Width of the ``numeric_input`` tensor.
        embedding_input_dim: Vocabulary size of the embedding; ids fed to
            ``embedding_input`` must lie in ``[0, embedding_input_dim)``.
        embedding_output_dim: Size of the embedding vector.
        num_cross_layers: Number of cross layers (default 2, the previous
            hard-coded value).
        deep_units: Hidden widths of the deep part (default ``(128, 64)``, the
            previous hard-coded values).
        dropout_rate: Dropout applied between consecutive deep layers
            (default 0.3, the previous hard-coded value).

    Returns:
        A ``Model`` with inputs ``[numeric_input, embedding_input]``, compiled
        with the Adam optimizer, MSE loss and an MAE metric.
    """
    # Numeric input
    numeric_input = Input(shape=(numeric_features_dim,), name='numeric_input')

    # Embedding input: a single integer id per sample. `input_length` is not
    # passed because the Input shape already fixes the sequence length to 1
    # and the argument is deprecated in current Keras.
    embedding_input = Input(shape=(1,), name='embedding_input')
    embedded = Embedding(input_dim=embedding_input_dim, output_dim=embedding_output_dim)(embedding_input)
    embedded = Flatten()(embedded)

    # x0: the concatenated base features every cross layer multiplies against.
    x0 = Concatenate()([numeric_input, embedded])
    feature_dim = x0.shape[-1]

    # Cross network: x_{l+1} = x0 * (W x_l + b) + x_l. The projection is linear
    # on purpose; a ReLU residual block (the previous code) adds no explicit
    # feature crossing.
    cross = x0
    for _ in range(num_cross_layers):
        cross = x0 * Dense(feature_dim)(cross) + cross

    # Deep network stacked on the cross output, with dropout between layers
    # (not after the last one).
    deep = cross
    last_index = len(deep_units) - 1
    for index, units in enumerate(deep_units):
        deep = Dense(units, activation='relu')(deep)
        if index < last_index:
            deep = Dropout(dropout_rate)(deep)

    # Output layer: one linear unit for regression.
    output = Dense(1, activation='linear')(deep)

    model = Model(inputs=[numeric_input, embedding_input], outputs=output)
    model.compile(optimizer='adam', loss='mse', metrics=['mae'])

    return model