In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

In [2]:
data1 = pd.read_csv("dataset.csv") #diseases
data2 = pd.read_csv("Symptom-severity.csv") #severity level
data3 = pd.read_csv("Hospital-General-Information.csv") #hospital

In [3]:
cols=data1.columns
data1=data1[cols].values.flatten()
data1=pd.Series(data1).str.strip()

In [4]:
symptoms=data2['Symptom'].unique()

In [5]:
data2

Unnamed: 0,Symptom,weight
0,itching,1
1,skin_rash,3
2,nodal_skin_eruptions,4
3,continuous_sneezing,4
4,shivering,5
...,...,...
128,inflammatory_nails,2
129,blister,4
130,red_sore_around_nose,2
131,yellow_crust_ooze,3


In [6]:
#preparing the dataset

arr=[]

#changing the values based on the severity of the disease
for i in data1:
    if i in symptoms: #if the symptom's severity is recorded
        index=symptoms.tolist().index(i)
        arr.append(data2.iloc[index]['weight'])
    else: # if the symptom's severity is not recorded
        arr.append(i)

newSeries=pd.Series(arr) #create new series
newSeries=newSeries.values.reshape((4920,18)) #reshape the series based on the initial dataframe's shape
newData1 = pd.DataFrame(newSeries, columns=cols) #convert series to dataframe

newData1 = newData1.fillna(0) #fill empty values with 0

#diseases without severity is replaced by 0
newData1 = newData1.replace('dischromic _patches',0)
newData1 = newData1.replace('spotting_ urination',0)
newData1 = newData1.replace('foul_smell_of urine',0)

In [7]:
newData1

Unnamed: 0,Disease,Symptom_1,Symptom_2,Symptom_3,Symptom_4,Symptom_5,Symptom_6,Symptom_7,Symptom_8,Symptom_9,Symptom_10,Symptom_11,Symptom_12,Symptom_13,Symptom_14,Symptom_15,Symptom_16,Symptom_17
0,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,Fungal infection,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Fungal infection,1,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Fungal infection,1,3,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Fungal infection,1,3,4,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4915,(vertigo) Paroymsal Positional Vertigo,5,3,5,6,4,4,0,0,0,0,0,0,0,0,0,0,0
4916,Acne,3,2,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0
4917,Urinary tract infection,6,4,0,6,0,0,0,0,0,0,0,0,0,0,0,0,0
4918,Psoriasis,3,3,2,3,2,2,0,0,0,0,0,0,0,0,0,0,0


In [8]:
cols = list(newData1.columns) #getting columns

#assignning values for X and y
X = newData1[cols[1:]]
y = newData1[cols[0]]

#splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [9]:
#random forest classifier is used to determine the disease and severity of it
randomForest = RandomForestClassifier()
randomForest.fit(X_train, y_train)
prediction = randomForest.predict(X_test)
print("Accuracy:",metrics.accuracy_score(y_test, prediction))

Accuracy: 0.9943089430894309


In [10]:
symp = []
sympWeight = []
sympCount = 18
while sympCount > 17:
    sympCount = int(input('Number of Symptom/s: '))

print("Enter the symptoms below")
for x in range(sympCount):
    userinput = input('Symptom ' + str(x+1) + ': ')
    userinput = userinput.replace(" ", "_")
    symp.append(userinput)

for i in symp:
    if i in symptoms: #if the symptom's severity is recorded
        index=symptoms.tolist().index(i)
        sympWeight.append(data2.iloc[index]['weight'])
    else: # if the symptom's severity is not recorded
        sympWeight.append(i)

if(len(sympWeight)<17):
    while(len(sympWeight)<17):
        sympWeight.append(0)

Number of Symptom/s: 3
Enter the symptoms below
Symptom 1: itching
Symptom 2: acidity
Symptom 3: chills


In [11]:
#predicting the disease and severity
disease = randomForest.predict([sympWeight])
sev = sum(sympWeight)

print("Disease is " + disease[0] + " with the severity level of " + str(sev))

Disease is Fungal infection with the severity level of 7


In [12]:
data3.head()

Unnamed: 0,Hospital Name,City,Hospital Type,Hospital Ownership,Emergency Services,Hospital overall rating,Mortality national comparison,Safety of care national comparison,Readmission national comparison,Patient experience national comparison,Effectiveness of care national comparison,Timeliness of care national comparison,Efficient use of medical imaging national comparison
0,SOUTHEAST ALABAMA MEDICAL CENTER,DOTHAN,Acute Care Hospitals,Government - Hospital District or Authority,Yes,3,Same as the national average,Above the national average,Same as the national average,Below the national average,Same as the national average,Same as the national average,Same as the national average
1,MARSHALL MEDICAL CENTER SOUTH,BOAZ,Acute Care Hospitals,Government - Hospital District or Authority,Yes,3,Below the national average,Same as the national average,Above the national average,Same as the national average,Same as the national average,Above the national average,Below the national average
2,ELIZA COFFEE MEMORIAL HOSPITAL,FLORENCE,Acute Care Hospitals,Government - Hospital District or Authority,Yes,2,Below the national average,Same as the national average,Same as the national average,Below the national average,Same as the national average,Above the national average,Same as the national average
3,MIZELL MEMORIAL HOSPITAL,OPP,Acute Care Hospitals,Voluntary non-profit - Private,Yes,2,Same as the national average,Not Available,Below the national average,Same as the national average,Below the national average,Above the national average,Not Available
4,CRENSHAW COMMUNITY HOSPITAL,LUVERNE,Acute Care Hospitals,Proprietary,Yes,3,Same as the national average,Not Available,Same as the national average,Not Available,Same as the national average,Above the national average,Not Available


In [13]:
hospitalName = data3['Hospital Name'].values.tolist()
hospitalName

['SOUTHEAST ALABAMA MEDICAL CENTER',
 'MARSHALL MEDICAL CENTER SOUTH',
 'ELIZA COFFEE MEMORIAL HOSPITAL',
 'MIZELL MEMORIAL HOSPITAL',
 'CRENSHAW COMMUNITY HOSPITAL',
 "ST VINCENT'S EAST",
 'DEKALB REGIONAL MEDICAL CENTER',
 'SHELBY BAPTIST MEDICAL CENTER',
 'CALLAHAN EYE HOSPITAL',
 'HELEN KELLER MEMORIAL HOSPITAL',
 'DALE MEDICAL CENTER',
 'CHEROKEE MEDICAL CENTER',
 'BAPTIST MEDICAL CENTER SOUTH',
 'JACKSON HOSPITAL & CLINIC INC',
 'EAST ALABAMA MEDICAL CENTER',
 'WEDOWEE HOSPITAL',
 'UNIVERSITY OF ALABAMA HOSPITAL',
 'COMMUNITY HOSPITAL INC',
 'CULLMAN REGIONAL MEDICAL CENTER',
 'ANDALUSIA REGIONAL HOSPITAL',
 'STRINGFELLOW MEMORIAL HOSPITAL',
 'HUNTSVILLE HOSPITAL',
 'GADSDEN REGIONAL MEDICAL CENTER',
 'MARION REGIONAL MEDICAL CENTER',
 'FAYETTE MEDICAL CENTER',
 'RIVERVIEW REGIONAL MEDICAL CENTER',
 'GEORGIANA MEDICAL CENTER',
 'MEDICAL CENTER ENTERPRISE',
 'GREENE COUNTY HOSPITAL',
 'LAKE MARTIN COMMUNITY HOSPITAL',
 'FLOWERS HOSPITAL',
 "ST VINCENT'S BIRMINGHAM",
 'BIBB MEDICAL

In [14]:
rating = data3['Hospital overall rating']
rating.unique()

array(['3', '2', 'Not Available', '4', '5', '1'], dtype=object)

In [15]:
def rate(i):
    if(i=="Not Available"):
        return '0'
    else:
        return i
        
newRating = []
for i in rating:
    newRating.append(rate(i))
print(set(newRating))

{'1', '4', '3', '2', '5', '0'}


In [16]:
emergency = data3['Emergency Services']
emergency.unique()

array(['Yes', 'No'], dtype=object)

In [17]:
def emrgncy(i):
    if(i=='Yes'):
        return 1
    else: 
        return 0    

newEmergency = []
for i in emergency:
    newEmergency.append(emrgncy(i))
print(set(newEmergency))

{0, 1}


In [18]:
ownership = data3['Hospital Ownership']
ownership.unique()

array(['Government - Hospital District or Authority',
       'Voluntary non-profit - Private', 'Proprietary',
       'Government - State', 'Voluntary non-profit - Other',
       'Government - Local', 'Voluntary non-profit - Church',
       'Government - Federal', 'Tribal', 'Physician'], dtype=object)

In [19]:
def owner(i):
    if i=="Government - Hospital District or Authority":
        return "Government-HospitalDistrictorAuthority"
    
    elif i=="Voluntary non-profit - Private":
        return "Voluntarynon-profit-Private"
    
    elif i=="Proprietary":
        return "Proprietary"
    
    elif i=="Government - State":
        return "Government-State"
    
    elif i=="Voluntary non-profit - Other":
        return "Voluntarynon-profit-Other"
    
    elif i=="Government - Local":
        return "Government-Local"
    
    elif i=="Voluntary non-profit - Church":
        return "Voluntarynon-profit-Church"
    
    elif i=="Government - Federal":
        return "Government-Federal"
    
    elif i=="Tribal":
        return "Tribal"
    
    elif i=="Physician":
        return "Physician"
    
newOwnership = []
for i in ownership: 
    #newOwnership.append(i)
    newOwnership.append(owner(i))
print(set(newOwnership))

{'Voluntarynon-profit-Church', 'Physician', 'Voluntarynon-profit-Other', 'Tribal', 'Government-State', 'Voluntarynon-profit-Private', 'Government-Local', 'Government-Federal', 'Government-HospitalDistrictorAuthority', 'Proprietary'}


In [20]:
hospitalType = data3['Hospital Type']
hospitalType.unique()

array(['Acute Care Hospitals', 'Critical Access Hospitals', 'Childrens'],
      dtype=object)

In [21]:
def hosType(i):
    if i=="Acute Care Hospitals":
        return "AcuteCareHospitals"
    elif i=="Critical Access Hospitals":
        return "CriticalAccessHospitals"
    else:
        return "Childrens"

newHospitalType=[]
for i in hospitalType:
    #newHospitalType.append(i)
    newHospitalType.append(hosType(i))
print(set(newHospitalType))

{'AcuteCareHospitals', 'CriticalAccessHospitals', 'Childrens'}


In [22]:
def categorize1(i):
    if i=="Same as the national average":
        return "MortalitySame"
    elif i=="Below the national average":
        return "MortalityBelow"
    elif i=="Not Available":
        return "MortalityNot Available"
    else:
        return "MortalityAbove"

In [23]:
mortality = data3['Mortality national comparison']
mortality.unique()

array(['Same as the national average', 'Below the national average',
       'Not Available', 'Above the national average'], dtype=object)

In [24]:
newMortality=[]
for i in mortality:
    newMortality.append(categorize1(i))
print(set(newMortality))

{'MortalityBelow', 'MortalityAbove', 'MortalitySame', 'MortalityNot Available'}


In [25]:
def categorize2(i):
    if i=="Same as the national average":
        return "SafetySame"
    elif i=="Below the national average":
        return "SafetyBelow"
    elif i=="Not Available":
        return "SafetyNot Available"
    else:
        return "SafetyAbove"

In [26]:
safety = data3['Safety of care national comparison']
safety.unique()

array(['Above the national average', 'Same as the national average',
       'Not Available', 'Below the national average'], dtype=object)

In [27]:
newSafety=[]
for i in safety:
    newSafety.append(categorize2(i))
print(set(newSafety))

{'SafetyBelow', 'SafetyAbove', 'SafetySame', 'SafetyNot Available'}


In [28]:
def categorize3(i):
    if i=="Same as the national average":
        return "readmissionSame"
    elif i=="Below the national average":
        return "readmissionBelow"
    elif i=="Not Available":
        return "readmissionNot Available"
    else:
        return "readmissionAbove"

In [29]:
readmission = data3['Readmission national comparison']
readmission.unique()

array(['Same as the national average', 'Above the national average',
       'Below the national average', 'Not Available'], dtype=object)

In [30]:
newReadmission=[]
for i in readmission:
    newReadmission.append(categorize3(i))
print(set(newReadmission))

{'readmissionSame', 'readmissionNot Available', 'readmissionBelow', 'readmissionAbove'}


In [31]:
def categorize4(i):
    if i=="Same as the national average":
        return "experienceSame"
    elif i=="Below the national average":
        return "experienceBelow"
    elif i=="Not Available":
        return "experienceNot Available"
    else:
        return "experienceAbove"

In [32]:
experience = data3['Patient experience national comparison']
experience.unique()

array(['Below the national average', 'Same as the national average',
       'Not Available', 'Above the national average'], dtype=object)

In [33]:
newExperience=[]
for i in experience:
    newExperience.append(categorize4(i))
print(set(newExperience))

{'experienceNot Available', 'experienceSame', 'experienceAbove', 'experienceBelow'}


In [34]:
def categorize5(i):
    if i=="Same as the national average":
        return "effectivenessSame"
    elif i=="Below the national average":
        return "effectivenessBelow"
    elif i=="Not Available":
        return "effectivenessNot Available"
    else:
        return "effectivenessAbove"

In [35]:
effectiveness = data3['Effectiveness of care national comparison']
effectiveness.unique()

array(['Same as the national average', 'Below the national average',
       'Above the national average', 'Not Available'], dtype=object)

In [36]:
newEffectiveness=[]
for i in effectiveness:
    newEffectiveness.append(categorize5(i))
print(set(newEffectiveness))

{'effectivenessSame', 'effectivenessAbove', 'effectivenessNot Available', 'effectivenessBelow'}


In [37]:
def categorize6(i):
    if i=="Same as the national average":
        return "timelinessSame"
    elif i=="Below the national average":
        return "timelinessBelow"
    elif i=="Not Available":
        return "timelinessNot Available"
    else:
        return "timelinessAbove"

In [38]:
timeliness = data3['Timeliness of care national comparison']
timeliness.unique()

array(['Same as the national average', 'Above the national average',
       'Below the national average', 'Not Available'], dtype=object)

In [39]:
newTimeliness=[]
for i in timeliness:
    newTimeliness.append(categorize6(i))
print(set(newTimeliness))

{'timelinessNot Available', 'timelinessSame', 'timelinessAbove', 'timelinessBelow'}


In [40]:
def categorize7(i):
    if i=="Same as the national average":
        return "medImageSame"
    elif i=="Below the national average":
        return "medImageBelow"
    elif i=="Not Available":
        return "medImageNot Available"
    else:
        return "medImageAbove"

In [41]:
medImaging = data3['Efficient use of medical imaging national comparison']
medImaging.unique()

array(['Same as the national average', 'Below the national average',
       'Not Available', 'Above the national average'], dtype=object)

In [42]:
newMedImaging=[]
for i in medImaging:
    newMedImaging.append(categorize7(i))
print(set(newMedImaging))

{'medImageSame', 'medImageBelow', 'medImageNot Available', 'medImageAbove'}


In [43]:
#sample user input
#information users had in a hospital
print("Enter the following hospital infomation.")
print("")

rating = input('Rating: ')
hospitalOwner = input('Hostpital Owner: ')
hospitalType = input('Hostpital Type: ')
mortality = input('Mortality: ')
safety = input('Safety: ')
readmission = input('Readmission: ')
experience = input('Experience: ')
effectiveness = input('Effectiveness: ')
timeliness = input('Timeliness: ')
medImaging = input('Medical Imaging: ')

#hospital details based on the patient's input
patientInput = [str(rating), owner(hospitalOwner), hosType(hospitalType), 
                categorize1(mortality), categorize2(safety),categorize3(readmission)
                ,categorize4(experience),categorize5(effectiveness),categorize6(timeliness), 
                categorize7(medImaging)]

prefHospital=""
for i in patientInput:
    prefHospital+=str(i) + " "
prefHospital

Enter the following hospital infomation.

Rating: 3
Hostpital Owner: Proprietary
Hostpital Type: Childrens
Mortality: Same as the national average
Safety: Above the national average
Readmission: Below the national average
Experience: Same as the national average
Effectiveness: Same as the national average
Timeliness: Above the national average
Medical Imaging: Below the national average


'3 Proprietary Childrens MortalitySame SafetyAbove readmissionBelow experienceSame effectivenessSame timelinessAbove medImageBelow '

In [44]:
type(prefHospital)

str

In [45]:
d = {'Hospital Name': hospitalName, 'Hospital Rating': newRating, 'Hospital Ownership': newOwnership, 'Hospital Type':newHospitalType, 'Emergency Services': newEmergency, 'Mortality': newMortality, 'Safety': newSafety, 'Readmission': newReadmission, 'Experience': newExperience, 'Effectiveness': newEffectiveness, 'Timeliness': newTimeliness, 'Medical Imaging': newMedImaging}
hospitalData = pd.DataFrame(data=d)
hospitalData

Unnamed: 0,Hospital Name,Hospital Rating,Hospital Ownership,Hospital Type,Emergency Services,Mortality,Safety,Readmission,Experience,Effectiveness,Timeliness,Medical Imaging
0,SOUTHEAST ALABAMA MEDICAL CENTER,3,Government-HospitalDistrictorAuthority,AcuteCareHospitals,1,MortalitySame,SafetyAbove,readmissionSame,experienceBelow,effectivenessSame,timelinessSame,medImageSame
1,MARSHALL MEDICAL CENTER SOUTH,3,Government-HospitalDistrictorAuthority,AcuteCareHospitals,1,MortalityBelow,SafetySame,readmissionAbove,experienceSame,effectivenessSame,timelinessAbove,medImageBelow
2,ELIZA COFFEE MEMORIAL HOSPITAL,2,Government-HospitalDistrictorAuthority,AcuteCareHospitals,1,MortalityBelow,SafetySame,readmissionSame,experienceBelow,effectivenessSame,timelinessAbove,medImageSame
3,MIZELL MEMORIAL HOSPITAL,2,Voluntarynon-profit-Private,AcuteCareHospitals,1,MortalitySame,SafetyNot Available,readmissionBelow,experienceSame,effectivenessBelow,timelinessAbove,medImageNot Available
4,CRENSHAW COMMUNITY HOSPITAL,3,Proprietary,AcuteCareHospitals,1,MortalitySame,SafetyNot Available,readmissionSame,experienceNot Available,effectivenessSame,timelinessAbove,medImageNot Available
...,...,...,...,...,...,...,...,...,...,...,...,...
4807,FIRST TEXAS HOSPITAL,0,Proprietary,AcuteCareHospitals,1,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available
4808,LBJ TROPICAL MEDICAL CENTER,0,Government-HospitalDistrictorAuthority,AcuteCareHospitals,1,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available
4809,GUAM MEMORIAL HOSPITAL AUTHORITY,3,Government-Local,AcuteCareHospitals,1,MortalityBelow,SafetyNot Available,readmissionSame,experienceNot Available,effectivenessSame,timelinessNot Available,medImageNot Available
4810,GUAM REGIONAL MEDICAL CITY,0,Voluntarynon-profit-Private,AcuteCareHospitals,1,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available


In [46]:
#emergency threshhold is 30. less than that is not an emergency
if sev>30:
    emergency=1
else:
    emergency=0
    
hospital = hospitalData[hospitalData['Emergency Services']==emergency] #filter out hospitals based on emergency 
hospital = hospital.drop(columns=['Emergency Services'], axis=1) #remove the column since we alrady know if its an emergency or not
hospital

Unnamed: 0,Hospital Name,Hospital Rating,Hospital Ownership,Hospital Type,Mortality,Safety,Readmission,Experience,Effectiveness,Timeliness,Medical Imaging
28,GREENE COUNTY HOSPITAL,3,Government-Local,AcuteCareHospitals,MortalitySame,SafetyNot Available,readmissionSame,experienceNot Available,effectivenessSame,timelinessAbove,medImageNot Available
40,ATHENS LIMESTONE HOSPITAL,2,Government-HospitalDistrictorAuthority,AcuteCareHospitals,MortalitySame,SafetySame,readmissionBelow,experienceSame,effectivenessSame,timelinessAbove,medImageBelow
83,NORTH ALABAMA SPECIALITY HOSPITAL,0,Proprietary,AcuteCareHospitals,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available
87,ST VINCENTS BLOUNT,3,Voluntarynon-profit-Private,CriticalAccessHospitals,MortalitySame,SafetyNot Available,readmissionSame,experienceAbove,effectivenessSame,timelinessSame,medImageNot Available
131,TUBA CITY REGIONAL HEALTH CARE CORPORATION,3,Voluntarynon-profit-Other,AcuteCareHospitals,MortalityNot Available,SafetySame,readmissionSame,experienceBelow,effectivenessSame,timelinessNot Available,medImageNot Available
...,...,...,...,...,...,...,...,...,...,...,...
4745,MAYHILL HOSPITAL,0,Proprietary,AcuteCareHospitals,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available
4771,TEXAS HEALTH HEART & VASCULAR HOSPITAL ARLINGTON,4,Proprietary,AcuteCareHospitals,MortalitySame,SafetySame,readmissionSame,experienceAbove,effectivenessNot Available,timelinessNot Available,medImageNot Available
4772,HOPEBRIDGE HOSPITAL,0,Proprietary,AcuteCareHospitals,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available
4788,"ASPIRE BEHAVIORAL HEALTH OF CONROE, LLC",0,Physician,AcuteCareHospitals,MortalityNot Available,SafetyNot Available,readmissionNot Available,experienceNot Available,effectivenessNot Available,timelinessNot Available,medImageNot Available


### Cosine Similarity

In [47]:
#Function to convert whole array into a single string
def listToString(s): 
    # initialize an empty string
    str1 = " " 
    # return string  
    return (str1.join(s))

In [48]:
output = []
#getting all details and appending in a 2d array
for i in range(len(hospital)):
    b1 = hospital.iloc[i].to_numpy()
    output.append(b1)
output

[array(['GREENE COUNTY HOSPITAL', '3', 'Government-Local',
        'AcuteCareHospitals', 'MortalitySame', 'SafetyNot Available',
        'readmissionSame', 'experienceNot Available', 'effectivenessSame',
        'timelinessAbove', 'medImageNot Available'], dtype=object),
 array(['ATHENS LIMESTONE HOSPITAL', '2',
        'Government-HospitalDistrictorAuthority', 'AcuteCareHospitals',
        'MortalitySame', 'SafetySame', 'readmissionBelow',
        'experienceSame', 'effectivenessSame', 'timelinessAbove',
        'medImageBelow'], dtype=object),
 array(['NORTH ALABAMA SPECIALITY HOSPITAL', '0', 'Proprietary',
        'AcuteCareHospitals', 'MortalityNot Available',
        'SafetyNot Available', 'readmissionNot Available',
        'experienceNot Available', 'effectivenessNot Available',
        'timelinessNot Available', 'medImageNot Available'], dtype=object),
 array(['ST VINCENTS BLOUNT', '3', 'Voluntarynon-profit-Private',
        'CriticalAccessHospitals', 'MortalitySame', 'SafetyNo

In [49]:
input1 = [listToString(patientInput)] #converts whole array of input into a single string
cosineHospitalRecommendation = [] #list for the hospiral similarity percentage

clf = TfidfVectorizer()
for i in range(len(output)):
    output1 = [listToString(output[i][1:])] #converts the array into a single string
    df1 = pd.DataFrame(data={'a':input1, 'b':output1}) #dataframe for comparison
    clf.fit(df1['a'] + " " + df1['b'])

    tfidf_a = clf.transform(df1['a']).todense()

    tfidf_b = clf.transform(df1['b']).todense()

    row_similarities = [1 - spatial.distance.cosine(tfidf_a[x],tfidf_b[x]) for x in range(len(tfidf_a)) ] #Computes for cosine sim
    
    cosineHospitalRecommendation.append([output[i][0], row_similarities[0]])

cosineRecommendation = sorted(cosineHospitalRecommendation, key=lambda x:x[1], reverse=True)

In [50]:
for i in range(10):
    print(cosineRecommendation[i])

['ALLIANCEHEALTH WOODWARD', 0.6666666666666667]
['ATHENS LIMESTONE HOSPITAL', 0.6324555320336759]
['TENNOVA HEALTHCARE-JEFFERSON MEMORIAL HOSPITAL', 0.5555555555555556]
['KENTUCKIANA MEDICAL CENTER LLC', 0.5555555555555555]
['WESTSIDE REGIONAL MEDICAL CENTER', 0.5270462766947298]
['LAKE HURON MEDICAL CENTER', 0.5025189076296059]
['ALLIANCE COMMUNITY HOSPITAL', 0.5025189076296059]
['KECK HOSPITAL OF USC', 0.4811252243246882]
['CROSSROADS COMMUNITY HOSPITAL', 0.4444444444444444]
['CITIZENS MEMORIAL HOSPITAL', 0.42163702135578385]


### Jaccard Similarity

In [51]:
hospitalInfo = []

#convert 
for i in range(len(hospital)):
    info = hospital.iloc[i].tolist() #get the rows of the dataframe
    #get the necessay info and add it to a dictionary
    d = {
        'Hospital Name': info[0],
        'Hospital Details': info[1:]
    }
    hospitalInfo.append(d) #append dictionary to list
hospitalInfo

[{'Hospital Name': 'GREENE COUNTY HOSPITAL',
  'Hospital Details': ['3',
   'Government-Local',
   'AcuteCareHospitals',
   'MortalitySame',
   'SafetyNot Available',
   'readmissionSame',
   'experienceNot Available',
   'effectivenessSame',
   'timelinessAbove',
   'medImageNot Available']},
 {'Hospital Name': 'ATHENS LIMESTONE HOSPITAL',
  'Hospital Details': ['2',
   'Government-HospitalDistrictorAuthority',
   'AcuteCareHospitals',
   'MortalitySame',
   'SafetySame',
   'readmissionBelow',
   'experienceSame',
   'effectivenessSame',
   'timelinessAbove',
   'medImageBelow']},
 {'Hospital Name': 'NORTH ALABAMA SPECIALITY HOSPITAL',
  'Hospital Details': ['0',
   'Proprietary',
   'AcuteCareHospitals',
   'MortalityNot Available',
   'SafetyNot Available',
   'readmissionNot Available',
   'experienceNot Available',
   'effectivenessNot Available',
   'timelinessNot Available',
   'medImageNot Available']},
 {'Hospital Name': 'ST VINCENTS BLOUNT',
  'Hospital Details': ['3',
   'V

In [52]:
#formula for jaccard similarity is abs(a intersection b)/abs(a union b)
#we base the intersection and union to the individual index since each index represents a different category
def jaccard(a,b):
    intersection=0
    for i in range(len(a)):
        if(a[i]==b[i]): #if the indexes produce a similar result
            intersection+=1 
    union = (len(a)*2)-intersection #union is the number of elements * two and subtracted to the number of intersections
    return abs(intersection)/abs(union)

In [53]:
jaccardHospitalRecommendation = [] #list for the hospiral similarity percentage

#get the similarity percentange
def getJaccardSimilarities(patientInput):
        recommendedHospitals = []
        for i in range(len(hospitalInfo)):
            result = jaccard(patientInput, hospitalInfo[i]['Hospital Details'])
            jaccardHospitalRecommendation.append([hospitalInfo[i]['Hospital Name'], result])
        return sorted(jaccardHospitalRecommendation, key=lambda x:x[1], reverse=True) #sort based on the similarity percent

In [54]:
jaccardRecommendation = getJaccardSimilarities(patientInput)

for i in range(10):
    print(jaccardRecommendation[i])

['ALLIANCEHEALTH WOODWARD', 0.5384615384615384]
['ATHENS LIMESTONE HOSPITAL', 0.42857142857142855]
['KECK HOSPITAL OF USC', 0.42857142857142855]
['LAKE HURON MEDICAL CENTER', 0.42857142857142855]
['TENNOVA HEALTHCARE-JEFFERSON MEMORIAL HOSPITAL', 0.42857142857142855]
['MARK TWAIN MEDICAL CENTER', 0.3333333333333333]
['WESTSIDE REGIONAL MEDICAL CENTER', 0.3333333333333333]
['SARAH BUSH LINCOLN HEALTH CENTER', 0.3333333333333333]
['CROSSROADS COMMUNITY HOSPITAL', 0.3333333333333333]
['KENTUCKIANA MEDICAL CENTER LLC', 0.3333333333333333]
