In [1]:
import numpy as np
import pandas as pd
import sklearn
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

In [2]:
#Load the data
df = pd.read_excel('C:/Users/Sai/Documents/Deepsphere AI/Patient Dataset1.xlsx')
df.head(7)

Unnamed: 0,PATIENT ID,AGE,EDUCATION,MARITAL STATUS,FINANCIAL STATUS,MEDICLAIM,NUMBER OF MEDICATIONS,ALCOHOL ADICTION,SMOKING,RESPIRATORY RATE,...,COUGH,FEVER,STOMACH ACHE,DROWSINESS,BLEEDING,MEDICINE 1,MEDICINE 2,MEDICINE 3,MEDICINE 4,MEDICINE 5
0,PAT001,65,3,Unmarried,Poor,Yes,1,Yes,Once a week,11,...,No,Low,No,Yes,Low,Clopidogrel,Filgrastim,Imatinib,Tramadol,Hydrocodone
1,PAT002,62,4,Unmarried,Poor,No,2,Ocassionally,Once a week,12,...,No,High,No,Yes,No,Atorvastatin,Diazepam,Imatinib,Sevoflurane,Generic Prilosec
2,PAT003,62,1,Married,Poor,Yes,10,Ocassionally,Yes,7,...,No,High,No,Yes,No,Clindamycin,EMLA cream,Olanzapine,Permethrin,Amoxicillin
3,PAT004,58,2,Unmarried,Poor,No,6,Yes,Once a week,9,...,No,High,No,Yes,Low,Cefixime,Dacarbazine,Misoprostol,Zinc Sulfate,Generic Synthroid
4,PAT005,72,3,Married,Poor,No,2,No,No,13,...,No,High,No,Yes,No,Cetrizine,Ifosfamide,Mefloquine,Zinc Sulfate,Generic Prilosec
5,PAT006,46,3,Unmarried,Middle Class,No,7,No,Yes,14,...,No,High,No,Yes,No,Betamethasone,Famotidine,Mesna,Zinc Sulfate,Generic Synthroid
6,PAT007,26,4,Unmarried,Poor,No,4,No,Yes,9,...,No,Low,No,Yes,No,Clindamycin,Filgrastim,Ipratropium bromide,Permethrin,Lisinopril


In [3]:
df.shape

(10000, 60)

# Data Preprocessing

In [4]:
#Show all of the column names
df.columns.values

array(['PATIENT ID', 'AGE', 'EDUCATION', 'MARITAL STATUS',
       'FINANCIAL STATUS', 'MEDICLAIM', 'NUMBER OF MEDICATIONS',
       'ALCOHOL ADICTION', 'SMOKING', 'RESPIRATORY RATE',
       'BLOOD PRESSURE', 'CHOLESTROL LEVEL', 'MOOD', 'SLEEP(HOURS)',
       'ENERGY LEVELS', 'HEADACHE', 'EATING HABITS', 'CONCENTRATION',
       'DRUG COST', 'FOLLOWUP CHECKUPS', 'PRESCRIBTION START DATE',
       'PRESCRIBTION END DATE', 'REFILL FREQUENCY',
       'PRESCRIBED MEDICATION PERIOD(MONTHS)', 'REFILLS DONE',
       'SEVIERITY OF ILLNESS(CHECKUP 1)',
       'SEVIERITY OF ILLNESS(CHECKUP 2)',
       'SEVIERITY OF ILLNESS(CHECKUP 3)',
       'SEVIERITY OF ILLNESS(CHECKUP 4)',
       'SEVIERITY OF ILLNESS(CHECKUP 5)', 'FREQUENCY', 'CHRONIC ILLNESS',
       'CONSULTATION TIME', 'MISTRUST', 'DIARRHEA', 'CONSTIPATION',
       'NAUSEA', 'VOMITING', 'SORE THROAT', 'OBESITY', 'WEIGHT LOSS',
       'VISION LOSS', 'VITAMIN DEFICIENCY', 'NERVE PROBLEMS',
       'FREQUENT SWEATING', 'FREQUENT URINATION', 'BLA

In [5]:
#Check for na or missing data
df.isna().sum()

PATIENT ID                              0
AGE                                     0
EDUCATION                               0
MARITAL STATUS                          0
FINANCIAL STATUS                        0
MEDICLAIM                               0
NUMBER OF MEDICATIONS                   0
ALCOHOL ADICTION                        0
SMOKING                                 0
RESPIRATORY RATE                        0
BLOOD PRESSURE                          0
CHOLESTROL LEVEL                        0
MOOD                                    0
SLEEP(HOURS)                            0
ENERGY LEVELS                           0
HEADACHE                                0
EATING HABITS                           0
CONCENTRATION                           0
DRUG COST                               0
FOLLOWUP CHECKUPS                       0
PRESCRIBTION START DATE                 0
PRESCRIBTION END DATE                   0
REFILL FREQUENCY                        0
PRESCRIBED MEDICATION PERIOD(MONTH

In [6]:
X = df.drop(['PATIENT ID', 'AGE', 'EDUCATION', 'MARITAL STATUS', 'ALCOHOL ADICTION', 'SMOKING', 'RESPIRATORY RATE', 'BLOOD PRESSURE', 'CHOLESTROL LEVEL', 'MOOD', 'SLEEP(HOURS)', 'ENERGY LEVELS', 'HEADACHE', 'EATING HABITS', 'CONCENTRATION', 'PRESCRIBTION START DATE', 'PRESCRIBTION END DATE', 'CHRONIC ILLNESS', 'MISTRUST', 'DIARRHEA', 'CONSTIPATION', 'NAUSEA', 'VOMITING', 'SORE THROAT', 'OBESITY', 'WEIGHT LOSS', 'VISION LOSS', 'VITAMIN DEFICIENCY', 'NERVE PROBLEMS', 'FREQUENT SWEATING', 'FREQUENT URINATION', 'BLADDER INFFECTION', 'LOSS OF SENSATION', 'ALLERGIES', 'COLD', 'COUGH', 'FEVER', 'STOMACH ACHE', 'DROWSINESS', 'BLEEDING', 'MEDICINE 1', 'MEDICINE 2', 'MEDICINE 3', 'MEDICINE 4', 'MEDICINE 5', 'FINANCIAL STATUS', 'MEDICLAIM', 'NUMBER OF MEDICATIONS', 'DRUG COST', 'FOLLOWUP CHECKUPS', 'FREQUENCY', 'CONSULTATION TIME'], axis = 1)

In [7]:
X.columns.values

array(['REFILL FREQUENCY', 'PRESCRIBED MEDICATION PERIOD(MONTHS)',
       'REFILLS DONE', 'SEVIERITY OF ILLNESS(CHECKUP 1)',
       'SEVIERITY OF ILLNESS(CHECKUP 2)',
       'SEVIERITY OF ILLNESS(CHECKUP 3)',
       'SEVIERITY OF ILLNESS(CHECKUP 4)',
       'SEVIERITY OF ILLNESS(CHECKUP 5)'], dtype=object)

In [8]:
#Convert all the non-numeric columns to numerical data types
for column in X.columns:
    if X[column].dtype == np.number:
        continue
    X[column] = LabelEncoder().fit_transform(X[column])

  if X[column].dtype == np.number:


In [9]:
#Check the new dataset datatypes
X.dtypes

REFILL FREQUENCY                        int32
PRESCRIBED MEDICATION PERIOD(MONTHS)    int64
REFILLS DONE                            int64
SEVIERITY OF ILLNESS(CHECKUP 1)         int64
SEVIERITY OF ILLNESS(CHECKUP 2)         int64
SEVIERITY OF ILLNESS(CHECKUP 3)         int64
SEVIERITY OF ILLNESS(CHECKUP 4)         int64
SEVIERITY OF ILLNESS(CHECKUP 5)         int64
dtype: object

In [10]:
X.head(1)

Unnamed: 0,REFILL FREQUENCY,PRESCRIBED MEDICATION PERIOD(MONTHS),REFILLS DONE,SEVIERITY OF ILLNESS(CHECKUP 1),SEVIERITY OF ILLNESS(CHECKUP 2),SEVIERITY OF ILLNESS(CHECKUP 3),SEVIERITY OF ILLNESS(CHECKUP 4),SEVIERITY OF ILLNESS(CHECKUP 5)
0,0,4,3,0,0,0,0,0


In [11]:
#Standardizing/Scaling the features
X1 = pd.DataFrame(StandardScaler().fit_transform(X))
X1.columns = X.columns

In [12]:
X1.head()

Unnamed: 0,REFILL FREQUENCY,PRESCRIBED MEDICATION PERIOD(MONTHS),REFILLS DONE,SEVIERITY OF ILLNESS(CHECKUP 1),SEVIERITY OF ILLNESS(CHECKUP 2),SEVIERITY OF ILLNESS(CHECKUP 3),SEVIERITY OF ILLNESS(CHECKUP 4),SEVIERITY OF ILLNESS(CHECKUP 5)
0,-1.201314,-0.409185,-0.232274,-2.086771,-1.096817,-0.765045,-0.559511,-0.431488
1,0.022519,-2.076436,1.524383,-2.086771,-1.096817,-0.765045,-0.559511,-0.431488
2,-1.201314,-0.409185,0.646055,-2.086771,-1.096817,-0.765045,-0.559511,-0.431488
3,1.246351,-0.825998,-0.671438,-2.086771,-1.096817,-0.765045,-0.559511,-0.431488
4,0.022519,-0.825998,0.20689,-2.086771,-1.096817,-0.765045,-0.559511,-0.431488


# Finding Adherent and Non Adherent Patients

In [17]:
#Clustering Algorithm
kmeans = KMeans(n_clusters=2)
kmeans.fit(X1)
y_kmeans = kmeans.predict(X1)

In [18]:
print(y_kmeans)

[0 0 0 ... 1 1 0]


In [26]:
df['Label'] = pd.Series(y_kmeans, index=df.index)

In [27]:
df.to_excel('C:/Users/Sai/Documents/Deepsphere AI/PatientOutput11.xlsx')

# Reason for Non-Adherence

In [70]:
#Convert all the non-numeric columns to numerical data types
for column in df.columns:
    if df[column].dtype == np.number:
        continue
    df[column] = LabelEncoder().fit_transform(df[column])

  if df[column].dtype == np.number:


In [71]:
#Scale the cleaned data
x = df.drop(['PATIENT ID', 'AGE', 'EDUCATION', 'MARITAL STATUS', 'MEDICLAIM', 'NUMBER OF MEDICATIONS', 'RESPIRATORY RATE', 'FOLLOWUP CHECKUPS', 'PRESCRIBTION START DATE', 'PRESCRIBTION END DATE', 'REFILL FREQUENCY', 'PRESCRIBED MEDICATION PERIOD(MONTHS)', 'REFILLS DONE', 'SEVIERITY OF ILLNESS(CHECKUP 1)', 'SEVIERITY OF ILLNESS(CHECKUP 2)', 'SEVIERITY OF ILLNESS(CHECKUP 3)', 'SEVIERITY OF ILLNESS(CHECKUP 4)', 'SEVIERITY OF ILLNESS(CHECKUP 5)', 'FREQUENCY', 'CONSULTATION TIME', 'FREQUENT SWEATING', 'FREQUENT URINATION', 'BLADDER INFFECTION', 'LOSS OF SENSATION', 'MEDICINE 1', 'MEDICINE 2', 'MEDICINE 3', 'MEDICINE 4', 'MEDICINE 5', 'CHOLESTROL LEVEL', 'EATING HABITS', 'CONCENTRATION', 'CONSTIPATION', 'OBESITY', 'WEIGHT LOSS', 'VISION LOSS', 'VITAMIN DEFICIENCY', 'BLEEDING', 'NAUSEA', 'HEADACHE', 'SLEEP(HOURS)', 'NERVE PROBLEMS', 'DROWSINESS', 'COUGH', 'Label'], axis = 1)
y = df['Label']

#Standardizing/Scaling the features
X2 = StandardScaler().fit_transform(x)

In [72]:
#Split the data into 80% training and 20% testing
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.2, random_state=42)

In [77]:
#Create the model
#from sklearn.ensemble import RandomForestClassifier
#model = RandomForestClassifier(n_estimators=100)
#from sklearn.tree import DecisionTreeClassifier
#model = DecisionTreeClassifier()
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
#Train the model
model.fit(x_train, y_train)

LogisticRegression()

In [81]:
predictions = model.predict(x_test)

#printing the predictions
print(predictions)

[1 1 1 ... 1 1 1]


In [82]:
from sklearn.metrics import accuracy_score
accuracy_score = accuracy_score(y_test, predictions)
print(accuracy_score)

0.718375


In [78]:
#Permutation Importance
import eli5
from eli5.sklearn import PermutationImportance
perm = PermutationImportance(model,random_state=1)
perm.fit(x_test, y_test)
eli5.show_weights(perm, feature_names = x_test.columns.tolist())

Weight,Feature
-0.0000  ± 0.0001,SMOKING
-0.0001  ± 0.0010,VOMITING
-0.0005  ± 0.0006,CHRONIC ILLNESS
-0.0005  ± 0.0002,MOOD
-0.0005  ± 0.0009,FEVER
-0.0006  ± 0.0007,FINANCIAL STATUS
-0.0008  ± 0.0005,SORE THROAT
-0.0008  ± 0.0015,ENERGY LEVELS
-0.0011  ± 0.0012,ALCOHOL ADICTION
-0.0013  ± 0.0005,BLOOD PRESSURE
