In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta

In [3]:
num_samples = 100

In [24]:
data = {
    'patient_id': np.arange(num_patient_ids),
    'gender': np.random.choice(['Male', 'Female'], num_patient_ids),
    'medication_name': np.random.choice(['Medication A', 'Medication B', 'Medication C'], num_patient_ids),
    'medication_type': np.random.choice(['Tablet', 'Capsule', 'Injection'], num_patient_ids),
    'age': np.random.randint(18, 80, num_patient_ids),
    'duration_of_medication': np.random.randint(1, 365, num_patient_ids),  # Duration in days
    'adherence': np.random.choice([0, 1], num_patient_ids, p=[0.2, 0.8]),  # Assuming 80% adherence rate
    'reminder_time': [datetime.now() + timedelta(minutes=np.random.randint(1, 1440)) for _ in range(num_patient_ids)]
}

In [25]:
df = pd.DataFrame(data)

In [26]:
print(df.head())

   patient_id  gender medication_name medication_type  age  \
0           0  Female    Medication A       Injection   69   
1           1  Female    Medication A          Tablet   19   
2           2    Male    Medication A         Capsule   21   
3           3    Male    Medication C         Capsule   53   
4           4    Male    Medication B          Tablet   60   

   duration_of_medication  adherence              reminder_time  
0                     113          1 2023-07-14 17:18:30.072978  
1                     175          1 2023-07-14 10:38:30.072978  
2                     172          1 2023-07-13 22:42:30.072978  
3                     260          1 2023-07-14 02:20:30.072978  
4                     234          0 2023-07-14 18:20:30.072978  


In [27]:
df.to_csv('medication_dataset.csv', index=False)

In [28]:
print("Dataset saved as 'medication_dataset.csv'")

Dataset saved as 'medication_dataset.csv'


In [30]:
data = pd.read_csv('medication_dataset.csv')


In [31]:
data.dropna(inplace=True) 

In [32]:
categorical_cols = ['gender', 'medication_type']  # Assuming these are the categorical columns
encoder = OneHotEncoder(sparse=False)
encoded_cols = pd.DataFrame(encoder.fit_transform(data[categorical_cols]))
data = pd.concat([data, encoded_cols], axis=1)
data.drop(categorical_cols, axis=1, inplace=True)

In [33]:
numerical_cols = ['age', 'duration_of_medication']  # Assuming these are the numerical columns
scaler = StandardScaler()
data[numerical_cols] = scaler.fit_transform(data[numerical_cols])

In [34]:
X = data.drop('adherence', axis=1)  # Assuming 'adherence' is the target variable
y = data['adherence']

In [35]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [36]:
#Feature Extraction

In [37]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_extraction.text import CountVectorizer

In [38]:
data = pd.read_csv('medication_dataset.csv')


In [39]:
encoder = LabelEncoder()
data['gender_encoded'] = encoder.fit_transform(data['gender'])
# Assuming 'medication_type' is a categorical variable that needs encoding
data['medication_type_encoded'] = encoder.fit_transform(data['medication_type'])

In [40]:
vectorizer = CountVectorizer()
medication_name_features = vectorizer.fit_transform(data['medication_name'])

In [41]:
scaler = StandardScaler()
numerical_features = ['age', 'duration_of_medication']
data[numerical_features] = scaler.fit_transform(data[numerical_features])

In [42]:
selected_features = ['gender_encoded', 'medication_type_encoded', 'age', 'duration_of_medication']
data = data[selected_features + ['adherence']]


In [43]:
print(data.head())

   gender_encoded  medication_type_encoded       age  duration_of_medication  \
0               0                        1  1.267056               -0.678982   
1               0                        2 -1.649740               -0.129019   
2               1                        0 -1.533068               -0.155630   
3               1                        0  0.333681                0.624961   
4               1                        2  0.742033                0.394332   

   adherence  
0          1  
1          1  
2          1  
3          1  
4          0  


In [44]:
#Model Selection and Training

In [74]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OneHotEncoder

In [75]:
data = pd.read_csv('medication_dataset.csv')

In [76]:
data['reminder_time'] = pd.to_datetime(data['reminder_time'])

In [77]:
data['hour_of_day'] = data['reminder_time'].dt.hour
data['day_of_week'] = data['reminder_time'].dt.dayofweek

In [78]:
data.drop('reminder_time', axis=1, inplace=True)

In [79]:
categorical_cols = ['gender', 'medication_name', 'medication_type']
data = pd.get_dummies(data, columns=categorical_cols)

In [80]:
X = data.drop('adherence', axis=1)
y = data['adherence']

In [81]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [82]:
model = LogisticRegression()

In [83]:
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [84]:
y_pred = model.predict(X_test)

In [85]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.875


In [86]:
accuracy_percentage = accuracy * 100
print("Accuracy: {:.2f}%".format(accuracy_percentage))

Accuracy: 87.50%


In [87]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score

In [88]:
data = pd.read_csv('medication_dataset.csv')

In [90]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_pred)

In [92]:
print("Accuracy:", accuracy*100)
print("Precision:", precision*100)
print("Recall:", recall*100)
print("F1 Score:", f1*100)
print("ROC AUC Score:", roc_auc*100)

Accuracy: 87.5
Precision: 92.10526315789474
Recall: 94.5945945945946
F1 Score: 93.33333333333333
ROC AUC Score: 47.2972972972973


In [93]:
import pandas as pd
import joblib


In [94]:
data = pd.read_csv('medication_dataset.csv')

In [95]:
X = data.drop('adherence', axis=1)
y = data['adherence']

In [96]:
model = LogisticRegression()

In [98]:
joblib.dump(model, 'medication_model.pkl')
print("Model saved as 'medication_model.pkl'")

Model saved as 'medication_model.pkl'
