In [10]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

In [21]:
# Step 1: Load and filter the dataset
df = pd.read_parquet("Train.parquet")
target_drug = "Target Drug"
df_target = df[df["Incident"] == target_drug]

In [22]:
df

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1
...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6


In [23]:
# Convert the "Date" column to a proper date format
df_target["Date"] = pd.to_datetime(df_target["Date"])

In [24]:
df_target["Date"]

Series([], Name: Date, dtype: datetime64[ns])

In [30]:
# Step 2: Extract prescription intervals
df_patient_intervals = df_target.groupby("Patient-Uid")["Date"].diff().dt.days

# Check if there are prescriptions for the "Target Drug"
if df_patient_intervals.dropna().empty:
    print("No prescriptions found for the Target Drug.")
else:
    # Step 3: Apply clustering
    X = df_patient_intervals.dropna().values.reshape(-1, 1)
    kmeans = KMeans(n_clusters=3)  # Adjust the number of clusters as needed
    kmeans.fit(X)
    labels = kmeans.labels_
     # Step 4: Visualize prescription patterns
    df_target["Pattern"] = labels
    plt.figure(figsize=(10, 6))
    for pattern in range(kmeans.n_clusters):
        df_pattern = df_target[df_target["Pattern"] == pattern]
        prescriptions = df_pattern.groupby(df_pattern["Date"].dt.to_period("M")).size()
        plt.plot(prescriptions.index, prescriptions.values, label=f"Pattern {pattern + 1}")

    plt.xlabel("Time (Month)")
    plt.ylabel("Prescriptions")
    plt.title("Prescription Patterns of Target Drug")
    plt.legend()
    plt.show()

No prescriptions found for the Target Drug.


In [33]:
df_patient_intervals.dropna().empty

True

In [34]:
# Additional troubleshooting information
print("Total prescriptions:", len(df_patient_intervals.dropna()))
print("Total unique patients:", len(df_target["Patient-Uid"].unique()))
print("Sample of prescription intervals:")
print(df_patient_intervals.head())

Total prescriptions: 0
Total unique patients: 0
Sample of prescription intervals:
Series([], Name: Date, dtype: int64)
