In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

In [2]:
# Load the dataset
df = pd.read_parquet("Train.parquet")

In [3]:
df

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1
...,...,...,...
29080886,a0ee9f75-1c7c-11ec-94c7-16262ee38c7f,2018-07-06,DRUG_TYPE_6
29080897,a0ee1284-1c7c-11ec-a3d5-16262ee38c7f,2017-12-29,DRUG_TYPE_6
29080900,a0ee9b26-1c7c-11ec-8a40-16262ee38c7f,2018-10-18,DRUG_TYPE_10
29080903,a0ee1a92-1c7c-11ec-8341-16262ee38c7f,2015-09-18,DRUG_TYPE_6


# Positive and Negative Sets:

In [4]:
# Positive Set: Patients who have taken the "Target Drug"
positive_set = df[df["Incident"] == "TARGET DRUG"]

In [5]:
positive_set

Unnamed: 0,Patient-Uid,Date,Incident
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG
...,...,...,...
29074998,a0ef2b6d-1c7c-11ec-9172-16262ee38c7f,2018-10-12,TARGET DRUG
29075105,a0ebe423-1c7c-11ec-a5e0-16262ee38c7f,2019-07-02,TARGET DRUG
29075494,a0ebc713-1c7c-11ec-bd53-16262ee38c7f,2019-05-21,TARGET DRUG
29080031,a0ee1bdb-1c7c-11ec-90ba-16262ee38c7f,2018-06-07,TARGET DRUG


In [6]:
# Negative Set: Patients who have not taken the "Target Drug" within a certain time period
negative_set = df[~df["Patient-Uid"].isin(positive_set["Patient-Uid"])]
negative_set = negative_set.groupby("Patient-Uid").tail(1)  # Consider the most recent event for each patient

# Feature Engineering:

In [7]:
# Frequency-based features: Count of previous prescriptions within specific time intervals
positive_set["Prescription_Count"] = positive_set.groupby("Patient-Uid")["Date"].cumcount()
negative_set["Prescription_Count"] = negative_set.groupby("Patient-Uid")["Date"].cumcount()

# Time-based features: Time difference between the most recent prescription and the prediction date
prediction_date = pd.to_datetime("today") + pd.DateOffset(days=30)
positive_set["Time_Difference"] = (prediction_date - positive_set.groupby("Patient-Uid")["Date"].transform("max")).dt.days
negative_set["Time_Difference"] = (prediction_date - negative_set.groupby("Patient-Uid")["Date"].transform("max")).dt.days

# Concatenate the positive and negative sets
data = pd.concat([positive_set, negative_set])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_set["Prescription_Count"] = positive_set.groupby("Patient-Uid")["Date"].cumcount()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  positive_set["Time_Difference"] = (prediction_date - positive_set.groupby("Patient-Uid")["Date"].transform("max")).dt.days


In [8]:
data

Unnamed: 0,Patient-Uid,Date,Incident,Prescription_Count,Time_Difference
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,0,1058
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,0,1312
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,0,1337
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,0,1042
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,0,1058
...,...,...,...,...,...
3256795,a0e045a1-1c7c-11ec-8014-16262ee38c7f,2020-07-10,PRIMARY_DIAGNOSIS,0,1092
3256799,a0e67e2a-1c7c-11ec-b805-16262ee38c7f,2015-12-16,PRIMARY_DIAGNOSIS,0,2760
3256800,a0dec400-1c7c-11ec-80df-16262ee38c7f,2019-08-06,PRIMARY_DIAGNOSIS,0,1431
3256804,a0e09919-1c7c-11ec-9e7d-16262ee38c7f,2017-02-19,DRUG_TYPE_6,0,2329


# Model Training and Evaluation:

In [9]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(data[["Prescription_Count", "Time_Difference"]], data["Incident"] == "TARGET DRUG", test_size=0.2, random_state=42)

# Train a logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions on the validation set
y_pred = model.predict(X_val)

# Evaluate the model
report = classification_report(y_val, y_pred)
print(report)

              precision    recall  f1-score   support

       False       0.86      0.84      0.85      3520
        True       0.96      0.96      0.96     13456

    accuracy                           0.94     16976
   macro avg       0.91      0.90      0.91     16976
weighted avg       0.94      0.94      0.94     16976



# Generating Predictions:

In [10]:
# Load the test dataset
test_df = pd.read_parquet("Test.parquet")

In [11]:
test_df

Unnamed: 0,Patient-Uid,Date,Incident
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0
...,...,...,...
1372854,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-05-11,DRUG_TYPE_13
1372856,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2018-08-22,DRUG_TYPE_2
1372857,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-02-04,DRUG_TYPE_2
1372858,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-09-25,DRUG_TYPE_8


In [12]:
# Apply feature engineering steps to the test data
test_df["Prescription_Count"] = test_df.groupby("Patient-Uid")["Date"].cumcount()
test_df["Time_Difference"] = (prediction_date - test_df.groupby("Patient-Uid")["Date"].transform("max")).dt.days

# Make predictions for the test data
test_predictions = model.predict(test_df[["Prescription_Count", "Time_Difference"]])

In [13]:
test_predictions

array([False,  True,  True, ...,  True,  True,  True])

In [15]:
# Assuming you have the ground truth labels for the validation set as y_val

# Calculate the F1-score
f1 = f1_score(y_val, y_pred)

# Print the F1-score
print("F1-score: ", f1)

F1-score:  0.9611977912018679


In [16]:
# Create the final_submission.csv file
final_submission = pd.DataFrame({"Patient-Uid": test_df["Patient-Uid"], "Prediction": test_predictions})
final_submission.to_csv("final_submission.csv", index=False)

In [17]:
# Read the contents of the final_submission.csv file
submission_df = pd.read_csv("final_submission.csv")

# Display the contents of the dataframe
print(submission_df.head())

                            Patient-Uid  Prediction
0  a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f       False
1  a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f        True
2  a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f        True
3  a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f        True
4  a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f        True
