In [1]:
import pandas as pd
from datetime import timedelta

In [7]:
# Load the training data
train_data = pd.read_parquet('/content/drive/MyDrive/Dataset/Akaike/train.parquet')

In [8]:
train_data.head()

Unnamed: 0,Patient-Uid,Date,Incident
0,a0db1e73-1c7c-11ec-ae39-16262ee38c7f,2019-03-09,PRIMARY_DIAGNOSIS
1,a0dc93f2-1c7c-11ec-9cd2-16262ee38c7f,2015-05-16,PRIMARY_DIAGNOSIS
3,a0dc94c6-1c7c-11ec-a3a0-16262ee38c7f,2018-01-30,SYMPTOM_TYPE_0
4,a0dc950b-1c7c-11ec-b6ec-16262ee38c7f,2015-04-22,DRUG_TYPE_0
8,a0dc9543-1c7c-11ec-bb63-16262ee38c7f,2016-06-18,DRUG_TYPE_1


In [9]:
train_data.dtypes

Patient-Uid            object
Date           datetime64[ns]
Incident               object
dtype: object

In [11]:
train_data.dtypes

Patient-Uid            object
Date           datetime64[ns]
Incident               object
dtype: object

In [12]:
# Function to generate positive set
def generate_positive_set(df):
    positive_set = df[df['Incident'] == 'TARGET DRUG']
    positive_set = positive_set.drop_duplicates(subset='Patient-Uid', keep='first')
    positive_set['Target'] = 1
    return positive_set

In [13]:
# Function to generate negative set
def generate_negative_set(df, window_days=30):
    negative_set = df[df['Incident'] != 'TARGET DRUG']
    max_date = negative_set['Date'].max()
    negative_set = negative_set[negative_set['Date'] <= (max_date - timedelta(days=window_days))]
    negative_set = negative_set.drop_duplicates(subset='Patient-Uid', keep='first')
    negative_set['Target'] = 0
    return negative_set

In [14]:
# Combine positive and negative sets
positive_set = generate_positive_set(train_data)
negative_set = generate_negative_set(train_data)
combined_set = pd.concat([positive_set, negative_set])

In [15]:
combined_set

Unnamed: 0,Patient-Uid,Date,Incident,Target
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,1
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,1
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,1
3309423,a0ecc615-1c7c-11ec-aa31-16262ee38c7f,2019-11-15,TARGET DRUG,1
3309494,a0ea612f-1c7c-11ec-8cf0-16262ee38c7f,2020-03-18,TARGET DRUG,1
...,...,...,...,...
3649495,a0eefa38-1c7c-11ec-b02c-16262ee38c7f,2020-07-03,PRIMARY_DIAGNOSIS,0
4122450,a0f0d553-1c7c-11ec-a70a-16262ee38c7f,2015-05-22,SYMPTOM_TYPE_7,0
4153324,a0f0d582-1c7c-11ec-a6c1-16262ee38c7f,2020-06-05,PRIMARY_DIAGNOSIS,0
4171015,a0f0d087-1c7c-11ec-8042-16262ee38c7f,2020-07-07,PRIMARY_DIAGNOSIS,0


In [16]:
# Feature engineering: Add frequency-based features
frequency_features = combined_set.groupby('Patient-Uid')['Incident'].value_counts().unstack(fill_value=0)
combined_set = combined_set.merge(frequency_features, left_on='Patient-Uid', right_index=True)

In [17]:
combined_set.head()

Unnamed: 0,Patient-Uid,Date,Incident,Target,DRUG_TYPE_0,DRUG_TYPE_1,DRUG_TYPE_10,DRUG_TYPE_11,DRUG_TYPE_12,DRUG_TYPE_13,...,SYMPTOM_TYPE_7,SYMPTOM_TYPE_8,SYMPTOM_TYPE_9,TARGET DRUG,TEST_TYPE_0,TEST_TYPE_1,TEST_TYPE_2,TEST_TYPE_3,TEST_TYPE_4,TEST_TYPE_5
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
28215,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2019-07-03,PRIMARY_DIAGNOSIS,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
90116,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2019-07-19,SYMPTOM_TYPE_5,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,1,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0


In [18]:
# Feature engineering: Add time-based features
combined_set['Days_since_last_incident'] = combined_set.groupby('Patient-Uid')['Date'].diff().dt.days.fillna(0)

In [19]:
combined_set.head()

Unnamed: 0,Patient-Uid,Date,Incident,Target,DRUG_TYPE_0,DRUG_TYPE_1,DRUG_TYPE_10,DRUG_TYPE_11,DRUG_TYPE_12,DRUG_TYPE_13,...,SYMPTOM_TYPE_8,SYMPTOM_TYPE_9,TARGET DRUG,TEST_TYPE_0,TEST_TYPE_1,TEST_TYPE_2,TEST_TYPE_3,TEST_TYPE_4,TEST_TYPE_5,Days_since_last_incident
3294791,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2020-04-09,TARGET DRUG,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
28215,a0eb742b-1c7c-11ec-8f61-16262ee38c7f,2019-07-03,PRIMARY_DIAGNOSIS,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,-281.0
3296990,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2018-06-12,TARGET DRUG,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0
90116,a0edaf09-1c7c-11ec-a360-16262ee38c7f,2019-07-19,SYMPTOM_TYPE_5,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,402.0
3305387,a0e9fa0e-1c7c-11ec-8dc7-16262ee38c7f,2019-06-11,TARGET DRUG,1,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0.0


In [20]:
# Split the data into features and target
X = combined_set.drop(['Target', 'Patient-Uid', 'Date', 'Incident'], axis=1)
y = combined_set['Target']

In [21]:
# Train-Validation Split
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
# Model Training
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

In [23]:
# Model Evaluation
from sklearn.metrics import f1_score

y_pred = model.predict(X_val)
f1_score_val = f1_score(y_val, y_pred)

In [31]:
# Generate Predictions for the Test Set
test_data = pd.read_parquet("/content/drive/MyDrive/Dataset/Akaike/test.parquet")

In [26]:
# Feature engineering for the test set
test_frequency_features = test_data.groupby('Patient-Uid')['Incident'].value_counts().unstack(fill_value=0)
test_data = test_data.merge(test_frequency_features, left_on='Patient-Uid', right_index=True)
test_data['Days_since_last_incident'] = test_data.groupby('Patient-Uid')['Date'].diff().dt.days.fillna(0)

In [32]:
# Feature engineering for the test set
test_frequency_features = test_data.groupby('Patient-Uid')['Incident'].value_counts().unstack(fill_value=0)
test_data = test_data.merge(test_frequency_features, left_on='Patient-Uid', right_index=True)


In [33]:
test_data.head()

Unnamed: 0,Patient-Uid,Date,Incident,DRUG_TYPE_0,DRUG_TYPE_1,DRUG_TYPE_10,DRUG_TYPE_11,DRUG_TYPE_12,DRUG_TYPE_13,DRUG_TYPE_14,...,SYMPTOM_TYPE_6,SYMPTOM_TYPE_7,SYMPTOM_TYPE_8,SYMPTOM_TYPE_9,TEST_TYPE_0,TEST_TYPE_1,TEST_TYPE_2,TEST_TYPE_3,TEST_TYPE_4,TEST_TYPE_5
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0,8,3,0,1,0,0,0,...,3,0,0,0,2,0,0,0,0,0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0,8,3,0,1,0,0,0,...,3,0,0,0,2,0,0,0,0,0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2,8,3,0,1,0,0,0,...,3,0,0,0,2,0,0,0,0,0
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1,8,3,0,1,0,0,0,...,3,0,0,0,2,0,0,0,0,0
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0,8,3,0,1,0,0,0,...,3,0,0,0,2,0,0,0,0,0


In [34]:
# Reindex the test set to include all incident types seen during training
all_incidents = X.columns
missing_incidents = list(set(all_incidents) - set(test_data.columns))
for incident in missing_incidents:
    test_data[incident] = 0

In [35]:
# Feature engineering: Add time-based features for test set
test_data['Days_since_last_incident'] = test_data.groupby('Patient-Uid')['Date'].diff().dt.days.fillna(0)

In [None]:
# Make predictions on the test set
X_test = test_data.drop(['Patient-Uid', 'Date', 'Incident'], axis=1)
test_predictions = model.predict(X_test)

In [37]:
# Make predictions on the test set
X_test = test_data[X.columns]  # Use only the columns present in the training set
test_predictions = model.predict(X_test)

In [38]:
# Save predictions to final_submission.csv
test_data['Predicted_Target'] = test_predictions
final_submission = test_data[['Patient-Uid', 'Predicted_Target']]
final_submission.to_csv('final_submission.csv', index=False)

In [39]:
test_data

Unnamed: 0,Patient-Uid,Date,Incident,DRUG_TYPE_0,DRUG_TYPE_1,DRUG_TYPE_10,DRUG_TYPE_11,DRUG_TYPE_12,DRUG_TYPE_13,DRUG_TYPE_14,...,SYMPTOM_TYPE_9,TEST_TYPE_0,TEST_TYPE_1,TEST_TYPE_2,TEST_TYPE_3,TEST_TYPE_4,TEST_TYPE_5,Days_since_last_incident,TARGET DRUG,Predicted_Target
0,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2016-12-08,SYMPTOM_TYPE_0,8,3,0,1,0,0,0,...,0,2,0,0,0,0,0,0.0,0,0
1,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-10-17,DRUG_TYPE_0,8,3,0,1,0,0,0,...,0,2,0,0,0,0,0,678.0,0,0
2,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-12-01,DRUG_TYPE_2,8,3,0,1,0,0,0,...,0,2,0,0,0,0,0,-320.0,0,0
3,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2018-12-05,DRUG_TYPE_1,8,3,0,1,0,0,0,...,0,2,0,0,0,0,0,369.0,0,0
4,a0f9e8a9-1c7c-11ec-8d25-16262ee38c7f,2017-11-04,SYMPTOM_TYPE_0,8,3,0,1,0,0,0,...,0,2,0,0,0,0,0,-396.0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1372854,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-05-11,DRUG_TYPE_13,1,0,0,0,0,30,0,...,0,0,0,0,0,0,0,107.0,0,0
1372856,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2018-08-22,DRUG_TYPE_2,1,0,0,0,0,30,0,...,0,0,0,0,0,0,0,468.0,0,0
1372857,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-02-04,DRUG_TYPE_2,1,0,0,0,0,30,0,...,0,0,0,0,0,0,0,-564.0,0,0
1372858,a10272c9-1c7c-11ec-b3ce-16262ee38c7f,2017-09-25,DRUG_TYPE_8,1,0,0,0,0,30,0,...,0,0,0,0,0,0,0,233.0,0,0


In [None]:
import pandas as pd
from datetime import timedelta

# Load the data
data = pd.read_parquet("Train.parquet")

# Convert the 'Date' column to datetime type
data['Date'] = pd.to_datetime(data['Date'])

# Filter data for "Target Drug" users only
target_drug_data = data[data['Incident'] == 'TARGET DRUG']

# Find the ideal treatment duration (e.g., 1 year)
ideal_treatment_duration = timedelta(days=365)

# Function to calculate the drop-off rate
def calculate_drop_off_rate(df, ideal_duration):
    # Calculate the number of months each patient continued the treatment
    df['Months_on_drug'] = (df.groupby('Patient-Uid')['Date'].diff() / pd.Timedelta(days=30)).fillna(0)

    # Count the number of patients who dropped off each month
    drop_off_rate = df[df['Months_on_drug'] < ideal_duration].groupby(df['Date'].dt.to_period('M')).size()

    return drop_off_rate

# Calculate the drop-off rate for "Target Drug"
drop_off_rate = calculate_drop_off_rate(target_drug_data, ideal_treatment_duration)

# Plot the drop-off rate over time
drop_off_rate.plot(kind='bar', figsize=(10, 6), title='Drop-off Rate of Target Drug')
plt.xlabel('Month')
plt.ylabel('Number of Patients')
plt.show()

# Analysis of events leading to drop-off
# Filter data for patients who dropped off
drop_off_patients = target_drug_data[target_drug_data['Months_on_drug'] < ideal_treatment_duration]

# Analyze events before drop-off
events_before_drop_off = drop_off_patients.groupby('Patient-Uid')['Incident'].agg(list)

# Perform further analysis or visualization to identify events leading to drop-off
# For example, plot the frequency of events before drop-off, identify common events, etc.
