In [19]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [20]:
positive_file_path = 'generic_features_for_positive_patient.csv'
negative_file_path = 'generic_features_for_random_patient.csv'
positive_data = pd.read_csv(positive_file_path)
negative_data = pd.read_csv(negative_file_path)

In [21]:
positive_data['label'] = 1
negative_data['label'] = 0

In [22]:
positive_data.shape

(2321, 22)

In [23]:
negative_data.shape

(2523, 22)

In [24]:
combined_data = pd.concat([positive_data, negative_data], ignore_index=True)
combined_data

Unnamed: 0,subject_id,race,marital_status,age,average_apsiii,avg_charlson_comorbidity_index,gender,dod,avg_ph,avg_body_weight,...,average_los_icu,avg_glucose,avg_heart_rate,avg_mbp,avg_resp_rate,avg_spo2,avg_temperature,avg_systolic_blood_pressure,avg_diastolic_blood_pressure,label
0,10000032,0,0,52,54.000000,3.500000,0,1,8.184711,42.231230,...,0.410266,115.000000,96.500000,62.300000,20.700000,96.300000,37.206667,97.533333,59.383333,1
1,10002930,1,2,48,34.500000,2.416667,0,1,7.260000,71.894332,...,0.907101,214.166667,91.590909,80.738095,15.840909,97.261905,36.940000,121.595238,69.452381,1
2,10007920,0,3,52,36.000000,2.846154,1,0,7.370000,88.882546,...,3.168738,120.166667,95.129870,96.893333,24.194805,95.457143,37.185238,137.095000,82.305417,1
3,10009021,1,2,47,46.178198,6.000000,1,0,7.845535,94.328287,...,3.824683,211.585838,93.060536,74.143461,19.629919,100.153052,38.218169,111.822852,81.801660,1
4,10010848,0,2,47,22.612093,6.000000,1,0,8.281210,88.173245,...,1.363862,74.918331,83.777682,74.081028,16.743214,97.665466,36.806581,124.230769,77.461538,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4839,16257560,0,2,73,48.454545,7.241379,1,1,7.302609,79.718794,...,2.922511,118.822581,80.547816,68.132952,21.849348,97.379135,36.322950,108.526923,57.793681,0
4840,19822093,0,2,53,60.000000,6.437500,1,1,7.412778,82.652778,...,3.200972,172.828402,90.749196,87.651316,20.964630,96.977492,37.150465,122.520566,68.892250,0
4841,18136887,0,1,49,35.000000,1.000000,0,0,7.308571,64.405278,...,1.299537,138.000000,87.371429,76.424242,19.685714,97.676471,36.555556,117.398465,73.978985,0
4842,16185428,0,1,60,31.000000,4.200000,1,0,7.394211,125.240078,...,3.672650,169.640000,79.804688,101.843284,20.128000,95.272727,36.903846,145.860563,83.863720,0


In [25]:
shuffled_data = combined_data.sample(frac=1, random_state=90089).reset_index(drop=True)
shuffled_data

Unnamed: 0,subject_id,race,marital_status,age,average_apsiii,avg_charlson_comorbidity_index,gender,dod,avg_ph,avg_body_weight,...,average_los_icu,avg_glucose,avg_heart_rate,avg_mbp,avg_resp_rate,avg_spo2,avg_temperature,avg_systolic_blood_pressure,avg_diastolic_blood_pressure,label
0,13428830,1,2,27,69.129947,3.50,1,0,7.579794,84.368112,...,8.698811,212.420897,84.078109,80.336150,20.433565,97.324558,37.209771,120.212400,60.297415,1
1,18705296,4,3,62,29.041290,2.00,0,0,7.530801,91.318584,...,15.503000,115.728878,64.334094,77.775654,19.515456,99.150147,38.154978,120.171743,76.107346,1
2,13877204,0,1,77,59.500000,6.60,1,1,7.510000,76.802802,...,2.109653,171.400000,92.711864,77.686869,26.871795,96.081818,37.273824,123.007891,63.546987,0
3,17084065,5,2,59,93.613420,11.00,1,1,8.094982,76.420621,...,2.151541,113.371148,93.066379,64.185296,17.033613,97.688468,37.682519,146.000000,77.000000,1
4,10427288,3,0,73,55.000000,9.00,0,0,7.257143,112.725550,...,2.231053,203.611111,78.218182,73.528302,18.290909,97.490909,37.016000,141.018519,58.601852,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4839,11379247,0,2,58,41.614788,2.00,1,0,7.378847,81.306366,...,1.727742,84.133797,80.254289,72.781258,16.946771,97.151846,37.678755,118.000000,74.500000,1
4840,13308165,3,1,77,46.000000,7.00,1,0,7.330000,92.850282,...,1.194873,195.166667,88.000000,75.342857,18.718750,96.290323,36.653750,113.450000,74.417857,0
4841,15988126,3,1,65,51.000000,2.00,0,0,7.420000,96.796533,...,1.592292,129.166667,106.742424,71.863636,17.062500,95.846154,36.711000,111.390152,71.939394,0
4842,14849286,0,1,62,68.000000,5.25,1,1,7.390000,82.700208,...,1.885104,110.200000,97.419355,68.736842,16.370968,96.419355,36.833077,115.662281,65.980601,0


In [26]:
feature_columns = shuffled_data.columns.drop(['label', 'subject_id']).tolist()

X = shuffled_data[feature_columns]
y = shuffled_data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=90089)

model = LogisticRegression()
model.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [27]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
print('Confusion Matrix:')
print(conf_matrix)
print('Classification Report:')
print(class_report)

Accuracy: 0.77
Confusion Matrix:
[[371 115]
 [110 373]]
Classification Report:
              precision    recall  f1-score   support

           0       0.77      0.76      0.77       486
           1       0.76      0.77      0.77       483

    accuracy                           0.77       969
   macro avg       0.77      0.77      0.77       969
weighted avg       0.77      0.77      0.77       969

