In [2]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [4]:
# read the admissions table
df_adm = pd.read_csv('/content/drive/My Drive/ADMISSIONS.csv', parse_dates = ['ADMITTIME', 'DISCHTIME',
       'DEATHTIME'], infer_datetime_format = True, engine = "c")

In [5]:
df_adm = df_adm[['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME',
       'DEATHTIME', 'ADMISSION_TYPE']]

In [6]:
df_adm.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58976 entries, 0 to 58975
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   SUBJECT_ID      58976 non-null  int64         
 1   HADM_ID         58976 non-null  int64         
 2   ADMITTIME       58976 non-null  datetime64[ns]
 3   DISCHTIME       58976 non-null  datetime64[ns]
 4   DEATHTIME       5854 non-null   datetime64[ns]
 5   ADMISSION_TYPE  58976 non-null  object        
dtypes: datetime64[ns](3), int64(2), object(1)
memory usage: 2.7+ MB


In [7]:
df_adm = df_adm.sort_values(['SUBJECT_ID','ADMITTIME'])
df_adm = df_adm.reset_index(drop = True) # avoid old index become a new column

In [8]:
#df_adm.head(50)

In [9]:
df_adm['NEXT_ADMITTIME'] = df_adm.groupby('SUBJECT_ID').ADMITTIME.shift(-1) # shift won't change the original column, but add column by shifting

In [10]:
df_adm['NEXT_ADMISSION_TYPE'] = df_adm.groupby('SUBJECT_ID').ADMISSION_TYPE.shift(-1)

In [11]:
pd.set_option('display.max_rows', 500)
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,


In [12]:
rows = df_adm.NEXT_ADMISSION_TYPE == 'ELECTIVE'
df_adm.loc[rows,'NEXT_ADMITTIME'] = pd.NaT
df_adm.loc[rows,'NEXT_ADMISSION_TYPE'] = np.NaN

In [None]:
df_adm.head(200)

In [14]:
df_adm = df_adm.sort_values(['SUBJECT_ID','ADMITTIME'])
# back fill (this will take a little while)
df_adm[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']] = df_adm.groupby(['SUBJECT_ID'])[['NEXT_ADMITTIME','NEXT_ADMISSION_TYPE']].fillna(method = 'bfill')

In [15]:
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,


Time difference between two admissions in days:

In [16]:
df_adm['DAYS_NEXT_ADMIT']=  (df_adm.NEXT_ADMITTIME - df_adm.DISCHTIME).dt.total_seconds()/(24*60*60)

In [17]:
df_adm.head(200)

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT
0,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NaT,,
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,,
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,,
3,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NaT,,
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,,
5,7,118037,2121-05-23 15:05:00,2121-05-27 11:57:00,NaT,NEWBORN,NaT,,
6,8,159514,2117-11-20 10:22:00,2117-11-24 14:20:00,NaT,NEWBORN,NaT,,
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,,
8,10,184167,2103-06-28 11:36:00,2103-07-06 12:10:00,NaT,NEWBORN,NaT,,
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,,


In [18]:
use_cols = ['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT']

In [19]:
df_notes = pd.read_csv("/content/drive/My Drive/NOTEEVENTS.csv",usecols= use_cols, low_memory = False, engine = "c")

In [20]:
df_notes.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,CATEGORY,TEXT
0,22532,167853.0,Discharge summary,Admission Date: [**2151-7-16**] Dischar...
1,13702,107527.0,Discharge summary,Admission Date: [**2118-6-2**] Discharg...
2,13702,167118.0,Discharge summary,Admission Date: [**2119-5-4**] D...
3,13702,196489.0,Discharge summary,Admission Date: [**2124-7-21**] ...
4,26880,135453.0,Discharge summary,Admission Date: [**2162-3-3**] D...


In [21]:
df_notes.shape

(2083180, 4)

Find the last discharge summary from each patients' all admission

In [22]:

df_notes_dis_sum = df_notes.loc[df_notes.CATEGORY == 'Discharge summary']

In [23]:
df_notes_dis_sum.shape

(59652, 4)

In [24]:
del df_notes

In [25]:
df_notes_dis_sum_last = (df_notes_dis_sum.groupby(['SUBJECT_ID','HADM_ID']).nth(-1)).reset_index()

In [26]:
df_notes_dis_sum_last.shape

(52726, 4)

In [27]:
df_notes_dis_sum_last.duplicated(['HADM_ID']).sum()

0

In [28]:
df_adm.shape

(58976, 9)

Merging two tables based on Subject_ID and HADM_ID

Joining two tables:

In [29]:
df_adm_notes = pd.merge(df_adm, df_notes_dis_sum_last, on = ['SUBJECT_ID','HADM_ID'],how = 'left')

In [30]:
df_adm_notes.shape

(58976, 11)

In [31]:
df_adm_notes.TEXT.isnull().sum() / len(df_adm_notes) # percentage of text.NA

0.1059753119913185

In [32]:

df_adm_notes.groupby('ADMISSION_TYPE').apply(lambda g: g.TEXT.isnull().sum())

ADMISSION_TYPE
ELECTIVE      375
EMERGENCY    1598
NEWBORN      4220
URGENT         57
dtype: int64

In [33]:
df_adm_notes.groupby('ADMISSION_TYPE').size()

ADMISSION_TYPE
ELECTIVE      7706
EMERGENCY    42071
NEWBORN       7863
URGENT        1336
dtype: int64

In [34]:
df_adm_notes.groupby('ADMISSION_TYPE').apply(lambda g: g.TEXT.isnull().sum())/df_adm_notes.groupby('ADMISSION_TYPE').size()

ADMISSION_TYPE
ELECTIVE     0.048663
EMERGENCY    0.037983
NEWBORN      0.536691
URGENT       0.042665
dtype: float64

In [35]:
df_adm_notes_clean = df_adm_notes.loc[df_adm_notes.ADMISSION_TYPE != "NEWBORN"]

In [36]:
df_adm_notes_clean['OUTPUT_LABEL'] = (df_adm_notes_clean.DAYS_NEXT_ADMIT < 30).astype('int')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [37]:
df_adm_notes_clean.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT,CATEGORY,TEXT,OUTPUT_LABEL
1,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2101-10-20**] Discharg...,0
2,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2191-3-16**] Discharge...,0
4,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,NaT,,,Discharge summary,Admission Date: [**2175-5-30**] Dischar...,0
7,9,150750,2149-11-09 13:06:00,2149-11-14 10:15:00,2149-11-14 10:15:00,EMERGENCY,NaT,,,Discharge summary,"Name: [**Known lastname 10050**], [**Known fi...",0
9,11,194540,2178-04-16 06:18:00,2178-05-11 19:00:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2178-4-16**] ...,0


In [38]:
#df_adm_notes_clean['CATEGORY']

In [39]:
df_adm_notes_clean.OUTPUT_LABEL.value_counts()

0    48109
1     3004
Name: OUTPUT_LABEL, dtype: int64

In [40]:
df_adm_notes_clean.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51113 entries, 1 to 58975
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype         
---  ------               --------------  -----         
 0   SUBJECT_ID           51113 non-null  int64         
 1   HADM_ID              51113 non-null  int64         
 2   ADMITTIME            51113 non-null  datetime64[ns]
 3   DISCHTIME            51113 non-null  datetime64[ns]
 4   DEATHTIME            5792 non-null   datetime64[ns]
 5   ADMISSION_TYPE       51113 non-null  object        
 6   NEXT_ADMITTIME       11169 non-null  datetime64[ns]
 7   NEXT_ADMISSION_TYPE  11169 non-null  object        
 8   DAYS_NEXT_ADMIT      11169 non-null  float64       
 9   CATEGORY             49083 non-null  object        
 10  TEXT                 49083 non-null  object        
 11  OUTPUT_LABEL         51113 non-null  int64         
dtypes: datetime64[ns](4), float64(1), int64(3), object(4)
memory usage: 5.1+ MB


In [41]:
df_adm_notes_clean.to_pickle('/content/drive/My Drive/Prepared_Data.pkl')

Top 30 common words in raw corpus?

Shuffling the samples

In [42]:
f_adm_notes_clean = df_adm_notes_clean.sample(n = len(df_adm_notes_clean), random_state = 42)

In [43]:
df_adm_notes_clean = df_adm_notes_clean.reset_index(drop = True)

Save 20% of the data for validation:

In [44]:
df_valid = df_adm_notes_clean.sample(frac=0.2,random_state=42)

In [45]:
df_valid.to_pickle('/content/drive/My Drive/df_valid.pkl')

In [46]:
df_train_all=df_adm_notes_clean.drop(df_valid.index)

In [47]:
rows_pos = df_train_all.OUTPUT_LABEL == 1

In [48]:
~rows_pos 

0        True
1        True
2        True
3        True
5        True
         ... 
51108    True
51109    True
51110    True
51111    True
51112    True
Name: OUTPUT_LABEL, Length: 40890, dtype: bool

In [49]:
df_train_pos = df_train_all.loc[rows_pos]
df_train_neg = df_train_all.loc[~rows_pos]

In [50]:
#downsampling : make len(neg) = len(pos)
df_train = pd.concat([df_train_pos, df_train_neg.sample(n = len(df_train_pos), random_state = 42)],axis = 0)

In [51]:
df_train = df_train.sample(n = len(df_train), random_state = 42).reset_index(drop = True)

In [52]:
#keep multiple records
df_train.to_pickle('df_train.pkl')

In [53]:
df_train.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,NEXT_ADMITTIME,NEXT_ADMISSION_TYPE,DAYS_NEXT_ADMIT,CATEGORY,TEXT,OUTPUT_LABEL
0,15453,168028,2110-12-07 23:49:00,2110-12-13 10:56:00,NaT,EMERGENCY,2110-12-14 22:44:00,EMERGENCY,1.491667,Discharge summary,Admission Date: [**2110-12-7**] Dischar...,1
1,31972,152547,2163-12-22 17:23:00,2163-12-23 04:08:00,2163-12-23 04:08:00,EMERGENCY,2163-12-23 04:08:00,EMERGENCY,0.0,Discharge summary,Admission Date: [**2163-12-22**] ...,1
2,20928,113718,2114-02-23 10:30:00,2114-03-15 14:00:00,NaT,ELECTIVE,NaT,,,Discharge summary,Admission Date: [**2114-2-23**] ...,0
3,98905,110019,2184-09-24 07:15:00,2184-09-24 20:22:00,NaT,ELECTIVE,NaT,,,,,0
4,19819,151971,2173-02-11 20:56:00,2173-02-19 15:25:00,NaT,EMERGENCY,NaT,,,Discharge summary,Admission Date: [**2173-2-11**] ...,0


Preprocess for bag-of-words (BoF):

In [54]:
df_train['TEXT'][0]

"Admission Date:  [**2110-12-7**]       Discharge Date:  [**2110-12-13**]\n\nDate of Birth:   [**2061-12-15**]       Sex:  M\n\nService:\n\nHISTORY OF PRESENT ILLNESS:  The patient is a 48 year old\nmale with a history of hepatitis C virus and hepatocellular\ncancer status post chemoembo in [**5-6**] who presented to\n[**Hospital3 4298**] E.D. on [**2110-12-7**] with hematemesis since\n[**2110-12-5**].  He had one episode of hematemesis on [**12-5**]/ and two\nepisodes on [**12-6**] as well as hematemesis and hematochezia on\n[**12-7**].  At [**Hospital3 4298**] E.D. patient was noted to have a\nhematocrit of 18 and was transfused two units of packed red\nblood cells and was noted to be having coagulopathy with INR\nof 2.1 and was given FFP as well as vitamin K.  Patient\ndenies any past history of varices or upper GI bleed.  He\ndenies recent alcohol or drug use.  He was feeling nauseated\nand began vomiting and continued to have hematemesis for\nthree days until he was transferred to

In [55]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [56]:
from nltk.stem.snowball import SnowballStemmer

In [57]:
ss = SnowballStemmer('english')
all_stopwords = stopwords.words('english')

In [58]:
'not' in all_stopwords

True

In [59]:
all_stopwords.remove('not')

In [60]:
def clean_text (x):
  x = re.sub(r'[^A-Za-z]', ' ' , x).lower()
  x = x.split()
  new_list = []
  for word in x:
    if word not in all_stopwords:
      word = ss.stem(word)
      new_list.append(word)
  return " ".join(new_list)

In [61]:
 #clean_text(df_train.TEXT) won't work
 clean_text(df_train.TEXT[0])

'admiss date discharg date date birth sex servic histori present ill patient year old male histori hepat c virus hepatocellular cancer status post chemoembo present hospit e hematemesi sinc one episod hematemesi two episod well hematemesi hematochezia hospit e patient note hematocrit transfus two unit pack red blood cell note coagulopathi inr given ffp well vitamin k patient deni past histori varic upper gi bleed deni recent alcohol drug use feel nauseat began vomit continu hematemesi three day transfer hospit definit care patient transfus total five unit pack red blood cell given vitamin k counteract coagulopathi underw egd band sclerotherapi grade lower esophag varic lower mid esophagus patient also abdomin ct followup chemoembo show portal vein thrombosi present patient deni short breath chest pain headach nausea vomit fever chill state feel much better past medic histori signific hepat c virus alcohol cirrhosi diagnos duoden ulcer status post perfor repair diagnos hepatocellular ca

In [62]:
# convert all data under TEXT into string 

In [63]:
df_train['TEXT'] = df_train['TEXT'].apply(str)

In [64]:
df_valid['TEXT'] = df_valid['TEXT'].apply(str)

In [65]:
# lambda apply clean_data function

In [66]:
df_train['TEXT'] = df_train.TEXT.apply(lambda x : clean_text(x))

In [67]:
df_valid['TEXT'] = df_valid.TEXT.apply(lambda x : clean_text(x))

In [None]:
df_valid.head()


In [None]:
df_train.head()

In [174]:
X_train = df_train.loc [:, 'TEXT']
y_train = df_train.loc [:, 'OUTPUT_LABEL']
X_test = df_valid.loc [:, 'TEXT']
y_test = df_valid.loc [:, 'OUTPUT_LABEL']

In [172]:
my_new_stop_words = ['admiss','discharg','date']

In [154]:
#Creat bag of word model

In [175]:
# define max features(words); ngram: to recognize two words as a one, like heart-attack
from sklearn.feature_extraction.text import CountVectorizer 
cv = CountVectorizer(max_features = 4000, 
                     stop_words = my_new_stop_words,
                     ngram_range = (1, 2))

X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

Find top 30 common words in training corpus

In [176]:
x = cv.vocabulary_
x_sort = {k: v for k, v in sorted(x.items(), key=lambda item: item[1],reverse=True)}

In [177]:
from nltk.probability import FreqDist
fdist = FreqDist(x_sort)
top_thirty = fdist.most_common(30)
top_thirty

[('zosyn', 3999),
 ('zofran', 3998),
 ('yrs ago', 3997),
 ('yrs', 3996),
 ('yo male', 3995),
 ('yo', 3994),
 ('yesterday', 3993),
 ('yellow appear', 3992),
 ('yellow', 3991),
 ('yeast none', 3990),
 ('yeast', 3989),
 ('year one', 3988),
 ('year old', 3987),
 ('year digit', 3986),
 ('year ago', 3985),
 ('year', 3984),
 ('xs pm', 3983),
 ('xs intubat', 3982),
 ('xs', 3981),
 ('xrt', 3980),
 ('xray', 3979),
 ('xl mg', 3978),
 ('xl', 3977),
 ('xii intact', 3976),
 ('xii', 3975),
 ('wwp', 3974),
 ('wrist', 3973),
 ('wound care', 3972),
 ('wound', 3971),
 ('would', 3970)]

In [75]:
#X_train.head()

Naive Bayes model

In [178]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [179]:
y_pred = classifier.predict(X_test.toarray())

In [180]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)

0.635655507662328

In [185]:
X_train = df_train.loc [:, 'TEXT']
y_train = df_train.loc [:, 'OUTPUT_LABEL']
X_test = df_valid.loc [:, 'TEXT']
y_test = df_valid.loc [:, 'OUTPUT_LABEL']

In [186]:
#fit transform will create inversed document frequency
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer(max_features= 5000,stop_words = my_new_stop_words,ngram_range = (1, 2))
X_train = cv.fit_transform(X_train)
X_test = cv.transform(X_test)

In [187]:
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train.toarray(), y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [188]:
y_pred = classifier.predict(X_test.toarray())

In [189]:
from sklearn import metrics
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred)
metrics.auc(fpr, tpr)

0.637225393530531

In [190]:
score1 = accuracy_score (y_test,y_pred)
score1

0.6111708891714761

Logistic regression 


In [191]:
# logistic regression
from sklearn.linear_model import LogisticRegression
clf=LogisticRegression(C = 0.0001, penalty = 'l2', random_state = 42)
clf.fit(X_train, y_train)

LogisticRegression(C=0.0001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [192]:
model = clf
y_train_preds = model.predict_proba(X_train)[:,1]
y_test_preds = model.predict_proba(X_test)[:,1]
# why [:,1], if not, the roc_curvey will be in bad shape

In [193]:
#y_test
#y_test_preds
y_pred

array([0, 1, 1, ..., 0, 1, 0])

In [194]:
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_auc_score

fpr_train, tpr_train, thresholds_train = roc_curve(y_train, y_train_preds)
fpr_valid, tpr_valid, thresholds_valid = roc_curve(y_test, y_test_preds)


thresh = 0.5

auc_train = roc_auc_score(y_train, y_train_preds)
auc_test = roc_auc_score(y_test, y_test_preds)

In [195]:
auc_test

0.6867229559182261

In [196]:
from sklearn.metrics import classification_report

print(classification_report(y_test, model.predict(X_test)))

              precision    recall  f1-score   support

           0       0.97      0.64      0.77      9635
           1       0.10      0.64      0.17       588

    accuracy                           0.64     10223
   macro avg       0.53      0.64      0.47     10223
weighted avg       0.92      0.64      0.73     10223

