In [115]:
# Define Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

# For Regression Problems
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

# Data Preprocessing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder, OrdinalEncoder
from feature_engine.outliers import Winsorizer

# Evaluate Regression Models
from sklearn.metrics import classification_report

In [65]:
df = pd.read_csv(r'E:\hacktiv8\ftds009\dataset\healthcare-dataset-stroke-data.csv')

In [66]:
df.head()

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
0,9046,Male,67.0,0,1,Yes,Private,Urban,228.69,36.6,formerly smoked,1
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
2,31112,Male,80.0,0,1,Yes,Private,Rural,105.92,32.5,never smoked,1
3,60182,Female,49.0,0,0,Yes,Private,Urban,171.23,34.4,smokes,1
4,1665,Female,79.0,1,0,Yes,Self-employed,Rural,174.12,24.0,never smoked,1


In [67]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5110 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5110 non-null   int64  
 1   gender             5110 non-null   object 
 2   age                5110 non-null   float64
 3   hypertension       5110 non-null   int64  
 4   heart_disease      5110 non-null   int64  
 5   ever_married       5110 non-null   object 
 6   work_type          5110 non-null   object 
 7   Residence_type     5110 non-null   object 
 8   avg_glucose_level  5110 non-null   float64
 9   bmi                4909 non-null   float64
 10  smoking_status     5110 non-null   object 
 11  stroke             5110 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 479.2+ KB


In [68]:
df[df['bmi'].isna()==True]

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
1,51676,Female,61.0,0,0,Yes,Self-employed,Rural,202.21,,never smoked,1
8,27419,Female,59.0,0,0,Yes,Private,Rural,76.15,,Unknown,1
13,8213,Male,78.0,0,1,Yes,Private,Urban,219.84,,Unknown,1
19,25226,Male,57.0,0,1,No,Govt_job,Urban,217.08,,Unknown,1
27,61843,Male,58.0,0,0,Yes,Private,Rural,189.84,,Unknown,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5039,42007,Male,41.0,0,0,No,Private,Rural,70.15,,formerly smoked,0
5048,28788,Male,40.0,0,0,Yes,Private,Urban,191.15,,smokes,0
5093,32235,Female,45.0,1,0,Yes,Govt_job,Rural,95.02,,smokes,0
5099,7293,Male,40.0,0,0,Yes,Private,Rural,83.94,,smokes,0


In [69]:
df.nunique()

id                   5110
gender                  3
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3979
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [70]:
df['gender'].value_counts()

Female    2994
Male      2115
Other        1
Name: gender, dtype: int64

In [71]:
df[df['gender']=='Other']

Unnamed: 0,id,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status,stroke
3116,56156,Other,26.0,0,0,No,Private,Rural,143.33,22.4,formerly smoked,0


In [72]:
df.drop(index=df[df['gender']=='Other'].index,inplace=True)

In [73]:
df['gender'].value_counts()

Female    2994
Male      2115
Name: gender, dtype: int64

In [74]:
df.columns

Index(['id', 'gender', 'age', 'hypertension', 'heart_disease', 'ever_married',
       'work_type', 'Residence_type', 'avg_glucose_level', 'bmi',
       'smoking_status', 'stroke'],
      dtype='object')

In [75]:
df.nunique()

id                   5109
gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3978
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [76]:
df.isna().sum()

id                     0
gender                 0
age                    0
hypertension           0
heart_disease          0
ever_married           0
work_type              0
Residence_type         0
avg_glucose_level      0
bmi                  201
smoking_status         0
stroke                 0
dtype: int64

In [77]:
df.describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,5109.0,5109.0,5109.0,5109.0,5109.0,4908.0,5109.0
mean,36513.985516,43.229986,0.097475,0.054022,106.140399,28.89456,0.048738
std,21162.008804,22.613575,0.296633,0.226084,45.285004,7.85432,0.21534
min,67.0,0.08,0.0,0.0,55.12,10.3,0.0
25%,17740.0,25.0,0.0,0.0,77.24,23.5,0.0
50%,36922.0,45.0,0.0,0.0,91.88,28.1,0.0
75%,54643.0,61.0,0.0,0.0,114.09,33.1,0.0
max,72940.0,82.0,1.0,1.0,271.74,97.6,1.0


In [78]:
df['smoking_status'].value_counts()

never smoked       1892
Unknown            1544
formerly smoked     884
smokes              789
Name: smoking_status, dtype: int64

In [79]:
df[df['bmi'].isna()==True].describe()

Unnamed: 0,id,age,hypertension,heart_disease,avg_glucose_level,bmi,stroke
count,201.0,201.0,201.0,201.0,201.0,0.0,201.0
mean,23171.109453,52.049154,0.233831,0.164179,126.724627,,0.199005
std,20882.48312,22.276181,0.424323,0.371363,59.240322,,0.400249
min,67.0,0.48,0.0,0.0,57.52,,0.0
25%,4062.0,37.0,0.0,0.0,81.43,,0.0
50%,18234.0,58.0,0.0,0.0,99.87,,0.0
75%,37937.0,71.0,0.0,0.0,191.79,,0.0
max,72231.0,82.0,1.0,1.0,260.85,,1.0


In [80]:
df['bmi'].skew()

1.0550629490426457

In [81]:
df['bmi'].fillna(df['bmi'].median(),inplace=True)

In [82]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5109 entries, 0 to 5109
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 5109 non-null   int64  
 1   gender             5109 non-null   object 
 2   age                5109 non-null   float64
 3   hypertension       5109 non-null   int64  
 4   heart_disease      5109 non-null   int64  
 5   ever_married       5109 non-null   object 
 6   work_type          5109 non-null   object 
 7   Residence_type     5109 non-null   object 
 8   avg_glucose_level  5109 non-null   float64
 9   bmi                5109 non-null   float64
 10  smoking_status     5109 non-null   object 
 11  stroke             5109 non-null   int64  
dtypes: float64(3), int64(4), object(5)
memory usage: 518.9+ KB


In [83]:
df.nunique()

id                   5109
gender                  2
age                   104
hypertension            2
heart_disease           2
ever_married            2
work_type               5
Residence_type          2
avg_glucose_level    3978
bmi                   418
smoking_status          4
stroke                  2
dtype: int64

In [84]:
df.set_index('id',inplace=True)

In [85]:
df['age'].describe()

count    5109.000000
mean       43.229986
std        22.613575
min         0.080000
25%        25.000000
50%        45.000000
75%        61.000000
max        82.000000
Name: age, dtype: float64

In [86]:
n = 34

In [87]:
df_inf = df.sample(3, random_state=n)

In [88]:
df_inf.index

Int64Index([65429, 55337, 48648], dtype='int64', name='id')

In [89]:
df_train_test = df.drop(index=df_inf.index)

In [90]:
y = df_train_test['stroke']

In [91]:
X = df_train_test.drop(columns='stroke')

In [92]:
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=n)

In [93]:
X_train

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
14993,Male,5.0,0,0,No,children,Rural,67.28,17.7,Unknown
35846,Female,43.0,1,0,No,Self-employed,Rural,217.30,27.5,never smoked
41501,Female,47.0,0,0,Yes,Govt_job,Urban,122.32,23.9,Unknown
62914,Male,62.0,0,0,Yes,Private,Rural,60.39,26.9,Unknown
61408,Male,23.0,0,0,No,Never_worked,Urban,125.26,18.7,never smoked
...,...,...,...,...,...,...,...,...,...,...
30171,Male,27.0,0,0,No,Govt_job,Urban,95.10,24.3,formerly smoked
26528,Female,17.0,0,0,No,Private,Rural,88.65,30.3,never smoked
51693,Female,52.0,0,0,Yes,Private,Rural,173.90,35.8,never smoked
14372,Male,50.0,0,0,Yes,Self-employed,Urban,192.16,43.6,never smoked


In [94]:
#deteksi outlier untuk distribusi normal
def find_normal_boundaries(dataframe, variable):
    upper_boundary = dataframe[variable].mean() + 3 * dataframe[variable].std()
    lower_boundary = dataframe[variable].mean() - 3 * dataframe[variable].std()

    return upper_boundary, lower_boundary

#deteksi outlier untuk distribusi skewed
def find_skewed_boundaries(dataframe, variable):
    Q1 = dataframe[variable].quantile(0.25)
    Q3 = dataframe[variable].quantile(0.75)
    IQR = Q3 - Q1
    upper_boundary = Q3 + 1.5 * IQR
    lower_boundary = Q1 - 1.5 * IQR

    return upper_boundary, lower_boundary

In [95]:
X_train.head()

Unnamed: 0_level_0,gender,age,hypertension,heart_disease,ever_married,work_type,Residence_type,avg_glucose_level,bmi,smoking_status
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
14993,Male,5.0,0,0,No,children,Rural,67.28,17.7,Unknown
35846,Female,43.0,1,0,No,Self-employed,Rural,217.3,27.5,never smoked
41501,Female,47.0,0,0,Yes,Govt_job,Urban,122.32,23.9,Unknown
62914,Male,62.0,0,0,Yes,Private,Rural,60.39,26.9,Unknown
61408,Male,23.0,0,0,No,Never_worked,Urban,125.26,18.7,never smoked


In [96]:
cat_cols = ['gender','hypertension','heart_disease','ever_married','work_type','Residence_type','smoking_status']

In [97]:
num_cols = X_train.drop(columns=cat_cols).columns

In [98]:
X_train_num = X_train[num_cols]
X_train_cat = X_train[cat_cols]

X_test_num = X_test[num_cols]
X_test_cat = X_test[cat_cols]

In [99]:
#loop untuk pembuatan summary outlier
distribution = []
skewness = []
upper_limit = []
lower_limit = []
outliers_percent = []

for column in X_train_num.columns:
    col_skewness = X_train_num[column].skew()

    if (col_skewness >= -0.5) & (col_skewness <= 0.5):
        col_distribution = 'Normal'
        col_upper_limit, col_lower_limit = find_normal_boundaries(X_train_num,column)      

    else:
        col_distribution = 'Skewed'
        col_upper_limit, col_lower_limit = find_skewed_boundaries(X_train_num,column)
    
    col_outliers_percent = (len(X_train_num[X_train_num[column] > col_upper_limit]) + len(X_train_num[X_train_num[column] < col_lower_limit])) / len(X_train_num) * 100
    
    distribution.append(col_distribution)
    skewness.append(col_skewness)
    upper_limit.append(col_upper_limit)
    lower_limit.append(col_lower_limit)
    outliers_percent.append(col_outliers_percent)

outliers_summary = {'distribution_type':distribution, 'skewnewss':skewness, 'upper_limit':upper_limit, 'lower_limit':lower_limit,'outliers_percent':outliers_percent}
outliers_summary = pd.DataFrame(outliers_summary,index=X_train_num.columns)

In [100]:
X_train_num.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,3829.0,3829.0,3829.0
mean,43.243771,107.165427,28.857169
std,22.60908,46.232801,7.67175
min,0.08,55.12,10.3
25%,25.0,77.46,23.8
50%,45.0,92.64,28.1
75%,61.0,115.13,32.8
max,82.0,267.76,97.6


In [101]:
outliers_summary

Unnamed: 0,distribution_type,skewnewss,upper_limit,lower_limit,outliers_percent
age,Normal,-0.139126,111.07101,-24.583468,0.0
avg_glucose_level,Skewed,1.526533,171.635,20.955,12.666493
bmi,Skewed,1.13571,46.3,10.3,2.246017


In [119]:
#capping IQR
capping = Winsorizer(capping_method='iqr',
                                tail='both',
                                fold=1.5,
                                variables=['avg_glucose_level','bmi'])

capping.fit(X_train_num)

X_train_num_capped = capping.transform(X_train_num)
X_test_num_capped = capping.transform(X_test_num)

In [105]:
X_train_num.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,3829.0,3829.0,3829.0
mean,43.243771,107.165427,28.857169
std,22.60908,46.232801,7.67175
min,0.08,55.12,10.3
25%,25.0,77.46,23.8
50%,45.0,92.64,28.1
75%,61.0,115.13,32.8
max,82.0,267.76,97.6


In [111]:
X_train_num_capped.describe()

Unnamed: 0,age,avg_glucose_level,bmi
count,3829.0,3829.0,3829.0
mean,43.243771,101.932746,28.691904
std,22.60908,34.19894,7.091036
min,0.08,55.12,10.3
25%,25.0,77.46,23.8
50%,45.0,92.64,28.1
75%,61.0,115.13,32.8
max,82.0,171.635,46.3


In [107]:
outliers_summary

Unnamed: 0,distribution_type,skewnewss,upper_limit,lower_limit,outliers_percent
age,Normal,-0.139126,111.07101,-24.583468,0.0
avg_glucose_level,Skewed,1.526533,171.635,20.955,12.666493
bmi,Skewed,1.13571,46.3,10.3,2.246017


In [120]:
scaler = StandardScaler()
scaler.fit(X_train_num_capped)

X_train_num_scaled = scaler.transform(X_train_num_capped)
X_test_num_scaled = scaler.transform(X_test_num_capped)

In [135]:
cat_oh_cols = ['work_type','smoking_status']
cat_ord_cols = [x for x in cat_cols if x not in cat_oh_cols]

In [141]:
encoder = OrdinalEncoder()
encoder.fit(X_train_cat[cat_ord_cols])

X_train_ord = encoder.transform(X_train_cat[cat_ord_cols])
X_test_ord = encoder.transform(X_test_cat[cat_ord_cols])

In [151]:
oh_encoder = OneHotEncoder()
oh_encoder.fit(X_train_cat[cat_oh_cols])

X_train_oh = oh_encoder.transform(X_train_cat[cat_oh_cols]).toarray()
X_test_oh = oh_encoder.transform(X_test_cat[cat_oh_cols]).toarray()

In [152]:
X_train_num_scaled.shape, X_train_ord.shape, X_train_oh.shape

((3829, 3), (3829, 5), (3829, 9))

In [186]:
X_train_final = np.concatenate([X_train_num_scaled,X_train_ord,X_train_oh],axis=1)

In [191]:
X_test_final = np.concatenate([X_test_num_scaled,X_test_ord,X_test_oh],axis=1)

In [161]:
num_cols

Index(['age', 'avg_glucose_level', 'bmi'], dtype='object')

In [163]:
cat_ord_cols


['gender', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type']

In [164]:
cat_oh_cols

['work_type', 'smoking_status']

In [173]:
num_cols.tolist()

['age', 'avg_glucose_level', 'bmi']

In [171]:
cat_ord_cols

['gender', 'hypertension', 'heart_disease', 'ever_married', 'Residence_type']

In [174]:
#buat label coef
i = 0
coef_name = []
coef_label = [] + num_cols.tolist() + cat_ord_cols
while i < len(oh_encoder.categories_):
    coef_name = oh_encoder.categories_[i].flatten().tolist()
    coef_label+= coef_name
    i+=1

In [175]:
coef_label

['age',
 'avg_glucose_level',
 'bmi',
 'gender',
 'hypertension',
 'heart_disease',
 'ever_married',
 'Residence_type',
 'Govt_job',
 'Never_worked',
 'Private',
 'Self-employed',
 'children',
 'Unknown',
 'formerly smoked',
 'never smoked',
 'smokes']

In [180]:
len(coef_label)

17

In [183]:
type(coef_label)

list

In [184]:
X_train_final

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16
0,-1.691744,-1.013402,-1.550315,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,-0.010783,2.038407,-0.168108,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.166160,0.596215,-0.675857,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.829697,-1.214896,-0.252733,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.895499,0.682194,-1.409273,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3824,-0.718556,-0.199820,-0.619441,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3825,-1.160914,-0.388447,0.226808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3826,0.387339,2.038407,1.002537,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3827,0.298867,2.038407,2.102661,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [187]:
X_train_final = pd.DataFrame(X_train_final,columns=coef_label)

In [192]:
X_test_final = pd.DataFrame(X_test_final,columns=coef_label)

In [188]:
X_train_final

Unnamed: 0,age,avg_glucose_level,bmi,gender,hypertension,heart_disease,ever_married,Residence_type,Govt_job,Never_worked,Private,Self-employed,children,Unknown,formerly smoked,never smoked,smokes
0,-1.691744,-1.013402,-1.550315,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0
1,-0.010783,2.038407,-0.168108,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,0.166160,0.596215,-0.675857,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
3,0.829697,-1.214896,-0.252733,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
4,-0.895499,0.682194,-1.409273,1.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3824,-0.718556,-0.199820,-0.619441,1.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3825,-1.160914,-0.388447,0.226808,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3826,0.387339,2.038407,1.002537,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0
3827,0.298867,2.038407,2.102661,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0


In [202]:
for x in [0.1,1,10]:
    model_logreg = LogisticRegression(solver='liblinear',C=x)

    model_logreg.fit(X_train_final,y_train)

    y_train_pred = model_logreg.predict(X_train_final)
    y_test_pred = model_logreg.predict(X_test_final)

    print(x)
    print(classification_report(y_train,y_train_pred))
    print(classification_report(y_test,y_test_pred))
    print('-------------------')

0.1
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3639
           1       0.00      0.00      0.00       190

    accuracy                           0.95      3829
   macro avg       0.48      0.50      0.49      3829
weighted avg       0.90      0.95      0.93      3829

              precision    recall  f1-score   support

           0       0.95      1.00      0.98      1218
           1       0.00      0.00      0.00        59

    accuracy                           0.95      1277
   macro avg       0.48      0.50      0.49      1277
weighted avg       0.91      0.95      0.93      1277

-------------------
1
              precision    recall  f1-score   support

           0       0.95      1.00      0.97      3639
           1       0.00      0.00      0.00       190

    accuracy                           0.95      3829
   macro avg       0.48      0.50      0.49      3829
weighted avg       0.90      0.95      0.93     

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [197]:
y_train.value_counts()

0    3639
1     190
Name: stroke, dtype: int64

In [199]:
model_logreg.coef_

array([[ 1.7189954 ,  0.18915545,  0.01152845, -0.02819052,  0.46297973,
         0.31818877, -0.33292978,  0.13441269, -0.68172508, -0.50618399,
        -0.49491042, -0.79561424, -0.22640414, -0.60240487, -0.57637821,
        -0.76125883, -0.76479595]])

# KNN

In [204]:
knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train_final,y_train)
y_test_pred = knn.predict(X_test_final)
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       0.95      0.99      0.97      1218
           1       0.00      0.00      0.00        59

    accuracy                           0.94      1277
   macro avg       0.48      0.49      0.49      1277
weighted avg       0.91      0.94      0.93      1277



# NB

In [205]:
nb = GaussianNB()
nb.fit(X_train_final,y_train)
y_test_pred = nb.predict(X_test_final)
print(classification_report(y_test,y_test_pred))

              precision    recall  f1-score   support

           0       1.00      0.32      0.48      1218
           1       0.07      0.98      0.12        59

    accuracy                           0.35      1277
   macro avg       0.53      0.65      0.30      1277
weighted avg       0.95      0.35      0.47      1277

