In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [28]:
# Read the raw table from the CSV that sits next to the notebook.
csv_path = "Smoker_Epigenetic_df.csv"
data = pd.read_csv(csv_path)

In [29]:
# Remove the 'GSM' column so it is not part of the modelling frame.
data = data.drop(columns=['GSM'])

In [30]:
# Discard every row that contains at least one missing value.
data = data.dropna(axis="index", how="any")

In [31]:
from sklearn.preprocessing import LabelEncoder

# One fitted encoder is kept per column. Re-fitting a single shared encoder
# would overwrite the mapping learned for the previous column, making it
# impossible to tell afterwards which original label each integer stands for.
# Inspect e.g. label_encoders["Smoking Status"].classes_ to read the mapping.
label_encoders = {}

# Apply Label Encoding to each categorical column
for column in ["Gender","Smoking Status"]:
    label_encoder = LabelEncoder()
    data[column] = label_encoder.fit_transform(data[column])
    label_encoders[column] = label_encoder
# After the loop `label_encoder` is the encoder fitted on "Smoking Status",
# which matches the state the shared-encoder version left behind.

In [32]:
# Count how many rows fall into each encoded smoking-status class.
smoking_status = data["Smoking Status"]
smoking_status.value_counts()

0    428
1    193
Name: Smoking Status, dtype: int64

In [33]:
# Pairwise column correlations; show only the column for the encoded target.
correlation_matrix = data.corr()
correlation_matrix["Smoking Status"]

Smoking Status    1.000000
Gender           -0.093813
Age              -0.112701
cg00050873       -0.094152
cg00212031        0.075784
cg00213748       -0.086355
cg00214611        0.080138
cg00455876       -0.056613
cg01707559        0.043290
cg02004872        0.088749
cg02011394       -0.105648
cg02050847       -0.084765
cg02233190        0.106283
cg02494853        0.030415
cg02839557        0.086673
cg02842889        0.107668
cg03052502       -0.094867
cg03155755       -0.092161
cg03244189        0.075141
cg03443143       -0.088693
cg03683899        0.080832
cg03695421       -0.107407
cg03706273        0.030927
Name: Smoking Status, dtype: float64

In [34]:
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score

In [35]:
# Target vector: the encoded smoking status.
y = data['Smoking Status']
# Feature matrix: every remaining column.
X = data.drop(columns=['Smoking Status'])

In [36]:
from imblearn.over_sampling import RandomOverSampler
over_sampler = RandomOverSampler(random_state=42)
X, y = over_sampler.fit_resample(X, y)

In [37]:
from sklearn.decomposition import PCA

In [38]:
pca = PCA(n_components=5)
pca.fit(X)
xpca = pca.transform(X)
Xpca = pca.inverse_transform(xpca)
X=Xpca

In [39]:
acc_score = []
seeds = range(0,101)

for seed in seeds:
    print(seed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)
    xgb = XGBClassifier(n_estimators=100,enable_categorical=True)
    xgb.fit(X_train, y_train)
    preds = xgb.predict(X_test)
    acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100
    acc_score.append(acc_xgb)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100


In [40]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=acc_score.index(max(acc_score)))
xgb = XGBClassifier(n_estimators=100,enable_categorical=True)
xgb.fit(X_train, y_train)
preds = xgb.predict(X_test)
acc_xgb = (preds == y_test).sum().astype(float) / len(preds)*100

In [41]:
# Accuracy of the model fitted in the previous cell, expressed as a percentage.
acc_xgb

83.65758754863813

In [42]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, classification_report, confusion_matrix

# Calculate accuracy
accuracy = accuracy_score(y_test, preds)

# Calculate precision, recall and F1-score.
# With scikit-learn's default binary averaging these score label 1 as the positive class.
precision = precision_score(y_test, preds)
recall = recall_score(y_test, preds)
f1 = f1_score(y_test, preds)

# Calculate AUC from the predicted probability of class 1: ranking quality
# needs scores, not the hard 0/1 predictions.
positive_proba = xgb.predict_proba(X_test)[:, 1]
auc = roc_auc_score(y_test, positive_proba)

# Display classification report and confusion matrix
class_report = classification_report(y_test, preds)
conf_matrix = confusion_matrix(y_test, preds)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC:", auc)
print("\nClassification Report:\n", class_report)
print("Confusion Matrix:\n", conf_matrix)

Accuracy: 0.8365758754863813
Precision: 0.8382352941176471
Recall: 0.8507462686567164
F1 Score: 0.8444444444444444

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.82      0.83       123
           1       0.84      0.85      0.84       134

    accuracy                           0.84       257
   macro avg       0.84      0.84      0.84       257
weighted avg       0.84      0.84      0.84       257

Confusion Matrix:
 [[101  22]
 [ 20 114]]
