In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
import scanpy as sc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, roc_curve, auc,precision_score,recall_score,f1_score

In [2]:
adata = sc.read("/data/mr423/project/data/3-OLINK_data_train_withOutlier_all.h5ad")
adata_test = sc.read("/data/mr423/project/data/3-OLINK_data_test_withOutlier_all.h5ad")

In [3]:
adata.obs

Unnamed: 0_level_0,sex,DoB_Year,DoB_Month,DoB_Day,DoB,Date_Attend,age,Age_Group
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2144829,0,1939,1,15,1939-01-15,2007-11-16,68.835044,60-70
3154285,0,1945,1,15,1945-01-15,2007-07-20,62.507871,60-70
1679423,1,1945,11,15,1945-11-15,2009-05-19,63.507187,60-70
1172610,1,1941,12,15,1941-12-15,2009-09-23,67.772758,60-70
4011532,1,1954,1,15,1954-01-15,2009-10-20,55.761807,50-60
...,...,...,...,...,...,...,...,...
1137580,1,1957,5,15,1957-05-15,2007-08-14,50.247775,50-60
3378384,0,1950,4,15,1950-04-15,2009-06-27,59.200548,50-60
1220136,0,1945,6,15,1945-06-15,2008-11-11,63.408624,60-70
4988172,0,1956,1,15,1956-01-15,2010-07-13,54.491444,50-60


In [4]:
adata_test.obs

Unnamed: 0_level_0,sex,DoB_Year,DoB_Month,DoB_Day,DoB,Date_Attend,age,Age_Group
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4365741,0,1947,12,15,1947-12-15,2008-05-28,60.451745,60-70
4192150,1,1951,3,15,1951-03-15,2010-01-26,58.869268,50-60
1847588,0,1948,1,15,1948-01-15,2010-02-13,62.080767,60-70
2209343,0,1943,3,15,1943-03-15,2009-11-24,66.696783,60-70
4713095,1,1962,3,15,1962-03-15,2006-06-10,44.238193,40-50
...,...,...,...,...,...,...,...,...
2503594,0,1947,2,15,1947-02-15,2009-08-13,62.491444,60-70
3494250,1,1945,12,15,1945-12-15,2009-11-10,63.904175,60-70
5746191,1,1951,12,15,1951-12-15,2009-06-26,57.530459,50-60
4342815,0,1942,8,15,1942-08-15,2008-03-11,65.571526,60-70


In [5]:
age_group_mapping = {
    '30-40': 0,
    '40-50': 1,
    '50-60': 2,
    '60-70': 3,
    '70-80': 4,
}

adata.obs['Age_Group_Label'] = adata.obs['Age_Group'].map(age_group_mapping)
adata_test.obs['Age_Group_Label'] = adata_test.obs['Age_Group'].map(age_group_mapping)

In [6]:
adata.obs

Unnamed: 0_level_0,sex,DoB_Year,DoB_Month,DoB_Day,DoB,Date_Attend,age,Age_Group,Age_Group_Label
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2144829,0,1939,1,15,1939-01-15,2007-11-16,68.835044,60-70,3
3154285,0,1945,1,15,1945-01-15,2007-07-20,62.507871,60-70,3
1679423,1,1945,11,15,1945-11-15,2009-05-19,63.507187,60-70,3
1172610,1,1941,12,15,1941-12-15,2009-09-23,67.772758,60-70,3
4011532,1,1954,1,15,1954-01-15,2009-10-20,55.761807,50-60,2
...,...,...,...,...,...,...,...,...,...
1137580,1,1957,5,15,1957-05-15,2007-08-14,50.247775,50-60,2
3378384,0,1950,4,15,1950-04-15,2009-06-27,59.200548,50-60,2
1220136,0,1945,6,15,1945-06-15,2008-11-11,63.408624,60-70,3
4988172,0,1956,1,15,1956-01-15,2010-07-13,54.491444,50-60,2


In [7]:
data = pd.read_csv("/data/mr423/project/data/3-OLINK_data_train_all_norm.csv")
data_test = pd.read_csv("/data/mr423/project/data/3-OLINK_data_test_all_norm.csv")

In [8]:
data=data.set_index('Id')
data_test=data_test.set_index('Id')

In [9]:
data

Unnamed: 0_level_0,EIF4EBP1,EIF4G1,EIF5A,ENAH,ENG,ENPP2,ENPP5,ENPP7,ENTPD5,EGLN1,...,CHCHD6,CHM,CHP1,CHMP6,CHMP1A,CHGB,CHGA,CHRM1,KLK1,WFDC2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2144829,0.156103,0.335547,0.130100,0.312544,0.369321,0.394242,0.256869,0.120990,0.572979,0.133999,...,0.190428,0.267989,0.075229,0.412949,0.301072,0.344849,0.171964,0.181050,0.097833,0.312948
3154285,0.199054,0.219478,0.149049,0.246452,0.450077,0.638855,0.354410,0.444193,0.496786,0.158452,...,0.247382,0.136232,0.000000,0.392001,0.391468,0.231631,0.186254,0.199211,0.122182,0.291977
1679423,0.278870,0.456557,0.126842,0.204109,0.371411,0.340023,0.244802,0.000000,0.502113,0.132524,...,0.233903,0.230272,0.106472,0.549238,0.274063,0.236622,0.165966,0.195700,0.083087,0.236544
1172610,0.373436,0.208746,0.147052,0.273067,0.336058,0.224401,0.200446,0.456935,0.578152,0.204918,...,0.187864,0.131574,0.057009,0.386624,0.155575,0.286401,0.142100,0.180326,0.637118,0.282086
4011532,0.357472,0.517743,0.163302,0.211867,0.412568,0.374036,0.354062,0.149790,0.481530,0.210503,...,0.167596,0.202356,0.138311,0.551349,0.509913,0.290553,0.119132,0.191383,0.152645,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1137580,0.327009,0.399850,0.150827,0.258535,0.394191,0.350602,0.364919,0.250173,0.588624,0.174877,...,0.190039,0.176876,0.124582,0.528691,0.397152,0.227137,0.144267,0.172892,0.090240,0.213467
3378384,0.251506,0.102156,0.152737,0.183943,0.455254,0.386770,0.324112,0.206436,0.539626,0.129014,...,0.187422,0.089019,0.112425,0.219653,0.130143,0.244557,0.155585,0.162137,0.205746,0.245806
1220136,0.200175,0.254228,0.190802,0.166639,0.397900,0.369949,0.240567,0.306946,0.597404,0.117963,...,0.156285,0.127781,0.081268,0.412508,0.262803,0.276575,0.116787,0.140523,0.137852,0.231372
4988172,0.199085,0.395712,0.151444,0.191822,0.389271,0.359196,0.342898,0.101840,0.623912,0.113531,...,0.185374,0.185091,0.112263,0.429611,0.397983,0.283071,0.137590,0.204322,0.438541,0.218087


In [10]:
adata.X

array([[0.15610342, 0.33554722, 0.13009958, ..., 0.18104968, 0.09783305,
        0.31294755],
       [0.19905436, 0.21947821, 0.14904949, ..., 0.19921145, 0.1221818 ,
        0.29197653],
       [0.27887018, 0.45655692, 0.1268418 , ..., 0.19569978, 0.08308709,
        0.23654383],
       ...,
       [0.20017455, 0.25422766, 0.19080205, ..., 0.14052317, 0.13785178,
        0.23137191],
       [0.19908542, 0.39571176, 0.15144385, ..., 0.20432226, 0.43854142,
        0.21808681],
       [0.26219315, 0.43112836, 0.1273578 , ..., 0.19244526, 0.43561455,
        0.1996472 ]])

In [11]:
# 找出索引的交集和差异
data.index.intersection(data_test.index)

Index([], dtype='int64', name='Id')

In [12]:
y_target = adata.obs['Age_Group_Label']
y_test = adata_test.obs['Age_Group_Label']

In [13]:
X_train, X_val, y_train, y_val = train_test_split(data, y_target, test_size=0.2,random_state=233)

In [14]:
X_train.index.intersection(X_val.index)

Index([], dtype='int64', name='Id')

In [15]:
model = LogisticRegression(penalty="l1", C=0.1, solver="liblinear")

In [16]:
model.fit(X_train, y_train)

In [17]:
actual_iterations = model.n_iter_[0]
print("实际收敛的迭代次数: ", actual_iterations)

实际收敛的迭代次数:  16


## 评估模型

### Result for train

In [18]:
y_train_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_pred)
conf_matrix = confusion_matrix(y_train, y_train_pred)
class_report = classification_report(y_train, y_train_pred)

print(f"Train Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Train Accuracy: 0.795596957410448
Confusion Matrix:
 [[    0     1     0     0     0]
 [    0  5826   908    56     0]
 [    0  1264  5439  2847     0]
 [    0    28   795 12478     0]
 [    0     0     0   201     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.82      0.86      0.84      6790
           2       0.76      0.57      0.65      9550
           3       0.80      0.94      0.86     13301
           4       0.00      0.00      0.00       201

    accuracy                           0.80     29843
   macro avg       0.48      0.47      0.47     29843
weighted avg       0.79      0.80      0.78     29843



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Result for val

In [19]:
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)
class_report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Validation Accuracy: 0.7839431711566814
Confusion Matrix:
 [[1454  232   17    0]
 [ 364 1353  754    0]
 [   7  192 3042    0]
 [   0    0   46    0]]
Classification Report:
               precision    recall  f1-score   support

           1       0.80      0.85      0.82      1703
           2       0.76      0.55      0.64      2471
           3       0.79      0.94      0.86      3241
           4       0.00      0.00      0.00        46

    accuracy                           0.78      7461
   macro avg       0.59      0.58      0.58      7461
weighted avg       0.78      0.78      0.77      7461



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Result for test

In [21]:
y_test_pred = model.predict(data_test)


precision = precision_score(y_test, y_test_pred, average='weighted')
recall = recall_score(y_test, y_test_pred, average='weighted')
f1_weight = f1_score(y_test, y_test_pred, average='weighted')
f1_micro = f1_score(y_test, y_test_pred, average='micro')
f1_macro = f1_score(y_test, y_test_pred, average='macro')
accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)

# 输出结果
print(f'precision: {precision}')
print(f'recall: {recall}')
print(f'f1_weight: {f1_weight}')
print(f'f1_micro: {f1_micro}')
print(f'f1_macro: {f1_macro}')
print(f'accuracy: {accuracy}')
print(f'conf_matrix: {conf_matrix}')

precision: 0.7778125339323869
recall: 0.7855247285886611
f1_weight: 0.7752256647985402
f1_micro: 0.7855247285886611
f1_macro: 0.5796106035854911
accuracy: 0.7855247285886611
conf_matrix: [[ 784  150    8    0]
 [ 159  739  431    0]
 [   5  120 1733    0]
 [   0    0   16    0]]


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
