In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
import scanpy as sc
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, log_loss, roc_curve, auc

In [2]:
adata = sc.read("/data/mr423/project/data/3-OLINK_data_train_withOutlier_all.h5ad")
adata_test = sc.read("/data/mr423/project/data/3-OLINK_data_test_withOutlier_all.h5ad")

In [3]:
adata.obs

Unnamed: 0_level_0,sex,DoB_Year,DoB_Month,DoB_Day,DoB,Date_Attend,age,Age_Group
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2144829,0,1939,1,15,1939-01-15,2007-11-16,68.835044,60-70
3154285,0,1945,1,15,1945-01-15,2007-07-20,62.507871,60-70
1679423,1,1945,11,15,1945-11-15,2009-05-19,63.507187,60-70
1172610,1,1941,12,15,1941-12-15,2009-09-23,67.772758,60-70
4011532,1,1954,1,15,1954-01-15,2009-10-20,55.761807,50-60
...,...,...,...,...,...,...,...,...
1137580,1,1957,5,15,1957-05-15,2007-08-14,50.247775,50-60
3378384,0,1950,4,15,1950-04-15,2009-06-27,59.200548,50-60
1220136,0,1945,6,15,1945-06-15,2008-11-11,63.408624,60-70
4988172,0,1956,1,15,1956-01-15,2010-07-13,54.491444,50-60


In [4]:
adata_test.obs

Unnamed: 0_level_0,sex,DoB_Year,DoB_Month,DoB_Day,DoB,Date_Attend,age,Age_Group
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
4365741,0,1947,12,15,1947-12-15,2008-05-28,60.451745,60-70
4192150,1,1951,3,15,1951-03-15,2010-01-26,58.869268,50-60
1847588,0,1948,1,15,1948-01-15,2010-02-13,62.080767,60-70
2209343,0,1943,3,15,1943-03-15,2009-11-24,66.696783,60-70
4713095,1,1962,3,15,1962-03-15,2006-06-10,44.238193,40-50
...,...,...,...,...,...,...,...,...
2503594,0,1947,2,15,1947-02-15,2009-08-13,62.491444,60-70
3494250,1,1945,12,15,1945-12-15,2009-11-10,63.904175,60-70
5746191,1,1951,12,15,1951-12-15,2009-06-26,57.530459,50-60
4342815,0,1942,8,15,1942-08-15,2008-03-11,65.571526,60-70


In [5]:
age_group_mapping = {
    '30-40': 0,
    '40-50': 1,
    '50-60': 2,
    '60-70': 3,
    '70-80': 4,
}

adata.obs['Age_Group_Label'] = adata.obs['Age_Group'].map(age_group_mapping)
adata_test.obs['Age_Group_Label'] = adata_test.obs['Age_Group'].map(age_group_mapping)

In [6]:
adata.obs

Unnamed: 0_level_0,sex,DoB_Year,DoB_Month,DoB_Day,DoB,Date_Attend,age,Age_Group,Age_Group_Label
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2144829,0,1939,1,15,1939-01-15,2007-11-16,68.835044,60-70,3
3154285,0,1945,1,15,1945-01-15,2007-07-20,62.507871,60-70,3
1679423,1,1945,11,15,1945-11-15,2009-05-19,63.507187,60-70,3
1172610,1,1941,12,15,1941-12-15,2009-09-23,67.772758,60-70,3
4011532,1,1954,1,15,1954-01-15,2009-10-20,55.761807,50-60,2
...,...,...,...,...,...,...,...,...,...
1137580,1,1957,5,15,1957-05-15,2007-08-14,50.247775,50-60,2
3378384,0,1950,4,15,1950-04-15,2009-06-27,59.200548,50-60,2
1220136,0,1945,6,15,1945-06-15,2008-11-11,63.408624,60-70,3
4988172,0,1956,1,15,1956-01-15,2010-07-13,54.491444,50-60,2


 ### Load Binned data

In [7]:
data = pd.read_csv("/data/mr423/project/data/all_train_binned_data.csv")
data_test = pd.read_csv("/data/mr423/project/data/all_test_binned_data.csv")

In [8]:
data=data.set_index('Id')
data_test=data_test.set_index('Id')

In [9]:
data

Unnamed: 0_level_0,EIF4EBP1,EIF4G1,EIF5A,ENAH,ENG,ENPP2,ENPP5,ENPP7,ENTPD5,EGLN1,...,CHCHD6,CHM,CHP1,CHMP6,CHMP1A,CHGB,CHGA,CHRM1,KLK1,WFDC2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2144829,24,80,14,75,86,90,61,10,99,15,...,39,64,2,91,73,82,32,36,4,75
3154285,42,49,23,58,92,99,82,92,95,27,...,58,17,0,87,87,54,38,42,12,70
1679423,65,92,12,44,83,78,56,0,96,14,...,53,52,5,98,64,54,29,41,2,54
1172610,88,53,29,72,83,58,50,95,99,52,...,45,22,1,90,33,75,27,43,99,74
4011532,75,93,29,45,83,78,75,24,90,44,...,30,42,19,95,92,63,10,38,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1137580,72,84,21,57,83,76,78,54,98,31,...,36,32,12,97,84,47,19,30,3,44
3378384,70,12,38,52,95,90,83,60,98,25,...,53,7,17,63,25,69,39,42,60,69
1220136,57,73,52,43,94,92,70,84,99,18,...,39,24,3,95,75,78,17,31,30,68
4988172,47,91,28,44,90,86,83,6,99,10,...,42,41,10,94,91,70,21,49,94,53


In [10]:
# 找出索引的交集和差异
data.index.intersection(data_test.index)

Index([], dtype='int64', name='Id')

In [11]:
y_target = adata.obs['Age_Group_Label']
y_test = adata_test.obs['Age_Group_Label']

In [12]:
X_train, X_val, y_train, y_val = train_test_split(data, y_target, test_size=0.2,random_state=28)

In [13]:
X_train.index.intersection(X_val.index)

Index([], dtype='int64', name='Id')

In [14]:
model = LogisticRegression(penalty="l1", C=0.1, solver="liblinear")

In [15]:
model.fit(X_train, y_train)

In [17]:
actual_iterations = model.n_iter_[0]
print("实际收敛的迭代次数: ", actual_iterations)

实际收敛的迭代次数:  15


## Result

### Result for train

In [21]:
y_train_pred = model.predict(X_train)
accuracy = accuracy_score(y_train, y_train_pred)
conf_matrix = confusion_matrix(y_train, y_train_pred)
class_report = classification_report(y_train, y_train_pred)

print(f"Train Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Train Accuracy: 0.7974064269677982
Confusion Matrix:
 [[    0     1     0     0     0]
 [    0  5884   899    47     0]
 [    0  1287  5551  2811     0]
 [    0    29   779 12362     0]
 [    0     0     0   193     0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00         1
           1       0.82      0.86      0.84      6830
           2       0.77      0.58      0.66      9649
           3       0.80      0.94      0.86     13170
           4       0.00      0.00      0.00       193

    accuracy                           0.80     29843
   macro avg       0.48      0.48      0.47     29843
weighted avg       0.79      0.80      0.79     29843



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Result for val

In [22]:
y_val_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)
class_report = classification_report(y_val, y_val_pred)

print(f"Validation Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Validation Accuracy: 0.7911808068623509
Confusion Matrix:
 [[1435  209   19    0]
 [ 322 1334  716    0]
 [   7  231 3134    0]
 [   0    0   54    0]]
Classification Report:
               precision    recall  f1-score   support

           1       0.81      0.86      0.84      1663
           2       0.75      0.56      0.64      2372
           3       0.80      0.93      0.86      3372
           4       0.00      0.00      0.00        54

    accuracy                           0.79      7461
   macro avg       0.59      0.59      0.59      7461
weighted avg       0.78      0.79      0.78      7461



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Result for test

In [23]:
y_test_pred = model.predict(data_test)
accuracy = accuracy_score(y_test, y_test_pred)
conf_matrix = confusion_matrix(y_test, y_test_pred)
class_report = classification_report(y_test, y_test_pred)

print(f"Test Accuracy: {accuracy}")
print("Confusion Matrix:\n", conf_matrix)
print("Classification Report:\n", class_report)

Test Accuracy: 0.7857659831121834
Confusion Matrix:
 [[ 786  149    7    0]
 [ 167  745  417    0]
 [   4  128 1726    0]
 [   0    0   16    0]]
Classification Report:
               precision    recall  f1-score   support

           1       0.82      0.83      0.83       942
           2       0.73      0.56      0.63      1329
           3       0.80      0.93      0.86      1858
           4       0.00      0.00      0.00        16

    accuracy                           0.79      4145
   macro avg       0.59      0.58      0.58      4145
weighted avg       0.78      0.79      0.78      4145



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
