In [1]:
import os
import pickle
from pathlib import Path

import pandas as pd
from sklearn.metrics import f1_score

from sclearn.classifiers import CorrelationsClassifier, CalibratedThresholdsClassifier

In [2]:
# Directory that holds the preprocessed PBMC tables. Set the IMMUNOMIND_DATA_DIR
# environment variable to point the notebook at another location; when it is unset
# (or empty) the original author's local directory is used, so existing runs are unchanged.
DATA_DIR = Path(
    os.environ.get("IMMUNOMIND_DATA_DIR")
    or "/Users/vladimirshitov/Documents/education/Bioinformatics_Institute/ImmunoMind_project/from_team/"
)

# One CSV table per dataset, all located directly inside DATA_DIR.
PBMC_1_10X_PATH = DATA_DIR / "df_pbmc1_10x_v2_processed_svm.csv"
PBMC_1_DROP_PATH = DATA_DIR / "df_pbmc1_drop_processed_svm.csv"
PBMC_2_10X_PATH = DATA_DIR / "df_pbmc2_10x_v2_processed_svm.csv"

# Name of the label column; every other column of a table is treated as a feature.
TARGET = "CellType"

# Load the data

In [3]:
# Read the first PBMC 10x v2 table, using its "cells" column as the row index.
pbmc_1_10x = pd.read_csv(PBMC_1_10X_PATH, index_col="cells")
pbmc_1_10x

Unnamed: 0_level_0,RP5-1128N12.3,RP11-171G2.1,PKDREJ,BLACAT1,RP11-556I14.2,RPL32,KCNMB2,RPL13A,OR4N4,RPL34,...,CLEC1B,EIF1,RP1-102K2.8,GP9,ABLIM3,RP11-166N17.3,RP11-524F11.1,RP11-101E7.2,RPL18A,CellType
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pbmc1_10x_v2_A_AAAGATGCAAAGTCAA,-0.01762,0.0,-0.030528,0.0,-0.01762,0.236545,0.0,0.304782,0.0,0.227559,...,-0.042173,0.382498,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.327312,0
pbmc1_10x_v2_A_AAAGCAAGTAGGAGTC,-0.01762,0.0,-0.030528,0.0,-0.01762,1.034161,0.0,0.201786,0.0,0.290946,...,-0.042173,0.431878,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.656094,0
pbmc1_10x_v2_A_AAAGCAATCGGTTCGG,-0.01762,0.0,-0.030528,0.0,-0.01762,0.426253,0.0,-4.456715,0.0,-0.068908,...,-0.042173,0.466779,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.103277,0
pbmc1_10x_v2_A_AAAGTAGTCATTTGGG,-0.01762,0.0,-0.030528,0.0,-0.01762,0.987770,0.0,0.142321,0.0,0.687382,...,-0.042173,0.557082,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.582226,0
pbmc1_10x_v2_A_AAAGTAGTCCGAGCCA,-0.01762,0.0,-0.030528,0.0,-0.01762,1.242410,0.0,-1.173258,0.0,0.834467,...,-0.042173,0.141116,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,1.325783,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pbmc1_10x_v2_B_TCGCGTTCAAAGTCAA,-0.01762,0.0,-0.030528,0.0,-0.01762,0.386597,0.0,-0.004314,0.0,-0.308063,...,-0.042173,0.177718,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.832011,6
pbmc1_10x_v2_B_TGAAAGAGTCGCGAAA,-0.01762,0.0,-0.030528,0.0,-0.01762,0.808475,0.0,0.502082,0.0,0.253961,...,-0.042173,0.034980,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,1.045143,7
pbmc1_10x_v2_B_TGTGTTTCATGCATGT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.653090,0.0,-0.217025,0.0,-0.081840,...,-0.042173,0.456612,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.877768,6
pbmc1_10x_v2_B_TGTTCCGAGCCGATTT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.653170,0.0,0.253844,0.0,0.484469,...,-0.042173,0.416667,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.782764,6


In [4]:
# Read the second PBMC 10x v2 table, using its "cells" column as the row index.
pbmc_2_10x = pd.read_csv(PBMC_2_10X_PATH, index_col="cells")
pbmc_2_10x

Unnamed: 0_level_0,RP5-1128N12.3,RP11-171G2.1,PKDREJ,BLACAT1,RP11-556I14.2,RPL32,KCNMB2,RPL13A,OR4N4,RPL34,...,CLEC1B,EIF1,RP1-102K2.8,GP9,ABLIM3,RP11-166N17.3,RP11-524F11.1,RP11-101E7.2,RPL18A,CellType
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pbmc2_10X_V2_AAACCTGAGATGGGTC,-0.01762,0.0,-0.030528,0.0,-0.01762,1.003276,0.0,0.479661,0.0,0.607435,...,-0.042173,-0.507631,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.967635,5
pbmc2_10X_V2_AAACCTGAGCGTAATA,-0.01762,0.0,-0.030528,0.0,-0.01762,0.951827,0.0,1.001209,0.0,0.524792,...,-0.042173,-0.837248,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.967635,5
pbmc2_10X_V2_AAACCTGAGCTAGGCA,-0.01762,0.0,-0.030528,0.0,-0.01762,0.810825,0.0,0.608176,0.0,0.170900,...,-0.042173,0.144149,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.723335,2
pbmc2_10X_V2_AAACCTGAGGGTCTCC,-0.01762,0.0,-0.030528,0.0,-0.01762,0.662872,0.0,0.323684,0.0,0.075692,...,-0.042173,-0.046487,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.859237,1
pbmc2_10X_V2_AAACCTGGTCCGAACC,-0.01762,0.0,-0.030528,0.0,-0.01762,1.086766,0.0,0.499821,0.0,0.791950,...,-0.042173,-0.846165,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.735692,7
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pbmc2_10X_V2_TTTGTCACACGTCTCT,-0.01762,0.0,-0.030528,0.0,-0.01762,1.014367,0.0,0.681298,0.0,0.844131,...,-0.042173,-0.421049,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,1.171969,5
pbmc2_10X_V2_TTTGTCACAGCCAATT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.233719,0.0,-0.833770,0.0,0.244566,...,-0.042173,0.276007,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.223666,2
pbmc2_10X_V2_TTTGTCAGTACGCACC,-0.01762,0.0,-0.030528,0.0,-0.01762,-2.588569,0.0,-4.456715,0.0,-3.872194,...,-0.042173,-2.321578,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,-2.304526,7
pbmc2_10X_V2_TTTGTCATCAGTACGT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.736492,0.0,0.360369,0.0,0.595337,...,-0.042173,0.232927,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,1.021931,5


In [5]:
# Read the PBMC Drop-seq table, using its "cells" column as the row index.
pbmc_1_drop = pd.read_csv(PBMC_1_DROP_PATH, index_col="cells")
pbmc_1_drop

Unnamed: 0_level_0,RP5-1128N12.3,RP11-171G2.1,PKDREJ,BLACAT1,RP11-556I14.2,RPL32,KCNMB2,RPL13A,OR4N4,RPL34,...,CLEC1B,EIF1,RP1-102K2.8,GP9,ABLIM3,RP11-166N17.3,RP11-524F11.1,RP11-101E7.2,RPL18A,CellType
cells,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
pbmc1_Drop_AAAAGATGTGGT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.151350,0.0,0.291968,0.0,0.352809,...,-0.042173,0.538113,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.056209,7
pbmc1_Drop_AAAAGGATTTCC,-0.01762,0.0,-0.030528,0.0,-0.01762,0.756594,0.0,0.517785,0.0,0.060344,...,-0.042173,0.252500,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,-2.304526,2
pbmc1_Drop_AAAAGTGTTTAA,-0.01762,0.0,-0.030528,0.0,-0.01762,0.629515,0.0,-0.004276,0.0,0.187513,...,-0.042173,0.351329,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.515672,2
pbmc1_Drop_AAAATGATGTAT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.407930,0.0,0.269739,0.0,-0.376622,...,-0.042173,0.850211,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,1.042299,2
pbmc1_Drop_AAACAAACCGAC,-0.01762,0.0,-0.030528,0.0,-0.01762,0.058069,0.0,0.032035,0.0,-0.533383,...,-0.042173,0.641281,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,-0.244477,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
pbmc1_Drop_TCCACGCAAAGA,-0.01762,0.0,-0.030528,0.0,-0.01762,-0.447085,0.0,0.565932,0.0,-0.101248,...,-0.042173,-0.180648,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,-2.304526,4
pbmc1_Drop_TGAACACCCAGT,-0.01762,0.0,-0.030528,0.0,-0.01762,0.149105,0.0,0.349660,0.0,0.619208,...,-0.042173,0.101747,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.054061,4
pbmc1_Drop_TGGCCGTCAATA,-0.01762,0.0,-0.030528,0.0,-0.01762,-2.588569,0.0,0.169841,0.0,0.323683,...,-0.042173,0.283802,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,0.539177,4
pbmc1_Drop_TTAGCTCACTTC,-0.01762,0.0,-0.030528,0.0,-0.01762,0.405572,0.0,0.503892,0.0,0.309405,...,-0.042173,-2.321578,0.0,-0.0803,-0.034545,0.0,0.0,-0.01762,-0.423966,4


In [6]:
# (frame, label) pairs: each loaded table together with the name printed for it
# in the evaluation reports.
datasets = list(
    zip(
        (pbmc_1_10x, pbmc_1_drop, pbmc_2_10x),
        ("PBMC_1_10X", "PBMC_dropseq", "PBMC_2_10x"),
    )
)

# Classify cells based on the correlation with clusters

In [9]:
def crossvalidate(classifier_cls, datasets, target, **classifier_params):
    """Train a classifier on each dataset and report its macro F1 on every dataset.

    For every ``(frame, name)`` pair a fresh ``classifier_cls(**classifier_params)``
    is fitted on that frame, its per-cell-type thresholds are printed, and it is
    then scored on all datasets (including the one it was trained on).

    Parameters
    ----------
    classifier_cls : callable
        Factory for the model. The created object must provide ``fit(X, y)``,
        ``predict(X)`` and, after fitting, the ``cell_types`` and ``thresholds``
        attributes that are printed pairwise.
    datasets : iterable of (pandas.DataFrame, str)
        Tables paired with the name used for them in the printed report. Any
        iterable works, including a one-shot generator.
    target : str
        Name of the label column; all remaining columns are used as features.
    **classifier_params
        Keyword arguments forwarded unchanged to ``classifier_cls``.

    Returns
    -------
    None
        Results are only printed.
    """
    # Split every table into (features, labels, name) exactly once. The same arrays
    # serve as training input and as test input, so nothing is re-extracted inside
    # the train x test loop, and `datasets` is consumed a single time.
    # NOTE: `to_numpy()` discards column names, so features are matched by position;
    # all tables are assumed to share the same column order.
    prepared = [
        (frame.drop(target, axis="columns").to_numpy(), frame[target], name)
        for frame, name in datasets
    ]

    for X_train, y_train, train_name in prepared:
        print(f"Training dataset: {train_name}")

        model = classifier_cls(**classifier_params)
        model.fit(X_train, y_train)

        print("Thresholds:")
        for cell_type, threshold in zip(model.cell_types, model.thresholds):
            print(cell_type, round(threshold, 4), sep=": ")

        for X_test, y_test, test_name in prepared:
            y_pred = model.predict(X_test)
            score = f1_score(y_test, y_pred, average="macro")

            print(f"Score on dataset {test_name}:\t{round(score, 4)}")

        print("-" * 30)
        print()

In [10]:
# Cross-dataset evaluation of the correlation-based classifier with its default parameters.
crossvalidate(CorrelationsClassifier, datasets, TARGET)

Training dataset: PBMC_1_10X
Thresholds:
0: 0.3809
1: 0.76
2: -0.0321
3: 0.5349
4: 0.1294
5: 0.0781
6: -0.2085
7: -0.2634
8: 0.3902
Score on dataset PBMC_1_10X:	0.7023
Score on dataset PBMC_dropseq:	0.4311
Score on dataset PBMC_2_10x:	0.6823
------------------------------

Training dataset: PBMC_dropseq
Thresholds:
0: 0.0844
1: 0.6162
2: -0.0971
3: 0.3287
4: 0.5475
5: 0.084
6: 0.3501
7: -0.4309
8: 0.971
Score on dataset PBMC_1_10X:	0.4971
Score on dataset PBMC_dropseq:	0.6937
Score on dataset PBMC_2_10x:	0.5338
------------------------------

Training dataset: PBMC_2_10x
Thresholds:
0: 0.7747
1: 0.6669
2: -0.1146
3: 0.7489
4: 0.4769
5: 0.1531
6: 0.1082
7: -0.2567
8: 0.7352
Score on dataset PBMC_1_10X:	0.6472
Score on dataset PBMC_dropseq:	0.4034
Score on dataset PBMC_2_10x:	0.7438
------------------------------



# Use a pretrained SVM model to classify cells

In [11]:
# The pickled model is stored in the same directory as the CSV tables, so its
# location is derived from DATA_DIR instead of repeating the absolute path.
MODEL_PATH = DATA_DIR / "svm.pkl"

# SECURITY: unpickling executes arbitrary code contained in the file. Only load
# this pickle if it comes from a trusted source.
# NOTE(review): if this is a scikit-learn estimator, it should be loaded with the
# same scikit-learn version that produced it — TODO confirm and pin the version.
with open(MODEL_PATH, "rb") as f:
    model = pickle.load(f)

https://scikit-learn.org/stable/modules/model_persistence.html#security-maintainability-limitations


In [12]:
# Cross-dataset evaluation of the calibrated-thresholds classifier. The two keyword
# arguments are not consumed by crossvalidate itself; they are forwarded to the
# classifier's constructor.
crossvalidate(
    CalibratedThresholdsClassifier,
    datasets,
    TARGET,
    base_estimator=model,  # the model unpickled from MODEL_PATH
    cv=3,
)

Training dataset: PBMC_1_10X
Thresholds:
0: 0.5799
1: 0.0513
2: 0.1136
3: 0.1011
4: 0.0143
5: 0.0059
6: 0.0243
7: 0.0479
8: 0.8235
Score on dataset PBMC_1_10X:	0.8092
Score on dataset PBMC_dropseq:	0.6136
Score on dataset PBMC_2_10x:	0.752
------------------------------

Training dataset: PBMC_dropseq
Thresholds:
0: 0.4331
1: 0.4294
2: 0.2505
3: 0.333
4: 0.0532
5: 0.0649
6: 0.1184
7: 0.1368
8: 0.7111
Score on dataset PBMC_1_10X:	0.4638
Score on dataset PBMC_dropseq:	0.8007
Score on dataset PBMC_2_10x:	0.4832
------------------------------

Training dataset: PBMC_2_10x
Thresholds:
0: 0.6952
1: 0.0935
2: 0.1865
3: 0.16
4: 0.0047
5: 0.0868
6: 0.1865
7: 0.0886
8: 0.6345
Score on dataset PBMC_1_10X:	0.7257
Score on dataset PBMC_dropseq:	0.4983
Score on dataset PBMC_2_10x:	0.8393
------------------------------

