In [30]:
import pandas as pd
import numpy as np
from scipy import stats
import pickle
from pygeneconverter import ensembl_to_hugo, hugo_to_ensembl
import time

In [31]:
def read_rna(file_path):
    """Load a tab-separated expression table and return per-gene z-scores.

    The file is read with its first column as the row index. Rows whose
    index label repeats are reduced to the first occurrence, then every row
    and every column still containing a missing value is dropped. The
    remaining values are transformed with ``log2(x + 1)``, the table is
    transposed, and each column of the transposed table is standardised
    with the sample standard deviation (``ddof=1``).

    Parameters
    ----------
    file_path : str or path-like
        Location of the tab-separated input file.

    Returns
    -------
    pandas.DataFrame
        Transposed, z-scored table; row labels are the input file's column
        headers and column labels are the input file's (string) row labels.
        A column with zero variance comes out as all-NaN (0 / 0).
    """
    counts = pd.read_csv(file_path, index_col=0, sep='\t')
    counts.index = counts.index.astype(str)
    counts = counts.loc[~counts.index.duplicated(keep='first')]
    # Rows first, then columns: the same order as the original clean-up.
    counts = counts.dropna(axis=0, how='any').dropna(axis=1, how='any')
    log_expr = np.log2(counts + 1).T
    # Standardise with pandas rather than scipy.stats.zscore: SciPy does not
    # promise to hand back a DataFrame, and callers need the row/column
    # labels. skipna=False keeps NaN propagation identical to zscore's
    # default nan_policy='propagate'.
    centered = log_expr - log_expr.mean(axis=0, skipna=False)
    rna = centered / log_expr.std(axis=0, ddof=1, skipna=False)
    print('Sample file loaded')
    return rna

In [32]:
# Tab-separated expression table that is passed to read_rna().
rna_data_file = 'raw_data_4samples.txt'
# Pickled, pre-trained classifier that classification() loads
# (presumably an SVM, judging by the file name — confirm).
save_model_file = 'svm_stemformatics.sav'

In [33]:
# Preview the normalised table that read_rna() returns for the configured input file.
read_rna(rna_data_file)

Sample file loaded


Unnamed: 0,ENSG00000121410,ENSG00000148584,ENSG00000166535,ENSG00000175899,ENSG00000128274,ENSG00000118017,ENSG00000081760,ENSG00000188984,ENSG00000204518,ENSG00000114771,...,ENSG00000086827,ENSG00000174442,ENSG00000122952,ENSG00000198205,ENSG00000198455,ENSG00000070476,ENSG00000203995,ENSG00000162378,ENSG00000074755,ENSG00000036549
Patient01,1.450494,1.5,0.421768,0.267274,1.07547,1.5,0.254006,,,-0.112039,...,1.309584,0.87618,1.056027,-0.540615,-0.439998,0.168674,1.243533,-0.731018,-1.175541,-1.142927
Patient02,-0.267563,-0.5,-0.853935,0.967771,0.606186,-0.5,-0.778994,,,-0.689466,...,-0.046741,-0.271878,-0.985304,1.436245,1.286828,1.231968,-1.06158,1.441367,0.710657,0.917245
Patient03,-0.341651,-0.5,-0.783798,0.166478,-1.016784,-0.5,1.309118,,,-0.64588,...,-1.124077,-1.296083,-0.710447,-0.09863,0.205836,-1.186585,-0.484753,-0.09462,-0.476434,0.755847
Patient04,-0.84128,-0.5,1.215964,-1.401524,-0.664871,-0.5,-0.78413,,,1.447385,...,-0.138766,0.691781,0.639725,-0.797,-1.052667,-0.214057,0.302799,-0.615728,0.941318,-0.530164


In [37]:
def converter(rna, gene = None):
    """Return ``rna`` with its columns labelled by Ensembl gene IDs.

    Parameters
    ----------
    rna : pandas.DataFrame
        Expression table with one column per gene.
    gene : {'Hugo', 'Ensembl'}
        Identifier scheme currently used for the columns of ``rna``.
        ``'Ensembl'`` returns ``rna`` untouched. ``'Hugo'`` looks the
        column labels up with ``hugo_to_ensembl`` and relabels them.

    Returns
    -------
    pandas.DataFrame
        For ``'Hugo'``: a table with the same rows as ``rna`` whose columns
        are Ensembl IDs. Genes without a mapping, and genes with a missing
        value in any row, are dropped.

    Raises
    ------
    ValueError
        If ``gene`` is neither ``'Hugo'`` nor ``'Ensembl'`` (this includes
        the default ``None``).
    """
    if gene == 'Ensembl':
        return rna
    if gene != 'Hugo':
        # Fail here with a clear message instead of returning None and
        # letting the caller crash later on an unrelated AttributeError.
        raise ValueError('Please provide appropriate gene format name')

    # Put genes on the index so they can be aligned with the mapping table.
    by_gene = rna.T
    # hugo_to_ensembl is expected to return a frame that has 'ENSEMBL_ID'
    # and 'HGNC_ID' columns, since both are selected here.
    # NOTE(review): if one symbol maps to several Ensembl IDs the index is
    # no longer unique and the concat below may fail — confirm the mapping.
    mapping = hugo_to_ensembl(by_gene.index)[['ENSEMBL_ID', 'HGNC_ID']].set_index('HGNC_ID')
    merged = pd.concat([mapping, by_gene], axis=1).dropna().set_index('ENSEMBL_ID')
    return merged.T

In [40]:
def classification(model_path, rna, gene=None):
    """Predict a class label for every row of ``rna`` with a pickled model.

    Parameters
    ----------
    model_path : str or path-like
        Pickle file holding a fitted model that exposes
        ``feature_names_in_`` and ``predict``.
    rna : pandas.DataFrame
        Expression table with one row per sample and one column per gene.
    gene : str, optional
        Identifier scheme of the columns of ``rna``; forwarded unchanged
        to ``converter``.

    Returns
    -------
    pandas.DataFrame
        Single column ``'class'`` indexed like ``rna``, holding one of
        ``'ECSC'``, ``'MCSC'``, ``'UCSC'`` or ``'iPCSC'`` per row.

    Raises
    ------
    KeyError
        If the model predicts a label outside ``0``–``3``.
    """
    # SECURITY: pickle.load can execute arbitrary code stored in the file;
    # only point model_path at files from a trusted source.
    with open(model_path, 'rb') as model_file:  # closes the handle deterministically
        model = pickle.load(model_file)
    feat = model.feature_names_in_
    rna = converter(rna, gene)
    # Align to the model's feature order: genes the input lacks are added
    # as 0, and any remaining NaN is replaced by 0 as well.
    rna = rna.reindex(columns=feat, fill_value=0).fillna(0)
    predictions = model.predict(rna)
    map_dict = {0: 'ECSC', 1: 'MCSC', 2: 'UCSC', 3: 'iPCSC'}
    pred = [map_dict[element] for element in predictions]
    pred_df = pd.DataFrame({'class': pred}, index=rna.index)
    print('Classification Done!')
    return pred_df

In [43]:
# Normalise the input file and classify each sample. The columns of the
# normalised table are already Ensembl IDs, so gene='Ensembl' is passed.
result = classification(save_model_file, read_rna(rna_data_file), gene='Ensembl')
result

Sample file loaded
Classification Done!


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Unnamed: 0,class
Patient01,MCSC
Patient02,UCSC
Patient03,UCSC
Patient04,iPCSC
