In [2]:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from tsnecuda import TSNE
import time
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [2]:
def getParsing(inputRawData,mode):
    """
    This function will return the info based
    on input but exclues all unecessary info
    contain in the file
    
    Parameters
    ----------
    inputRawData : str
        the input file
        
    mode : str
        the mode (train, test, all)
    
    Returns
    -------
    dict
        key : str
            name of cell
        value : set(int)
            all id
    """
    # initialize all variable
    res = dict()
    nowKey = ""
    
    # delete all empty line then convert to list for traverse
    dataSplit = [x for x in inputRawData.split("\n") if x!='']
    
    # traverse start, input info in dict
    for line in dataSplit:
        lineSegment = line.split()
        idParsing = lineSegment[0].split(":")
        if (len(idParsing) == 2 and idParsing[0].isalpha() and idParsing[1].isdigit()):
            nowKey = " ".join(str(i) for i in lineSegment[1::])
        elif (len(lineSegment) == 3 and lineSegment[2].isdigit()):
            if (lineSegment[0][0] == '*' and (mode == "test" or mode == "all")): # case: test
                if (not nowKey in res.keys()):
                    res[nowKey] = set()
                res[nowKey].add(int(lineSegment[0][1::]))
            elif (lineSegment[0].isdigit() and (mode == "train" or mode == "all")): # case: train
                if (not nowKey in res.keys()):
                    res[nowKey] = set()
                res[nowKey].add(int(lineSegment[0]))
    return res

In [3]:
def getData(train):
    if train:
        path = 'data/train_data.h5'
        mode = 'train'
    else:
        path = 'data/test_data.h5'
        mode = 'test'

    # Load the data and convert to numpy array
    print("Start loading the data")
    store = pd.HDFStore(path)
    ori_rpkm = store['rpkm']
    samples = []
    cnt = 1
    total = len(ori_rpkm.index)
    for idx in ori_rpkm.index:
        if cnt % 1000 == 0:
            print(f'{cnt} / {total} finished')
        samples.append(ori_rpkm.loc[idx].tolist())
        cnt += 1
    samples = np.array(samples)
    labels = np.array([int(i.split('_')[0]) for i in ori_rpkm.index.tolist()])
    del store
    print('Finish loading the data')
    print('Samples:', samples.shape)
    print('Labels:', labels.shape)

    # Convert the label to index and save the corresponding index and cell type
    print("Start converting the labels")
    rawData = open("data/README.txt").read()
    name_id_dict = getParsing(rawData, mode)
    name_labels = []
    for l in labels:
        name = ''
        for key in name_id_dict:
            if l in name_id_dict[key]:
                name = key
                break
        name_labels.append(name)
    res_labels = []
    idx_name_dict = {}
    now_idx = 0
    for n in name_labels:
        if n not in idx_name_dict.keys():
            idx_name_dict[n] = now_idx
            res_labels.append(now_idx)
            now_idx += 1
        else:
            res_labels.append(idx_name_dict[n])
    with open(f'data/{mode}_idx_name.txt', 'w') as f:
        for key in idx_name_dict.keys():
            f.write(key)
            f.write(':')
            f.write(str(idx_name_dict[key]))
            f.write('\n')
    print("Finish converting the labels")
    return samples, res_labels


In [4]:
def tsne_test(train_data, train_labels, test_data, test_labels):
    tsne = TSNE()
    print('Start training TSNE')
    begin = time.time()
    reduced = tsne.fit_transform(train_data)
    print('Finish training')
    print(f"Time: {time.time() - begin}s")
    with open('tsne_reduced.txt', 'w') as f:
        for i in reduced:
            f.write(str(i[0]))
            f.write(' ')
            f.write(str(i[1]))
            f.write('\n')
    print('Start training classifier')
    begin = time.time()
    rfc = RandomForestClassifier()
    rfc.fit(reduced, train_labels)
    print('Finish training classifier')
    X_test = tsne.transform(test_data)
    print('Accuracy:', rfc.score(X_test, test_labels))

In [5]:
X, y = getData(False)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
tsne_test(X_train, y_train, X_test, y_test)

Start loading the data
1000 / 2855 finished
2000 / 2855 finished
Finish loading the data
Samples: (2855, 20499)
Labels: (2855,)
Start converting the labels
Finish converting the labels
Start training TSNE
Finish training
Time: 4.084206819534302s
Start training classifier
Finish training classifier


AttributeError: 'TSNE' object has no attribute 'transform'

In [2]:
import matplotlib.pyplot as plt
x = [1,3,4,5]
y = [1,3,4,5]
plt.plot(x,y)
plt.xlabel('n_estimators')
plt.ylabel('Accuracy')

'RandomForestClassifier'

In [8]:
store = pd.HDFStore('data/test_data.h5')
ori_labels = store['labels']
labels = []
for idx in ori_labels.index:
    labels.append(ori_labels[idx])

id_labels = []
idx_name_dict = {}
cnt = 0
for l in labels:
    if l not in idx_name_dict.keys():
        idx_name_dict[l] = cnt
        id_labels.append(cnt)
        cnt += 1
    else:
        id_labels.append(idx_name_dict[l])
np.unique(id_labels)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20])