# 1. Importing Libraries

In [None]:
!pip install numpy

In [None]:
import pandas as pd
from tqdm import tqdm
import numpy as np

# 2. Loading Data

## 2.1 Count Matrix (Expression Data)

In [None]:
count_matrix = pd.read_csv("ROSMAP_DATA/ROSMAP_all_counts_matrix.txt", sep = '\t')

In [None]:
count_matrix

In [None]:
count_matrix.isnull().sum().sum()

In [None]:
count_matrix[['150_120419_0_merged']]

## 2.2 Gene Length

In [None]:
gene_length = pd.read_csv("ROSMAP_DATA/geneParameters.tsv", sep = '\t')

gene_length

In [None]:
gene_length.isnull().sum()

In [None]:
gene_length['gene.length'] = gene_length['gene.length'].fillna(gene_length['gene.length'].mean())

In [None]:
gene_length = gene_length[['Gene.ID','gene.length']]
gene_length.head()

In [None]:
# Remove the first four rows of the count table, in place.
# NOTE(review): presumably these are non-gene summary rows at the top of the
# file - confirm against the data. The rows are chosen by position in the
# *current* frame, so re-running this cell removes four more rows each time.
leading_rows = count_matrix.index[:4]
count_matrix.drop(index=leading_rows, inplace=True)
count_matrix.head()

## 2.3 Merging Count Matrix and Gene Length

In [None]:
# Lookup table: Gene.ID -> gene.length, used to annotate the count matrix.
# Built column-wise in a single pass instead of iterating row by row with
# iterrows(); if a Gene.ID occurs more than once, the last row wins, exactly
# as with row-by-row assignment.
dct = dict(zip(gene_length['Gene.ID'], gene_length['gene.length']))

In [None]:
def return_length(n):
    """
    Look up the gene length stored for feature ``n`` in the global ``dct``.

    When ``n`` has no entry (or cannot be used as a dictionary key), the
    identifier is printed so it can be inspected and 0 is returned as a
    placeholder. Callers that divide by the returned length must handle
    that 0 themselves.

    :n: feature identifier to look up
    :return: the stored gene length, or 0 when none is known
    """
    try:
        return dct[n]
    except (KeyError, TypeError):
        # Only lookup failures are expected here. Anything else - a
        # KeyboardInterrupt, or a NameError because ``dct`` was never
        # built - must surface instead of silently yielding length 0.
        print(n)
        return 0

count_matrix['length'] = count_matrix['feature'].apply(return_length)

In [None]:
count_matrix
Columns = list(count_matrix.columns) 
Columns.remove('feature')
Columns.remove('length')
feature = list(count_matrix['feature'])
count_matrix = count_matrix.set_index('feature')
count_matrix.head()

# 3. Normalizing using TPM

In [None]:
#Credit goes to: https://github.com/lucynwosu/TPM-Transcripts-Per-Million-Normalization-Python/blob/main/TPM-Transcripts-Per-Million-Normalization.ipynb

def read_counts2tpm(df):
    """
    Convert read counts to TPM (transcripts per million).

    Every count is divided by the gene length of its row, then each column
    is rescaled so that its length-normalised values sum to one million.

    Rows whose length is zero, negative or not finite cannot be
    length-normalised. They are returned as NaN and left out of the
    per-column scaling sum (as are NaN counts), so a single unusable
    length does not turn an entire column into inf/NaN.

    :df: a dataFrame that contains the read counts together with a
         'length' column holding the gene length of each row. Every column
         other than 'length' is treated as read counts.
    :return: 2-D numpy array of TPM values; rows follow the row order of
             ``df`` and columns follow the order of the non-'length' columns.
    """
    is_count_column = df.columns != 'length'
    counts = df.loc[:, is_count_column].to_numpy(dtype=float)
    lengths = df.loc[:, ['length']].to_numpy(dtype=float)

    # Mask lengths that would make the division meaningless (0 -> inf/NaN).
    usable = np.isfinite(lengths) & (lengths > 0)
    lengths = np.where(usable, lengths, np.nan)

    # Reads per base for every row/column pair; NaN where the length is unusable.
    per_length = counts / lengths

    # Per-column total of the length-normalised values, in units of one million.
    # nansum skips the masked rows so they do not poison the total.
    scaling_factor = np.nansum(per_length, axis=0).reshape(1, -1) / 1e6

    return per_length / scaling_factor

In [None]:
# Run the TPM normalisation and wrap the resulting array in a DataFrame.
# No labels are passed here, so rows and columns get default integer labels.
tpm_values = read_counts2tpm(count_matrix)
TPM_MasterCount = pd.DataFrame(tpm_values)

In [None]:
# Restore the labels on the normalised table: the saved column names go back
# on the columns and the saved feature names become the row index.
TPM_MasterCount.columns = Columns
TPM_MasterCount.insert(0, 'feature', feature, True)
# set_index returns a new frame, so the result has to be assigned back;
# otherwise the index stays the default integers and 'feature' remains an
# ordinary column.
TPM_MasterCount = TPM_MasterCount.set_index('feature')
TPM_MasterCount.head()

In [None]:
# Write the normalised table to a CSV file in the working directory.
# The row index is written as well (to_csv default).
TPM_MasterCount.to_csv(path_or_buf='Count_Matrix_TPM.csv')