#### Importing libraries

In [41]:
import numpy as np
import pandas as pd

#### Importing the human featureCounts data

In [43]:
# Location of the tab-separated featureCounts table.
# NOTE(review): this is an absolute, machine-specific path; the notebook will not run elsewhere unchanged.
data_path = "/home/pinky/Videos/human_feature_count_tpm_final/LX2_S1_feature_counts.tsv"
# read_table splits on tabs by default; the first row of the file supplies the column names.
data_frame = pd.read_table(data_path, header=0)
data_frame.head()

Unnamed: 0,Geneid,Chr,Start,End,Strand,Length,Counts
0,ENSG00000223972,1;1;1;1;1;1;1;1;1,11869;12010;12179;12613;12613;12975;13221;1322...,12227;12057;12227;12721;12697;13052;13374;1440...,+;+;+;+;+;+;+;+;+,1735,0
1,ENSG00000227232,1;1;1;1;1;1;1;1;1;1;1,14404;15005;15796;16607;16858;17233;17606;1791...,14501;15038;15947;16765;17055;17368;17742;1806...,-;-;-;-;-;-;-;-;-;-;-,1351,4
2,ENSG00000278267,1,17369,17436,-,68,4
3,ENSG00000243485,1;1;1;1;1,29554;30267;30564;30976;30976,30039;30667;30667;31109;31097,+;+;+;+;+,1021,0
4,ENSG00000284332,1,30366,30503,+,138,0


##### We keep only the gene ID, gene length and read counts. The other columns (Chr, Start, End, Strand) are not needed to calculate Transcripts Per Million (TPM), so we drop them.

In [46]:
# Remove the 'Chr' and 'Start' columns, which the TPM calculation does not use.
# The frame is re-bound instead of mutated with inplace=True, and errors='ignore'
# makes a second execution of this cell a no-op rather than a KeyError.
data_frame = data_frame.drop(columns=['Chr', 'Start'], errors='ignore')
data_frame

Unnamed: 0,Geneid,End,Strand,Length,Counts
0,ENSG00000223972,12227;12057;12227;12721;12697;13052;13374;1440...,+;+;+;+;+;+;+;+;+,1735,0
1,ENSG00000227232,14501;15038;15947;16765;17055;17368;17742;1806...,-;-;-;-;-;-;-;-;-;-;-,1351,4
2,ENSG00000278267,17436,-,68,4
3,ENSG00000243485,30039;30667;30667;31109;31097,+;+;+;+;+,1021,0
4,ENSG00000284332,30503,+,138,0
...,...,...,...,...,...
60678,ENSG00000271254,6370;6370;6370;6370;7102;7102;7102;7102;7930;7...,-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;-;...,4520,770
60679,ENSG00000275405,22024,-,164,4
60680,ENSG00000275987,30580,-,144,0
60681,ENSG00000277475,32528,-,831,56


In [47]:
# Remove the 'End' and 'Strand' columns, which the TPM calculation does not use.
# The frame is re-bound instead of mutated with inplace=True, and errors='ignore'
# makes a second execution of this cell a no-op rather than a KeyError.
data_frame = data_frame.drop(columns=['End', 'Strand'], errors='ignore')

In [48]:
# Display the frame after the drops; the output lists the Geneid, Length and Counts columns.
data_frame

Unnamed: 0,Geneid,Length,Counts
0,ENSG00000223972,1735,0
1,ENSG00000227232,1351,4
2,ENSG00000278267,68,4
3,ENSG00000243485,1021,0
4,ENSG00000284332,138,0
...,...,...,...
60678,ENSG00000271254,4520,770
60679,ENSG00000275405,164,4
60680,ENSG00000275987,144,0
60681,ENSG00000277475,831,56


##### Multiplying the gene length by 0.001 to convert base pairs (bp) into kilobases (kb)

In [49]:
# Gene length in kilobases: scale every value of the 'Length' column by 0.001.
data_frame_length_kb = data_frame['Length'].mul(0.001)

In [50]:
# Display the per-gene lengths after scaling by 0.001.
data_frame_length_kb

0        1.735
1        1.351
2        0.068
3        1.021
4        0.138
         ...  
60678    4.520
60679    0.164
60680    0.144
60681    0.831
60682    0.510
Name: Length, Length: 60683, dtype: float64

##### Adding the converted kb values to the data frame as a separate column named length_kb

In [51]:
# Store the scaled lengths in the frame under the 'length_kb' column (values align on the row index).
data_frame.loc[:, 'length_kb'] = data_frame_length_kb


In [52]:
# Display the frame, which now carries the length_kb column.
data_frame

Unnamed: 0,Geneid,Length,Counts,length_kb
0,ENSG00000223972,1735,0,1.735
1,ENSG00000227232,1351,4,1.351
2,ENSG00000278267,68,4,0.068
3,ENSG00000243485,1021,0,1.021
4,ENSG00000284332,138,0,0.138
...,...,...,...,...
60678,ENSG00000271254,4520,770,4.520
60679,ENSG00000275405,164,4,0.164
60680,ENSG00000275987,144,0,0.144
60681,ENSG00000277475,831,56,0.831


##### Calculating Reads Per Kilobase (RPK)
Each gene's read count is divided by that gene's length in kilobases (kb).

In [53]:
# Reads per kilobase: each row's 'Counts' value divided by its 'length_kb' value.
rpk = data_frame['Counts'].div(data_frame['length_kb'])

In [54]:
# Display the per-gene RPK values.
rpk

0          0.000000
1          2.960770
2         58.823529
3          0.000000
4          0.000000
            ...    
60678    170.353982
60679     24.390244
60680      0.000000
60681     67.388688
60682      0.000000
Length: 60683, dtype: float64

##### Adding the RPK output to the data frame as a separate column named rpk_values

In [55]:
# Store the RPK series in the frame under the 'rpk_values' column (values align on the row index).
data_frame.loc[:, 'rpk_values'] = rpk

In [56]:
# Display the frame, which now carries the rpk_values column.
data_frame

Unnamed: 0,Geneid,Length,Counts,length_kb,rpk_values
0,ENSG00000223972,1735,0,1.735,0.000000
1,ENSG00000227232,1351,4,1.351,2.960770
2,ENSG00000278267,68,4,0.068,58.823529
3,ENSG00000243485,1021,0,1.021,0.000000
4,ENSG00000284332,138,0,0.138,0.000000
...,...,...,...,...,...
60678,ENSG00000271254,4520,770,4.520,170.353982
60679,ENSG00000275405,164,4,0.164,24.390244
60680,ENSG00000275987,144,0,0.144,0.000000
60681,ENSG00000277475,831,56,0.831,67.388688


##### Summation of all of the RPK values

In [57]:
# Add up the RPK values of all rows; the total is used to derive the per-million scaling factor.
rpk_column = data_frame.loc[:, 'rpk_values']
total_rpk = rpk_column.sum()

In [58]:
# Display the summed RPK.
total_rpk

20853325.270145394

##### Determination of the per-million scaling factor
Dividing the sum of all Reads Per Kilobase (RPK) values by 1,000,000

In [59]:
# Per-million scaling factor: the summed RPK expressed in units of one million.
ONE_MILLION = 10 ** 6
per_million_scalling_factor = total_rpk / ONE_MILLION

In [60]:
# Display the per-million scaling factor.
per_million_scalling_factor

20.853325270145394

##### Calculating the Transcripts Per Million (TPM) values
Dividing each RPK value by the per-million scaling factor

In [61]:
# TPM: every RPK value divided by the per-million scaling factor.
tpm = data_frame['rpk_values'].div(per_million_scalling_factor)

In [62]:
# Display the per-gene TPM values.
tpm

0        0.000000
1        0.141981
2        2.820823
3        0.000000
4        0.000000
           ...   
60678    8.169152
60679    1.169609
60680    0.000000
60681    3.231556
60682    0.000000
Name: rpk_values, Length: 60683, dtype: float64

##### Addition of TPM values to the data frame

In [63]:
# Store the TPM series in the frame under the 'tpm' column (values align on the row index).
data_frame.loc[:, 'tpm'] = tpm

In [64]:
# Display the frame, which now carries the tpm column.
data_frame

Unnamed: 0,Geneid,Length,Counts,length_kb,rpk_values,tpm
0,ENSG00000223972,1735,0,1.735,0.000000,0.000000
1,ENSG00000227232,1351,4,1.351,2.960770,0.141981
2,ENSG00000278267,68,4,0.068,58.823529,2.820823
3,ENSG00000243485,1021,0,1.021,0.000000,0.000000
4,ENSG00000284332,138,0,0.138,0.000000,0.000000
...,...,...,...,...,...,...
60678,ENSG00000271254,4520,770,4.520,170.353982,8.169152
60679,ENSG00000275405,164,4,0.164,24.390244,1.169609
60680,ENSG00000275987,144,0,0.144,0.000000,0.000000
60681,ENSG00000277475,831,56,0.831,67.388688,3.231556


##### Saving the data frame in TSV file format

In [66]:
# Write the frame to a tab-separated file in the current working directory.
# The row index is written too (to_csv default), as the first, unnamed column.
data_frame.to_csv(path_or_buf="feature_counts_human_TPM.tsv", sep="\t")

##### Sorting the data frame by TPM value in descending order

In [67]:
# Order the rows from highest to lowest TPM; each row keeps its original index label.
sorted_data_frame = data_frame.sort_values("tpm", ascending=False)

##### Saving the sorted data frame in TSV file format

In [68]:
# Write the TPM-sorted frame to a tab-separated file in the current working directory.
# The row index is written too (to_csv default), as the first, unnamed column.
sorted_data_frame.to_csv(path_or_buf="featurecount_human_TPM_sorted.tsv", sep="\t")

In [69]:
# Display the frame ordered by TPM, highest values first.
sorted_data_frame

Unnamed: 0,Geneid,Length,Counts,length_kb,rpk_values,tpm
6876,ENSG00000034510,461,100130,0.461,217201.735358,10415.688268
57039,ENSG00000087086,871,150850,0.871,173191.733639,8305.233405
51999,ENSG00000184009,2920,503746,2.920,172515.753425,8272.817461
21695,ENSG00000075624,3413,472430,3.413,138420.744213,6637.825978
60611,ENSG00000198840,346,47882,0.346,138387.283237,6636.221391
...,...,...,...,...,...,...
26734,ENSG00000271660,310,0,0.310,0.000000,0.000000
26735,ENSG00000284377,1506,0,1.506,0.000000,0.000000
26736,ENSG00000216171,84,0,0.084,0.000000,0.000000
26737,ENSG00000207871,84,0,0.084,0.000000,0.000000
