In [2]:
import sys
from timeit import default_timer as timer

import numpy as np
import pandas as pd
import statsmodels.stats.multitest as smm

# ignore warnings
import warnings

# warnings.filterwarnings("ignore")


In [3]:
# Load the tab-separated score table; the first column supplies the row labels.
file = "../data/tf_scores_t.tsv"
df = pd.read_table(file, index_col=0)
print("Orignal Data shape: ", df.shape)

Orignal Data shape:  (9089, 425)


In [4]:
# Some input columns are named after several symbols at once ("A, B, C").
# Give every symbol its own column, each carrying a copy of the shared values.
comma_cols = df.columns[df.columns.str.contains(",")]

# Collect the expanded pieces in a list and concatenate once at the end;
# growing a DataFrame with pd.concat inside the loop re-copies it every pass.
expanded_parts = []
for col in comma_cols:
    # Strip surrounding whitespace so "A, B" yields "A" and "B" rather than
    # "A" and " B" (a leading space makes later label lookups silently miss).
    col_sp = [symbol.strip() for symbol in col.split(",")]
    new_cols = pd.concat([df[col]] * len(col_sp), axis=1)
    new_cols.columns = col_sp
    expanded_parts.append(new_cols)

# Keep `new_cols_df` defined even when there is nothing to expand.
if expanded_parts:
    new_cols_df = pd.concat(expanded_parts, axis=1)
else:
    new_cols_df = pd.DataFrame(index=df.index)

# drop() returns a new frame, so the raw `df` is left untouched.
# NOTE(review): if a symbol from a comma-separated name also exists as its own
# column, the result contains duplicate column labels -- confirm whether those
# should be merged.
single_df = pd.concat([df.drop(columns=comma_cols), new_cols_df], axis=1)

single_df.head()


Unnamed: 0_level_0,ABCA3,ABCB1,ABL1,ACTL6A,AGO2,AHR,AIP,AKT1,ANKRD1,APC,...,SUPT7L,TBL1XR1,TBL1X,TNRC6C,MOV10,AGO3,AGO4,AGO1,TNRC6A,TNRC6B
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,0.0,-1.428313,1.702678,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,0.0,-0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,0.0,-0.420257,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [5]:
# Keep only the columns that hold at least one non-zero value,
# i.e. drop every column that is zero in all rows.
nonzero_cols = single_df.ne(0).any(axis=0)
single_df = single_df.loc[:, nonzero_cols]
print("Data shape after removing columns with mean 0: ", single_df.shape)

Data shape after removing columns with mean 0:  (9089, 505)


In [6]:
# Keep only the rows that hold at least one non-zero value,
# i.e. drop every row that is zero in all columns.
nonzero_rows = single_df.ne(0).any(axis=1)
single_df = single_df.loc[nonzero_rows]
print("Data shape after removing rows with mean 0: ", single_df.shape)

Data shape after removing rows with mean 0:  (9038, 505)


In [7]:
# 

In [8]:
# Standardise each column: subtract its mean, then divide by its standard
# deviation (both computed with numpy's default settings).
def _zscore(column):
    """Return `column` centred on its mean and scaled by its standard deviation."""
    centred = column - np.mean(column)
    return centred / np.std(column)


single_df = single_df.apply(_zscore, axis=0)
single_df.head()

Unnamed: 0_level_0,ABCA3,ABCB1,ABL1,ACTL6A,AGO2,AHR,AIP,AKT1,ANKRD1,APC,...,SUPT7L,TBL1XR1,TBL1X,TNRC6C,MOV10,AGO3,AGO4,AGO1,TNRC6A,TNRC6B
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
BPK.12x.4NQO_AAACCTGCACCCAGTG.1,-0.014458,0.314976,0.149965,0.329,0.063104,-0.067509,0.02394,0.102421,-0.023238,-0.101592,...,-0.035036,-0.167138,-0.167138,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914
BPK.12x.4NQO_AAACCTGCAGCTTAAC.1,-0.014458,0.314976,0.149965,0.329,0.063104,-0.067509,0.02394,0.102421,-0.023238,-0.101592,...,-0.035036,-0.167138,-0.167138,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914
BPK.12x.4NQO_AAACCTGGTGTGCGTC.1,-0.014458,-1.049579,3.742781,0.329,0.063104,-0.067509,0.02394,0.102421,-0.023238,-0.101592,...,-0.035036,-0.167138,-0.167138,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914
BPK.12x.4NQO_AAACCTGGTTGAACTC.1,-0.014458,0.314976,0.149965,0.329,0.063104,-0.067509,0.02394,0.102421,-0.023238,-0.101592,...,-0.035036,-0.167138,-0.167138,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914
BPK.12x.4NQO_AAACGGGAGGATGGTC.1,-0.014458,-0.086521,0.149965,0.329,0.063104,-0.067509,0.02394,0.102421,-0.023238,-0.101592,...,-0.035036,-0.167138,-0.167138,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914,0.054914


In [9]:
# Two-column frame for the first row of `single_df`: the column labels go into
# "Symbols" and that row's values go into "SignedP".
row = pd.DataFrame(
    {
        "Symbols": single_df.columns,
        "SignedP": single_df.iloc[0].to_numpy(),
    }
)

row.shape

(505, 2)

In [10]:
# Preview the per-row (Symbols, SignedP) frame for the first row only.
# NOTE: `row` is rebound here from the frame built earlier to a Series.
for idx, row in single_df.head(1).iterrows():
    cell = pd.DataFrame({"Symbols": single_df.columns, "SignedP": row.values})
    print(cell.head())
    

  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1  0.314976
2    ABL1  0.149965
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1  0.314976
2    ABL1  0.149965
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1 -1.049579
2    ABL1  3.742781
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1  0.314976
2    ABL1  0.149965
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1 -0.086521
2    ABL1  0.149965
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1  0.314976
2    ABL1 -0.944123
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1  0.314976
2    ABL1  0.149965
3  ACTL6A -1.714249
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458
1   ABCB1 -1.208400
2    ABL1  0.149965
3  ACTL6A  0.329000
4    AGO2  0.063104
  Symbols   SignedP
0   ABCA3 -0.014458


In [14]:
# Print the first two rows as Series, counting how many were shown.
count = 0
for idx, row in single_df.head(2).iterrows():
    print(row)
    count += 1

ABCA3     -0.014458
ABCB1      0.314976
ABL1       0.149965
ACTL6A     0.329000
AGO2       0.063104
             ...   
 AGO3      0.054914
 AGO4      0.054914
 AGO1      0.054914
 TNRC6A    0.054914
 TNRC6B    0.054914
Name: BPK.12x.4NQO_AAACCTGCACCCAGTG.1, Length: 505, dtype: float64
ABCA3     -0.014458
ABCB1      0.314976
ABL1       0.149965
ACTL6A     0.329000
AGO2       0.063104
             ...   
 AGO3      0.054914
 AGO4      0.054914
 AGO1      0.054914
 TNRC6A    0.054914
 TNRC6B    0.054914
Name: BPK.12x.4NQO_AAACCTGCAGCTTAAC.1, Length: 505, dtype: float64


In [2]:
import numpy as np

# Start from a 2D array with zero rows and three columns
empty_array_2d = np.empty((0, 3))

# The row to add (a plain Python list)
new_row = [1, 2, 3]

# Stack the same list onto the array twice; each list becomes one new row
empty_array_2d = np.vstack([empty_array_2d, new_row, new_row])

print(empty_array_2d)


[[1. 2. 3.]
 [1. 2. 3.]]
