# TMT-dataset-downsample
2.16.23

Want to make a downsampled version of the `Satpathy2020` and `Petralia2020` datasets. 
These are high-quality TMT datasets that would be good to include. 
But the original matrices are 53 MB and 103 MB, respectively.
I think the way to go is to randomly select a subset of rows and columns to retain, withholding the rest. 
Don't do any sort of filtering by percent missingness. 

In [1]:
import pandas as pd
import numpy as np

# Fixed seed so the random row/column draws made with `rng` are reproducible.
rng = np.random.default_rng(36)

#### Configs

In [2]:
# Target dimensions (rows x columns) of each downsampled matrix.
n_cols = 30
n_rows = 40_000

# Input locations; each dataset name is appended directly to `data_path`
# when the files are read.
dataset1 = "path/to/the/data"
dataset0 = "path/to/the/data"
data_path = "path/to/the/data"

#### Read in, count up the MVs 
For both datasets. 

In [3]:
# read in both full matrices
df0 = pd.read_csv(data_path + dataset0)
df1 = pd.read_csv(data_path + dataset1)
print(df0.shape)
print(df1.shape)

# boolean masks flagging the missing values (MVs); `isna` also handles
# non-numeric columns, where `np.isnan` would raise a TypeError
mv_mask0 = df0.isna()
mv_mask1 = df1.isna()

# get the MV fractions: missing entries / all entries
mv_count0 = int(mv_mask0.to_numpy().sum())
mv_frac0 = mv_count0 / df0.size

mv_count1 = int(mv_mask1.to_numpy().sum())
mv_frac1 = mv_count1 / df1.size

# fraction of observed (non-missing) entries, i.e. 1 - MV fraction;
# this is the quantity reported in the printed summary
obs_frac0 = (df0.size - mv_count0) / df0.size
obs_frac1 = (df1.size - mv_count1) / df1.size

print(np.around(obs_frac0, 3))
print(np.around(obs_frac1, 3))

# count up the number of MVs by row
mv_x_row0 = mv_mask0.sum(axis=1)
mv_x_row1 = mv_mask1.sum(axis=1)

(291317, 35)
(110739, 226)
0.597
0.455


#### Randomly select rows to retain
For both datasets. 

In [4]:
# convert to numpy
df0 = np.array(df0)
df1 = np.array(df1)

# randomly select row indices WITHOUT replacement, so that no row appears
# twice in the downsampled matrix (`rng.integers` can draw the same index
# more than once). The draw size is capped at the number of available rows,
# so a matrix with fewer than `n_rows` rows keeps all of them (shuffled)
# instead of raising.
keep_r_idx0 = rng.choice(
    df0.shape[0], size=min(n_rows, df0.shape[0]), replace=False
)
keep_r_idx1 = rng.choice(
    df1.shape[0], size=min(n_rows, df1.shape[0]), replace=False
)

# subset
df0_sub = df0[keep_r_idx0, :]
df1_sub = df1[keep_r_idx1, :]

print(df0_sub.shape)
print(df1_sub.shape)

(40000, 35)
(40000, 226)


#### Randomly select columns to retain 

In [7]:
# randomly select col indices WITHOUT replacement, so that no column is
# duplicated in the downsampled matrix (`rng.integers` can draw the same
# index more than once). The draw size is capped at the number of available
# columns so a narrow matrix keeps all of its columns instead of raising.
n_avail_c0 = df0_sub.shape[1]
n_avail_c1 = df1_sub.shape[1]
keep_c_idx0 = rng.choice(n_avail_c0, size=min(n_cols, n_avail_c0), replace=False)
keep_c_idx1 = rng.choice(n_avail_c1, size=min(n_cols, n_avail_c1), replace=False)

# subset
df0_sub = df0_sub[:, keep_c_idx0]
df1_sub = df1_sub[:, keep_c_idx1]

print(df0_sub.shape)
print(df1_sub.shape)

(40000, 30)
(40000, 30)


#### Write

In [8]:
# Write each downsampled matrix to CSV without the row index.
# NOTE: the matrices were converted to numpy arrays before subsetting, so the
# header row holds pandas' default integer column labels, not the original
# column names.
for sub_matrix, out_path in (
    (df0_sub, "path/to/the/data"),
    (df1_sub, "path/to/the/data"),
):
    pd.DataFrame(sub_matrix).to_csv(out_path, index=None)