# Preprocessing
- The features dataset has columns for samplenames and rows for features
- This is annoying and a mismatch with the label info
- This script will transpose it and save it to the same folder with the _t suffix

## Defs/Imports

In [4]:
from pathlib import Path
import pandas as pd
import csv
features_df_path = Path(r"c:\data\SCANB.csv")
features_t_df_path = str(features_df_path).replace(features_df_path.stem, features_df_path.stem+"_t")
print(f"source file: {features_df_path}")
print(f"saving to: {features_t_df_path}")

source file: c:\data\SCANB.csv
saving to: c:\data\SCANB_t.csv


## Process

In [5]:
col_names = []
cols = []

with open(features_df_path) as f:
    reader = csv.reader(f, delimiter=',')
    for row in reader:
        col_name = row[0]
        col = row[1:]
        col_names.append(col_name)
        cols.append(col)
col_names[0] = "samplename"
series_list = []
for col, col_name in  zip(cols, col_names):
    seri = pd.Series(data=col, name=col_name)
    series_list.append(seri)
    
df_t = pd.concat(series_list, axis=1)
df_t.to_csv(features_t_df_path)
print("Done Transposing!")

## Make Sure it Worked By Operning the Files

In [7]:
df_before = pd.read_csv(features_df_path)
df_before_shape = df_before.shape
df_after = pd.read_csv(features_t_df_path)
df_after_shape = df_after.shape
print(f"shape before and after: {df_before_shape} and {df_after_shape}")

shape before and after: (9264, 3070) and (3069, 9266)


In [8]:
df_before.head()

Unnamed: 0.1,Unnamed: 0,F1,F2,F3,F4,F5,F6,F9,F10,F11,...,F2841repl,F2860repl,F2872repl,F2877repl,F2911repl,F2912repl,F2958repl,F3057repl,F3085repl,F3135repl
0,A2M,-1.098234,0.110096,-1.779393,0.26836,0.616851,3.433406,-0.424064,0.357813,0.629575,...,0.340017,-0.427134,-0.811695,1.15509,-0.873224,-0.186538,-0.071234,1.535495,-2.769459,1.348425
1,A4GNT,-0.615048,-0.615048,-0.615048,-0.615048,-0.615048,-0.615048,-0.615048,-0.615048,3.324828,...,2.004878,0.408918,-0.615048,2.120522,-0.615048,2.27478,0.598461,1.185567,-0.615048,-0.615048
2,AAAS,-1.670827,0.248934,-0.422635,0.236302,-0.988261,-1.263231,-2.147768,-1.700644,0.055263,...,-0.807723,-1.566462,1.859718,-1.24086,1.058475,-0.234717,-0.501922,0.165044,0.883363,-1.641798
3,AACS,-0.53902,0.032084,0.392158,0.56354,-0.083663,-0.748093,-0.999103,-1.358139,2.157231,...,0.241122,-0.216516,0.841555,-1.0195,-0.108282,0.508552,0.663004,-0.257163,-0.884103,0.819606
4,AADAC,0.098325,0.262344,-0.71119,-0.373168,0.067739,-0.71119,-0.71119,-0.183505,-0.021691,...,-0.71119,2.818603,-0.71119,-0.71119,-0.71119,-0.71119,-0.71119,-0.71119,-0.056693,-0.71119


In [9]:
df_after.head()

Unnamed: 0.1,Unnamed: 0,samplename,A2M,A4GNT,AAAS,AACS,AADAC,AAK1,AAMP,AARS,...,ZSCAN12,ZSCAN16,ZSCAN18,ZSCAN5A,ZW10,ZWINT,ZXDA,ZXDC,ZYX,ZZEF1
0,0,F1,-1.098234,-0.615048,-1.670827,-0.53902,0.098325,-1.989513,-0.376823,0.439949,...,-0.271914,-0.607976,-0.349919,-1.740601,0.586037,-0.067762,2.314161,0.481213,-0.277116,-1.183044
1,1,F2,0.110096,-0.615048,0.248934,0.032084,0.262344,-1.21169,-0.480422,-0.933772,...,0.316885,-0.130563,1.064711,0.578536,-0.270144,-0.381822,0.239147,-1.087827,1.399415,-1.203251
2,2,F3,-1.779393,-0.615048,-0.422635,0.392158,-0.71119,-0.210792,1.145994,1.938679,...,-2.060746,-1.840604,-1.081964,-0.807412,1.520787,1.68211,-1.5933,0.110658,0.481292,-0.641483
3,3,F4,0.26836,-0.615048,0.236302,0.56354,-0.373168,-0.839605,0.75386,1.562852,...,-0.812436,-0.836057,0.505783,-0.035067,-0.184722,1.09469,1.002772,0.294238,0.449014,0.854496
4,4,F5,0.616851,-0.615048,-0.988261,-0.083663,0.067739,-0.732227,-0.760831,-0.655655,...,1.349677,-0.1145,0.833477,0.179142,-0.989962,-2.177457,1.244214,1.526451,0.095333,0.700526
