In [15]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.preprocessing import  MinMaxScaler

import warnings
warnings.simplefilter(action='ignore',  category=FutureWarning)

In [16]:
# Load data file
meta = pd.read_csv('Data/flux-test.csv')
meta.head()

Unnamed: 0,bcr_patient_barcode,C2_1,C2_2,C2_3,C2_4,C2_5,C2_6,C2_7,C2_8,C2_9,...,C2_13073,C2_13074,C2_13075,C2_13076,C2_13077,C2_13078,C2_13079,C2_13080,C2_13081,C2_13082
0,TCGA-S9-A7J2,0.0,0.0,0,0,0,0,0.0,1000.0,-962.186018,...,0,0,0,0,0,0.0,0,0.0,0,0
1,TCGA-C8-A1HL,0.0,0.0,0,0,0,0,0.0,79.141165,122.308826,...,0,0,0,0,0,0.0,0,0.0,0,0
2,TCGA-EW-A2FS,0.0,0.0,0,0,0,0,0.0,1000.0,-921.280541,...,0,0,0,0,0,0.0,0,0.0,0,0
3,TCGA-A2-A3XX,0.0,0.0,0,0,0,0,0.0,71.898737,217.641532,...,0,0,0,0,0,0.0,0,0.0,0,0
4,TCGA-BH-A0BQ,0.0,0.0,0,0,0,0,0.0,998.612976,-766.787397,...,0,0,0,0,0,0.0,0,0.0,0,0


In [17]:
# Check shape of data
meta.shape

(1213, 13083)

In [18]:
# Remove row with all zero -> no row
x= meta.loc[~(meta==0).all(axis=1)]
x.shape


(1213, 13083)

In [19]:
# Check missing values
meta.isna().sum().sum()

0

In [22]:
# Check duplicates in the data
meta['bcr_patient_barcode'].duplicated().sum().sum()


119

In [23]:
# Remove duplicates
meta= meta.drop_duplicates(subset=['bcr_patient_barcode'], keep='first')
meta['bcr_patient_barcode'].duplicated().sum().sum()

0

In [24]:
# Normalise data to prepare for feature selection
meta1 = meta.drop(['bcr_patient_barcode'], axis=1)
nm = MinMaxScaler()
meta_n = nm.fit_transform(meta1)
meta_n =pd.DataFrame(meta_n, columns=meta1.columns)
meta_n['bcr_patient_barcode'] = meta['bcr_patient_barcode']
meta_n.head()

Unnamed: 0,C2_1,C2_2,C2_3,C2_4,C2_5,C2_6,C2_7,C2_8,C2_9,C2_10,...,C2_13074,C2_13075,C2_13076,C2_13077,C2_13078,C2_13079,C2_13080,C2_13081,C2_13082,bcr_patient_barcode
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.718811,0.430435,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-S9-A7J2
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.465318,0.641109,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-C8-A1HL
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.718811,0.438381,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-EW-A2FS
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.463324,0.659628,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-A2-A3XX
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.718429,0.468393,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,TCGA-BH-A0BQ


In [25]:
# Check variance
check =  pd.DataFrame(meta_n.var(),  columns=['Var'])

# Set variance number
n = 99.5
# Variance quantile to drop all based on median
med_var = np.quantile(check['Var'], n/100)
sel = check[check['Var'] > med_var] 
sel.shape

(65, 1)

In [26]:
# rank variance
check.sort_values('Var', ascending=False)
check.describe()

Unnamed: 0,Var
count,13082.0
mean,0.025655
std,0.051456
min,0.0
25%,0.0
50%,0.001713
75%,0.017051
max,0.248852


In [27]:
meta_sel = meta[sel.index]  
meta_sel['bcr_patient_barcode'] = meta['bcr_patient_barcode']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  meta_sel['bcr_patient_barcode'] = meta['bcr_patient_barcode']


In [28]:
meta_sel.shape

(1094, 66)

In [29]:
meta_sel.head()

Unnamed: 0,C2_1707,C2_4603,C2_4712,C2_4988,C2_4992,C2_6305,C2_6368,C2_6545,C2_6579,C2_6877,...,C2_11370,C2_11371,C2_11372,C2_11979,C2_12581,C2_12583,C2_12688,C2_12689,C2_12691,bcr_patient_barcode
0,-1000.0,1000.0,1000.0,1000.0,-1000.0,1000.0,0.0,-1000.0,1000.0,-1000.0,...,1000.0,1000.0,1000.0,-1000.0,-1000.0,1000.0,-1000.0,-1000.0,1000.0,TCGA-S9-A7J2
1,-993.607311,993.607311,-1000.0,1000.0,-1000.0,-1000.0,-1000.0,-993.607311,993.607311,-1000.0,...,1000.0,1000.0,1000.0,-1000.0,1000.0,-1000.0,1000.0,1000.0,-1000.0,TCGA-C8-A1HL
2,-967.627195,967.627195,1000.0,1000.0,-1000.0,-1000.0,1000.0,-967.627195,967.627195,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,-1000.0,1000.0,1000.0,-1000.0,TCGA-EW-A2FS
3,-1000.0,1000.0,-1000.0,1000.0,-1000.0,-1000.0,-1000.0,-1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,-1000.0,-1000.0,1000.0,-1000.0,-1000.0,1000.0,TCGA-A2-A3XX
4,-1000.0,1000.0,-1000.0,1000.0,-1000.0,1000.0,1000.0,-1000.0,1000.0,1000.0,...,1000.0,1000.0,1000.0,1000.0,1000.0,-1000.0,1000.0,1000.0,-1000.0,TCGA-BH-A0BQ


In [30]:
# Extract data
meta_sel.to_csv(f'Data/FS_Var.csv', index =False)