<a href="https://colab.research.google.com/github/SaralaMuthu/Learntocode/blob/master/BPDFCBF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **FCBF Feature Selection**

bpdFSclasss.csv which is in the drive has been Transposed bpdtest.csv dataset with FCBF final program

In [26]:
!python /content/fcbff.py -inpath='/content/bpdtest.csv' -thresh=0.01

Reading file. Please wait ...
Success! Dimensions: 87 x 54676
Performing FCBF selection. Please wait ...
X: (87, 54675), y: 87
Done!

#Features selected: 2
Selected feature indices:
[[2.70252494e-01 6.25800000e+03]
 [2.66910180e-01 0.00000000e+00]]

File saved successfully. Path: /content/features_bpdtest.csv


**To Mount the GDrive**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


**Running utilities.py from https://github.com/shiralkarprashant/FCBF**

In [None]:
!python /content/utilities.py


Vec 1: [ 1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17. 18.
 19. 20.]
Entropy: 4.321928094887363
Vec 2: [4 5 6 7 4 5 6 7 4 5 6 7 4 5 6 7 4 5 6 7]
Entropy: 2.0
Mutual information: 2.0000000000000004
Symmetrical uncertainty: 0.6327183637591292


**Running FCBF.py from https://github.com/shiralkarprashant/FCBF as routines**

In [2]:
#!/usr/bin/env python
# encoding: utf-8

In [6]:
import sys
import os
import argparse
import numpy as np
import pandas as pd

In [7]:
def entropy(vec, base=2):
	" Returns the empirical entropy H(X) in the input vector."
	_, vec = np.unique(vec, return_counts=True)
	prob_vec = np.array(vec/float(sum(vec)))
	if base == 2:
		logfn = np.log2
	elif base == 10:
		logfn = np.log10
	else:
		logfn = np.log
	return prob_vec.dot(prob_vec)

In [8]:
def conditional_entropy(x, y):
	"Returns H(X|Y)."
	uy, uyc = np.unique(y, return_counts=True)
	prob_uyc = uyc/float(sum(uyc))
	cond_entropy_x = np.array([entropy(x[y == v]) for v in uy])
	return prob_uyc.dot(cond_entropy_x)

In [9]:
def mutual_information(x, y):
	" Returns the information gain/mutual information [H(X)-H(X|Y)] between two random vars x & y."
	return entropy(x) - conditional_entropy(x, y)

In [10]:
def symmetrical_uncertainty(x, y):
	" Returns 'symmetrical uncertainty' (SU) - a symmetric mutual information measure."
	return 2.0*mutual_information(x, y)/(entropy(x) + entropy(y))


In [11]:
def getFirstElement(d):
	"""
	Returns tuple corresponding to first 'unconsidered' feature

	Parameters:
	----------
	d : ndarray
		A 2-d array with SU, original feature index and flag as columns.

	Returns:
	-------
	a, b, c : tuple
		a - SU value, b - original feature index, c - index of next 'unconsidered' feature
	"""

	t = np.where(d[:,2]>0)[0]
	if len(t):
		return d[t[0],0], d[t[0],1], t[0]
	return None, None, None

In [12]:
def getNextElement(d, idx):
	"""
	Returns tuple corresponding to the next 'unconsidered' feature.

	Parameters:
	-----------
	d : ndarray
		A 2-d array with SU, original feature index and flag as columns.
	idx : int
		Represents original index of a feature whose next element is required.

	Returns:
	--------
	a, b, c : tuple
		a - SU value, b - original feature index, c - index of next 'unconsidered' feature
	"""
	t = np.where(d[:,2]>0)[0]
	t = t[t > idx]
	if len(t):
		return d[t[0],0], d[t[0],1], t[0]
	return None, None, None

In [13]:
def removeElement(d, idx):
	"""
	Returns data with requested feature removed.

	Parameters:
	-----------
	d : ndarray
		A 2-d array with SU, original feature index and flag as columns.
	idx : int
		Represents original index of a feature which needs to be removed.

	Returns:
	--------
	d : ndarray
		Same as input, except with specific feature removed.
	"""
	d[idx,2] = 0
	return d

In [14]:
def c_correlation(X, y):
	"""
	Returns SU values between each feature and class.

	Parameters:
	-----------
	X : 2-D ndarray
		Feature matrix.
	y : ndarray
		Class label vector

	Returns:
	--------
	su : ndarray
		Symmetric Uncertainty (SU) values for each feature.
	"""
	su = np.zeros(X.shape[1])
	for i in np.arange(X.shape[1]):
		su[i] = symmetrical_uncertainty(X[:,i], y)
		print(su[i])
	return su

In [15]:
def fcbf(X, y, thresh):
	"""
	Perform Fast Correlation-Based Filter solution (FCBF).

	Parameters:
	-----------
	X : 2-D ndarray
		Feature matrix
	y : ndarray
		Class label vector
	thresh : float
		A value in [0,1) used as threshold for selecting 'relevant' features.
		A negative value suggest the use of minimum SU[i,c] value as threshold.

	Returns:
	--------
	sbest : 2-D ndarray
		An array containing SU[i,c] values and feature index i.
	"""
	n = X.shape[1]
	slist = np.zeros((n, 3))
	slist[:, -1] = 1

	# identify relevant features
	slist[:,0] = c_correlation(X, y) # compute 'C-correlation'
	idx = slist[:,0].argsort()[::-1]
	slist = slist[idx, ]
	slist[:,1] = idx
	if thresh < 0:
		thresh = np.median(slist[-1,0])
		print("Using minimum SU value as default threshold: {0}".format(thresh))
	elif thresh >= 1 or thresh > max(slist[:,0]):
		print(max(slist[:,0]))
		print("No relevant features selected for given threshold.")
		print("Please lower the threshold and try again.")
		#exit()

	slist = slist[slist[:,0]>thresh,:] # desc. ordered per SU[i,c]

	# identify redundant features among the relevant ones
	cache = {}
	m = len(slist)
	p_su, p, p_idx = getFirstElement(slist)
	for i in range(m):
		p = int(p)
		q_su, q, q_idx = getNextElement(slist, p_idx)
		if q:
			while q:
				q = int(q)
				if (p, q) in cache:
					pq_su = cache[(p,q)]
				else:
					pq_su = symmetrical_uncertainty(X[:,p], X[:,q])
					cache[(p,q)] = pq_su

				if pq_su >= q_su:
					slist = removeElement(slist, q_idx)
				q_su, q, q_idx = getNextElement(slist, q_idx)

		p_su, p, p_idx = getNextElement(slist, p_idx)
		if not p_idx:
			break

	sbest = slist[slist[:,2]>0, :2]
	return sbest


In [16]:
def fcbf_wrapper(inpath, thresh, delim=',', header=False, classAt=-1):
	"""
	Main function call to perform FCBF selection. Saves Symmetric Uncertainty (SU)
	values and 0-based indices of selected features to a CSV file at the same location
	as input file, with 'feature_' as prefix. e.g. 'feature_pima.csv' for 'pima.csv'.

	Parameters:
	-----------
	inpath : str
		Path containing training set.
	thresh : float
		A value in [0,1) used as threshold for selecting 'relevant' features.
		A negative value suggest the use of minimum SU[i,c] value as threshold.
	delim : str
		Character to be used to delimit input file. defaults to ','
	header : bool
		Whether the input file contains a header line. default to False.
	classAt : int
		0-based index of the class vector in the file. A value of -1 (default)
		suggest to use last column.
	"""
	if os.path.exists(inpath):
		try:
			print("Reading file. Please wait ...")
			if header:
				d = pd.read_csv(inpath, delimiter=str(delim), header=0,engine='python')
			else:
				d = pd.read_csv(inpath, delimiter=str(delim),engine='python')
			print("Success! Dimensions: ",d.shape)
			p=d.shape[1]
		except Exception(e):
			print("Input file loading failed. Please check the file.")
			print("Error:", e)
			raise e
			#exit()

		if classAt == -1:
			X = d.iloc[:, :d.shape[1]-1].values
			y = d.iloc[:,-1].values
		else:
			idx = np.arange(d.shape[1])
			X = d.iloc[:, idx[idx != classAt]].values
			y = d.iloc[:, classAt].values

		try:
			print("Performing FCBF selection. Please wait ...")
			print('X: {}, y: {}'.format(X.shape, len(y)))
			sbest = fcbf(X, y, thresh)
			print("Done!")
			print("\n#Features selected: {0}".format(len(sbest)))
			print("Selected feature indices:\n{0}".format(sbest))
			try:
				outpath = os.path.split(inpath)[0] \
							+ '/features_' + os.path.split(inpath)[1]
				np.savetxt(outpath, sbest, fmt="%0.8f,%d", newline="\n", \
				 			header='SU, 0-based Feature')
				print("\nFile saved successfully. Path: {0}".format(outpath))
			except Exception(e):
				print("Error encountered while saving file:", e)
		except Exception(e):
			print("Error:", e)
	else:
		print("The file you specified does not exist.")


In [None]:
sbest = fcbf(X, y, thresh)

In [None]:
def fcbf_wrapperr(inpath, thresh, delim=',', header=False, classAt=-1):
	"""
	Main function call to perform FCBF selection. Saves Symmetric Uncertainty (SU)
	values and 0-based indices of selected features to a CSV file at the same location
	as input file, with 'feature_' as prefix. e.g. 'feature_pima.csv' for 'pima.csv'.

	Parameters:
	-----------
	inpath : str
		Path containing training set.
	thresh : float
		A value in [0,1) used as threshold for selecting 'relevant' features.
		A negative value suggest the use of minimum SU[i,c] value as threshold.
	delim : str
		Character to be used to delimit input file. defaults to ','
	header : bool
		Whether the input file contains a header line. default to False.
	classAt : int
		0-based index of the class vector in the file. A value of -1 (default)
		suggest to use last column.
	"""
	if os.path.exists(inpath):
		try:
			print("Reading file. Please wait ...")
			if header:
				d = pd.read_csv(inpath, delimiter=str(delim), header=0,engine='python')
			else:
				d = pd.read_csv(inpath, delimiter=str(delim),engine='python')
			print("Success! Dimensions: ",d.shape)
			#p=d.shape[1]
		except Exception(e):
			print("Input file loading failed. Please check the file.")
			print("Error:", e)
			raise e
			exit()

		if classAt == -1:
			X = d.iloc[:, :d.shape[1]-1].values
			y = d.iloc[:,-1].values
		else:
			idx = np.arange(d.shape[1])
			X = d.iloc[:, idx[idx != classAt]].values
			y = d.iloc[:, classAt].values
		#print(X.shape);print(y.shape)
	#print(X.shape[1]);
	su=np.zeros(X.shape[1]);
	print(su)




**Reading a High Dimensional Data**





In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df=pd.read_csv("/content/drive/MyDrive/BPData/bpdFSclasss.csv",delimiter=',',engine='python')
print(df.shape)
print(df.shape[0])
print(df.shape[1])

(54676, 89)
54676
89


**Read a HDD and transpose it so that features(gene expression) in columns as attributes and samples in rows as instances**

In [27]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df=pd.read_csv("/content/drive/MyDrive/BPData/bpdFSclasss.csv",delimiter=',',engine='python')
print(df.shape)
X=df.T
print(X.shape)
df.head()
X.to_csv("content/drive/MyDrive/BPData/bpdtest.csv", sep=',' ,header=None,index=False)



(54676, 89)
(89, 54676)


OSError: ignored

In [25]:
d=pd.read_csv("/content/bpdtest.csv",delimiter=',',engine='python')
print(d.shape)
d.head()

(88, 54676)


Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,Class
0,6.781245,7.209577,10.224917,7.397761,2.800003,9.002311,4.469153,3.963464,11.682971,3.320923,...,11.507549,13.634878,13.354401,3.650186,3.61716,2.944756,2.864972,3.43205,3.32493,1.0
1,7.10554,7.04235,10.218934,7.826926,2.811085,8.908063,4.936953,3.884465,11.708093,3.385179,...,11.455276,13.642405,13.430102,3.721502,3.360808,2.993379,2.886092,3.43333,3.25563,1.0
2,7.176201,6.866277,9.967175,7.648635,2.88968,8.863025,4.714443,4.079035,11.551474,3.598951,...,11.526475,13.588778,13.374048,3.565228,3.435166,3.221675,2.845552,3.338108,3.221442,2.0
3,6.910155,7.107594,9.968917,7.515799,2.981619,8.886335,4.626546,3.902244,11.511711,3.536087,...,11.492295,13.666277,13.367172,3.664887,3.65663,3.092886,2.923808,3.425972,3.302892,2.0
4,7.229103,7.059184,9.251931,7.74245,2.853711,8.898094,4.598057,3.872721,12.303847,3.580876,...,11.555108,13.585962,13.367172,3.560082,3.49914,3.117756,2.92946,3.386154,3.339504,1.0


**GeneSample Data created in excel using Random function**

In [5]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
d=pd.read_csv("/content/drive/MyDrive/BPData/genesample.csv",delimiter=',',engine='python')
print("Success! Dimensions: {0} x {1}".format(d.shape[0], d.shape[1]))
print(d.shape)
d.head()


Success! Dimensions: 199 x 30
(199, 30)


Unnamed: 0.1,Unnamed: 0,gene1,gene2,gene3,gene4,gene5,gene6,gene7,gene8,gene9,...,gene20,gene21,gene22,gene23,gene24,gene25,gene26,gene27,gene28,class
0,sample1,74,196,328,383,463,188,457,152,210,...,267,403,223,163,303,181,227,258,479,1
1,sample2,336,148,175,129,160,310,494,89,50,...,464,464,187,250,305,253,392,60,401,0
2,sample3,133,20,98,27,368,261,218,102,261,...,114,442,463,470,58,66,34,13,69,1
3,sample4,16,182,121,82,314,183,36,254,464,...,316,438,178,284,330,359,477,95,147,0
4,sample5,145,369,399,289,465,304,94,180,324,...,415,142,463,440,455,37,358,188,113,0


In [None]:
fcbf_wrapperr('/content/testoneeval.csv', 0.05, ',',False,-1)

Reading file. Please wait ...
Success! Dimensions:  (88, 500)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0



**Testing FCBF routine with Lung Cancer  data **



In [None]:
fcbf_wrapper('/content/test.csv', 0.05)

Reading file. Please wait ...
Success! Dimensions:  (31, 57)
Performing FCBF selection. Please wait ...
X: (31, 57), y: 31
Done!

#Features selected: 4
Selected feature indices:
[[ 1.         56.        ]
 [ 0.18889273 55.        ]
 [ 0.0631983  10.        ]
 [ 0.05725075 53.        ]]

File saved successfully. Path: /content/features_lungcancer.csv


**Testing FCBF Routine with GeneSample.csv which is created using random function values in excel**

In [25]:
fcbf_wrapper('/content/drive/MyDrive/BPData/genesample.csv',0.02, ',',False,-1)

Reading file. Please wait ...
Success! Dimensions:  (199, 30)
Performing FCBF selection. Please wait ...
X: (199, 29), y: 199
-0.019840478564307082
-0.021617925364966775
-0.019775176118675544
-0.018284988047155996
-0.01903831872760349
-0.020846719799051876
-0.02039317164473611
-0.01982970554453452
-0.020677677090411836
-0.020456858182704023
-0.021881501705798213
-0.02055800063252787
-0.020563300662108257
-0.01940928626609231
-0.021046919455452923
-0.01892561181735224
-0.02012611725469493
-0.0169421002946046
-0.02029183401319673
-0.017478857907135194
-0.019639897362050057
-0.01914560555470194
-0.01968177057550579
-0.021146936408025604
-0.01691107271252215
-0.020074332760846313
-0.018402456262996345
-0.019942187866293596
-0.01873961499493414
-0.01691107271252215
No relevant features selected for given threshold.
Please lower the threshold and try again.
Done!

#Features selected: 0
Selected feature indices:
[]

File saved successfully. Path: /content/drive/MyDrive/BPData/features_genesam

**Testing Exponent bpd dataset with one feature**

In [None]:
fcbf_wrapper('/content/testoneeval.csv',.005, ',',False,-1)

Reading file. Please wait ...
Success! Dimensions:  (88, 500)
Performing FCBF selection. Please wait ...
X: (88, 499), y: 88
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0.9883751651254954
0

Installation of Tensorflow

In [None]:
!pip install tensorflow



Upgrade Python Kernel

In [None]:
!pip install -q --upgrade ipython
!pip install -q --upgrade ipykernel

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/798.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m276.5/798.7 kB[0m [31m8.1 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m798.7/798.7 kB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.6 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m70.0 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
google-colab 1.0.0 requires ipython==7.34.0, but you have ipython 8.14.0 which is incompatible.[0m[31m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m152.8/152.8 kB[0m [31m4.8 MB/s

Testing FCBF Routine with Karthik Sir Data
https://github.com/karthiksekaran/autism-biomarker-discovery/blob/master/tSNR-logistic.py

In [None]:
fcbf_wrapper('/content/testt.csv',0.05,',',False,-1)

Reading file. Please wait ...
Success! Dimensions:  (38, 42)
Performing FCBF selection. Please wait ...
X: (38, 41), y: 38
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
Done!

#Features selected: 2
Selected feature indices:
[[ 1. 40.]
 [ 1.  0.]]

File saved successfully. Path: /content/features_testt.csv
