<a href="https://colab.research.google.com/github/SaralaMuthu/Learntocode/blob/master/BPD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!python /content/fcbff.py -inpath='/content/bpd.csv' -thresh=0.5

In [3]:
#!/usr/bin/env python
# encoding: utf-8

In [4]:
import sys
import os
import argparse
import numpy as np
import pandas as pd

In [5]:
def entropy(vec, base=2):
	" Returns the empirical entropy H(X) in the input vector."
	_, vec = np.unique(vec, return_counts=True)
	prob_vec = np.array(vec/float(sum(vec)))
	if base == 2:
		logfn = np.log2
	elif base == 10:
		logfn = np.log10
	else:
		logfn = np.log
	return prob_vec.dot(-logfn(prob_vec))

In [6]:
def conditional_entropy(x, y):
	"Returns H(X|Y)."
	uy, uyc = np.unique(y, return_counts=True)
	prob_uyc = uyc/float(sum(uyc))
	cond_entropy_x = np.array([entropy(x[y == v]) for v in uy])
	return prob_uyc.dot(cond_entropy_x)

In [7]:
def mutual_information(x, y):
	" Returns the information gain/mutual information [H(X)-H(X|Y)] between two random vars x & y."
	return entropy(x) - conditional_entropy(x, y)

In [8]:
def symmetrical_uncertainty(x, y):
	" Returns 'symmetrical uncertainty' (SU) - a symmetric mutual information measure."
	return 2.0*mutual_information(x, y)/(entropy(x) + entropy(y))


In [9]:
def getFirstElement(d):
	"""
	Returns tuple corresponding to first 'unconsidered' feature
	
	Parameters:
	----------
	d : ndarray
		A 2-d array with SU, original feature index and flag as columns.
	
	Returns:
	-------
	a, b, c : tuple
		a - SU value, b - original feature index, c - index of next 'unconsidered' feature
	"""
	
	t = np.where(d[:,2]>0)[0]
	if len(t):
		return d[t[0],0], d[t[0],1], t[0]
	return None, None, None

In [10]:
def getNextElement(d, idx):
	"""
	Returns tuple corresponding to the next 'unconsidered' feature.
	
	Parameters:
	-----------
	d : ndarray
		A 2-d array with SU, original feature index and flag as columns.
	idx : int
		Represents original index of a feature whose next element is required.
		
	Returns:
	--------
	a, b, c : tuple
		a - SU value, b - original feature index, c - index of next 'unconsidered' feature
	"""
	t = np.where(d[:,2]>0)[0]
	t = t[t > idx]
	if len(t):
		return d[t[0],0], d[t[0],1], t[0]
	return None, None, None

In [11]:
def removeElement(d, idx):
	"""
	Returns data with requested feature removed.
	
	Parameters:
	-----------
	d : ndarray
		A 2-d array with SU, original feature index and flag as columns.
	idx : int
		Represents original index of a feature which needs to be removed.
		
	Returns:
	--------
	d : ndarray
		Same as input, except with specific feature removed.
	"""
	d[idx,2] = 0
	return d

In [12]:
def c_correlation(X, y):
	"""
	Returns SU values between each feature and class.
	
	Parameters:
	-----------
	X : 2-D ndarray
		Feature matrix.
	y : ndarray
		Class label vector
		
	Returns:
	--------
	su : ndarray
		Symmetric Uncertainty (SU) values for each feature.
	"""
	su = np.zeros(X.shape[1])
	for i in np.arange(X.shape[1]):
		su[i] = symmetrical_uncertainty(X[:,i], y)
	return su

In [13]:
def fcbf(X, y, thresh):
	"""
	Perform Fast Correlation-Based Filter solution (FCBF).
	
	Parameters:
	-----------
	X : 2-D ndarray
		Feature matrix
	y : ndarray
		Class label vector
	thresh : float
		A value in [0,1) used as threshold for selecting 'relevant' features. 
		A negative value suggest the use of minimum SU[i,c] value as threshold.
	
	Returns:
	--------
	sbest : 2-D ndarray
		An array containing SU[i,c] values and feature index i.
	"""
	n = X.shape[1]
	slist = np.zeros((n, 3))
	slist[:, -1] = 1

	# identify relevant features
	slist[:,0] = c_correlation(X, y) # compute 'C-correlation'
	idx = slist[:,0].argsort()[::-1]
	slist = slist[idx, ]
	slist[:,1] = idx
	if thresh < 0:
		thresh = np.median(slist[-1,0])
		print("Using minimum SU value as default threshold: {0}".format(thresh))
	elif thresh >= 1 or thresh > max(slist[:,0]):
		print("No relevant features selected for given threshold.")
		print("Please lower the threshold and try again.")
		exit()
		
	slist = slist[slist[:,0]>thresh,:] # desc. ordered per SU[i,c]
	
	# identify redundant features among the relevant ones
	cache = {}
	m = len(slist)
	p_su, p, p_idx = getFirstElement(slist)
	for i in range(m):
		p = int(p)
		q_su, q, q_idx = getNextElement(slist, p_idx)
		if q:
			while q:
				q = int(q)
				if (p, q) in cache:
					pq_su = cache[(p,q)]
				else:
					pq_su = symmetrical_uncertainty(X[:,p], X[:,q])
					cache[(p,q)] = pq_su

				if pq_su >= q_su:
					slist = removeElement(slist, q_idx)
				q_su, q, q_idx = getNextElement(slist, q_idx)
				
		p_su, p, p_idx = getNextElement(slist, p_idx)
		if not p_idx:
			break
	
	sbest = slist[slist[:,2]>0, :2]
	return sbest


In [14]:
def fcbf_wrapper(inpath, thresh, delim=',', header=False, classAt=-1):
	"""
	Main function call to perform FCBF selection. Saves Symmetric Uncertainty (SU)
	values and 0-based indices of selected features to a CSV file at the same location
	as input file, with 'feature_' as prefix. e.g. 'feature_pima.csv' for 'pima.csv'.
	
	Parameters:
	-----------
	inpath : str
		Path containing training set.
	thresh : float
		A value in [0,1) used as threshold for selecting 'relevant' features. 
		A negative value suggest the use of minimum SU[i,c] value as threshold.
	delim : str
		Character to be used to delimit input file. defaults to ','
	header : bool
		Whether the input file contains a header line. default to False.
	classAt : int
		0-based index of the class vector in the file. A value of -1 (default) 
		suggest to use last column.
	"""
	if os.path.exists(inpath):
		try:
			print("Reading file. Please wait ...")
			if header:
				d = pd.read_csv(inpath, delimiter=str(delim), header=0,engine='python')
			else:
				d = pd.read_csv(inpath, delimiter=str(delim),engine='python')
			print("Success! Dimensions: ",d.shape)
			p=d.shape[1]
		except Exception(e):
			print("Input file loading failed. Please check the file.")
			print("Error:", e)
			raise e
			exit()
		
		if classAt == -1:
			X = d.iloc[:, :d.shape[1]].values
			y = d.iloc[:,-1].values
		else:
			idx = np.arange(d.shape[1])
			X = d.iloc[:, idx[idx != classAt]].values
			y = d.iloc[:, classAt].values	

		try:
			print("Performing FCBF selection. Please wait ...")
			print('X: {}, y: {}'.format(X.shape, len(y)))
			sbest = fcbf(X, y, thresh)
			print("Done!")
			print("\n#Features selected: {0}".format(len(sbest)))
			print("Selected feature indices:\n{0}".format(sbest))
			try:
				outpath = os.path.split(inpath)[0] \
							+ '/features_' + os.path.split(inpath)[1]
				np.savetxt(outpath, sbest, fmt="%0.8f,%d", newline="\n", \
				 			header='SU, 0-based Feature')
				print("\nFile saved successfully. Path: {0}".format(outpath))
			except Exception(e):
				print("Error encountered while saving file:", e)
		except Exception(e):
			print("Error:", e)			
	else:
		print("The file you specified does not exist.")
	

In [None]:
def fcbf_wrapperr(inpath, thresh, delim=',', header=False, classAt=-1):
	"""
	Main function call to perform FCBF selection. Saves Symmetric Uncertainty (SU)
	values and 0-based indices of selected features to a CSV file at the same location
	as input file, with 'feature_' as prefix. e.g. 'feature_pima.csv' for 'pima.csv'.
	
	Parameters:
	-----------
	inpath : str
		Path containing training set.
	thresh : float
		A value in [0,1) used as threshold for selecting 'relevant' features. 
		A negative value suggest the use of minimum SU[i,c] value as threshold.
	delim : str
		Character to be used to delimit input file. defaults to ','
	header : bool
		Whether the input file contains a header line. default to False.
	classAt : int
		0-based index of the class vector in the file. A value of -1 (default) 
		suggest to use last column.
	"""
	if os.path.exists(inpath):
		try:
			print("Reading file. Please wait ...")
			if header:
				d = pd.read_csv(inpath, delimiter=str(delim), header=0,engine='python')
			else:
				d = pd.read_csv(inpath, delimiter=str(delim),engine='python')
			print("Success! Dimensions: ",d.shape)
			#p=d.shape[1]
		except Exception(e):
			print("Input file loading failed. Please check the file.")
			print("Error:", e)
			raise e
			exit()
		
		if classAt == -1:
			X = d.iloc[:, :d.shape[1]].values
			y = d.iloc[:,-1].values
		else:
			idx = np.arange(d.shape[1])
			X = d.iloc[:, idx[idx != classAt]].values
			y = d.iloc[:, classAt].values
		print(X.shape);print(y.shape)
   

In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df=pd.read_csv("/content/bpd.csv",delimiter=',',engine='python')
print(df.shape)
print(df.shape[0])
print(df.shape[1])

(54675, 90)
54675
90


In [36]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

df=pd.read_csv("/content/bpdFSclasss.csv",delimiter=',',engine='python')
print(df.shape)
X=df.T
print(X.shape)
X.head()
X.to_csv("test.csv", sep=',' ,header=None,index=False)



(54676, 89)
(89, 54676)


In [15]:
d=pd.read_csv("/content/test.csv",delimiter=',',engine='python')
print(d.shape)
d.head()


(88, 54676)


Unnamed: 0,1007_s_at,1053_at,117_at,121_at,1255_g_at,1294_at,1316_at,1320_at,1405_i_at,1431_at,...,AFFX-r2-Ec-bioD-5_at,AFFX-r2-P1-cre-3_at,AFFX-r2-P1-cre-5_at,AFFX-ThrX-3_at,AFFX-ThrX-5_at,AFFX-ThrX-M_at,AFFX-TrpnX-3_at,AFFX-TrpnX-5_at,AFFX-TrpnX-M_at,Class
0,6.781245,7.209577,10.224917,7.397761,2.800003,9.002311,4.469153,3.963464,11.682971,3.320923,...,11.507549,13.634878,13.354401,3.650186,3.61716,2.944756,2.864972,3.43205,3.32493,1.0
1,7.10554,7.04235,10.218934,7.826926,2.811085,8.908063,4.936953,3.884465,11.708093,3.385179,...,11.455276,13.642405,13.430102,3.721502,3.360808,2.993379,2.886092,3.43333,3.25563,1.0
2,7.176201,6.866277,9.967175,7.648635,2.88968,8.863025,4.714443,4.079035,11.551474,3.598951,...,11.526475,13.588778,13.374048,3.565228,3.435166,3.221675,2.845552,3.338108,3.221442,2.0
3,6.910155,7.107594,9.968917,7.515799,2.981619,8.886335,4.626546,3.902244,11.511711,3.536087,...,11.492295,13.666277,13.367172,3.664887,3.65663,3.092886,2.923808,3.425972,3.302892,2.0
4,7.229103,7.059184,9.251931,7.74245,2.853711,8.898094,4.598057,3.872721,12.303847,3.580876,...,11.555108,13.585962,13.367172,3.560082,3.49914,3.117756,2.92946,3.386154,3.339504,1.0


In [19]:
fcbf_wrapper('/content/test.csv', 0.03, ',',False,-1)

Reading file. Please wait ...
Success! Dimensions:  (88, 54676)
Performing FCBF selection. Please wait ...
X: (88, 54676), y: 88
Done!

#Features selected: 3
Selected feature indices:
[[1.00000000e+00 5.46750000e+04]
 [2.68715508e-01 2.57290000e+04]
 [2.65946869e-01 0.00000000e+00]]

File saved successfully. Path: /content/features_test.csv


In [44]:
fcbf_wrapper('/content/lungcancer.csv', 0.05)

Reading file. Please wait ...
Success! Dimensions:  (31, 57)
Performing FCBF selection. Please wait ...
X: (31, 57), y: 31
Done!

#Features selected: 4
Selected feature indices:
[[ 1.         56.        ]
 [ 0.18889273 55.        ]
 [ 0.0631983  10.        ]
 [ 0.05725075 53.        ]]

File saved successfully. Path: /content/features_lungcancer.csv


In [None]:
fcbf_wrapperr('/content/test.csv', 0.25, ',',False,-1)

Reading file. Please wait ...
Success! Dimensions:  (88, 54675)
(88, 54675)
(88,)
