# Regression Feature Selection:
## (Numerical Input, Numerical Output)

In [None]:
# pearson's correlation feature selection for numeric input and numeric output
from sklearn.datasets import make_regression
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression

# generate a synthetic regression dataset: 100 samples, 100 features, 10 of
# them informative. No random_state is passed, so the data (and every result
# derived from it) differs from run to run.
X, y = make_regression(n_samples=100, n_features=100, n_informative=10)
# dump the raw feature matrix and the target vector for inspection
print(X)
print(y)

In [None]:
# define feature selection: keep the 10 features that score highest under
# f_regression
fs = SelectKBest(score_func=f_regression, k=10)

# show the configured selector (it has not been fitted yet)
print(fs)

In [None]:
# apply feature selection: fit the selector on (X, y) and keep only the
# selected columns of X
X_selected = fs.fit_transform(X, y)
# print the shape of the reduced matrix, then the matrix itself
print(X_selected.shape)
print(X_selected)

# Classification Feature Selection:
## (Numerical Input, Categorical Output)

In [None]:
# ANOVA feature selection for numeric input and categorical output
from sklearn.datasets import make_classification



# generate a synthetic classification dataset: 100 samples, 20 features, 2 of
# them informative. No random_state is passed, so the data differs from run
# to run.
X, y = make_classification(n_samples=100, n_features=20, n_informative=2)
# dump the raw feature matrix and the class labels for inspection
print(X)
print(y)


In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

# score every feature against the class labels with f_classif (ANOVA) and
# keep the 4 best
fs = SelectKBest(f_classif, k=4)

# fit the selector on (X, y), then reduce X to the selected columns
X_selected = fs.fit(X, y).transform(X)
print(X_selected.shape)
print(X_selected)

In [None]:
import pandas as pd

# Colab-only helper: mount Google Drive at /content/drive
# (presumably so the notebook can read data files stored there — confirm)
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# load and summarize the dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
 
# load the dataset
def load_dataset(filename):
    """Read a header-less CSV and split it into inputs and target.

    Every column except the last is treated as an input feature and is
    converted to strings; the last column is returned unchanged as the target.

    Args:
        filename: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays, where ``X`` holds the stringified
        input columns and ``y`` holds the last column.
    """
    # header=None: the first row of the file is data, not column names
    table = read_csv(filename, header=None).values
    features, target = table[:, :-1], table[:, -1]
    return features.astype(str), target
 
# load the dataset (path is relative to the notebook's working directory)
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets: 33% of the rows are held out for testing,
# with a fixed random_state so the split is reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# summarize the shapes of the resulting splits
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

In [None]:
# example of loading and preparing the breast cancer dataset
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder

from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from matplotlib import pyplot

# load the dataset
def load_dataset(filename):
    """Load a header-less CSV file as ``(X, y)`` numpy arrays.

    The last column is the target ``y`` and is returned as read; all
    preceding columns form ``X`` and are cast to ``str``.

    Args:
        filename: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(X, y)``.
    """
    # header=None: treat the first row as data rather than column names
    raw = read_csv(filename, header=None).values
    inputs = raw[:, :-1].astype(str)
    target = raw[:, -1]
    return inputs, target

# prepare input data
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the input features of both splits.

    The encoder is fitted on the training split only and then used to
    transform the training and the test split.

    Args:
        X_train: training inputs.
        X_test: test inputs.

    Returns:
        Tuple ``(X_train_enc, X_test_enc)`` with the encoded inputs.
    """
    encoder = OrdinalEncoder().fit(X_train)
    return encoder.transform(X_train), encoder.transform(X_test)

# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the target values of both splits.

    The encoder is fitted on the training targets only and then used to
    transform the training and the test targets.

    Args:
        y_train: training targets.
        y_test: test targets.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)`` with the encoded targets.
    """
    encoder = LabelEncoder().fit(y_train)
    return encoder.transform(y_train), encoder.transform(y_test)

# load the dataset (path is relative to the notebook's working directory)
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets: 33% held out, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data: encoder is fitted on the training inputs only
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data: encoder is fitted on the training targets only
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)




# feature selection
def select_features(X_train, y_train, X_test, k='all'):
    """Score features with the chi-squared statistic and keep the best ``k``.

    The selector is fitted on the training split only and then applied to
    both the training and the test inputs.

    Args:
        X_train: encoded training inputs.
        y_train: encoded training targets.
        X_test: encoded test inputs.
        k: number of top-scoring features to keep, or ``'all'`` (the default,
            which matches the previously hard-coded value) to keep every
            feature and only compute the scores.

    Returns:
        Tuple ``(X_train_fs, X_test_fs, fs)``: the transformed training
        inputs, the transformed test inputs and the fitted selector, whose
        ``scores_`` attribute holds the per-feature scores.
    """
    fs = SelectKBest(score_func=chi2, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test), fs

# feature selection
X_train_fs, X_test_fs, fs = select_features(X_train_enc, y_train_enc, X_test_enc)
# report the score of every feature, one line per feature index
for i, score in enumerate(fs.scores_):
    print('Feature %d: %f' % (i, score))
# plot the scores as a bar chart, one bar per feature index
pyplot.bar(range(len(fs.scores_)), fs.scores_)
pyplot.show()

# Mutual information

In [None]:
# evaluation of a model fit using mutual information input features
from pandas import read_csv
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

# load the dataset
def load_dataset(filename):
    """Read a header-less CSV and return its inputs and target column.

    All columns but the last become the input matrix ``X`` (cast to ``str``);
    the last column becomes the target ``y`` and is left as read.

    Args:
        filename: path or file-like object accepted by ``pandas.read_csv``.

    Returns:
        Tuple ``(X, y)`` of numpy arrays.
    """
    # header=None: the file carries no column-name row
    values = read_csv(filename, header=None).values
    *_, last = range(values.shape[1])
    return values[:, :last].astype(str), values[:, last]

# prepare input data
def prepare_inputs(X_train, X_test):
    """Ordinal-encode the training and test inputs.

    Fitting uses the training inputs only; the fitted encoder then
    transforms both splits.

    Args:
        X_train: training inputs.
        X_test: test inputs.

    Returns:
        Tuple ``(X_train_enc, X_test_enc)``.
    """
    ordinal = OrdinalEncoder().fit(X_train)
    encoded_train = ordinal.transform(X_train)
    encoded_test = ordinal.transform(X_test)
    return encoded_train, encoded_test

# prepare target
def prepare_targets(y_train, y_test):
    """Label-encode the training and test targets.

    Fitting uses the training targets only; the fitted encoder then
    transforms both splits.

    Args:
        y_train: training targets.
        y_test: test targets.

    Returns:
        Tuple ``(y_train_enc, y_test_enc)``.
    """
    labels = LabelEncoder().fit(y_train)
    encoded_train = labels.transform(y_train)
    encoded_test = labels.transform(y_test)
    return encoded_train, encoded_test

# feature selection
def select_features(X_train, y_train, X_test, k=4):
    """Keep the ``k`` features with the highest mutual-information score.

    The selector is fitted on the training split only and then applied to
    both the training and the test inputs.

    Args:
        X_train: encoded training inputs.
        y_train: encoded training targets.
        X_test: encoded test inputs.
        k: number of top-scoring features to keep; defaults to 4, the value
            that was previously hard-coded.

    Returns:
        Tuple ``(X_train_fs, X_test_fs)`` with the reduced training and test
        inputs.
    """
    fs = SelectKBest(score_func=mutual_info_classif, k=k)
    fs.fit(X_train, y_train)
    return fs.transform(X_train), fs.transform(X_test)

# load the dataset (path is relative to the notebook's working directory)
X, y = load_dataset('breast-cancer.csv')
# split into train and test sets: 33% held out, fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
# prepare input data
X_train_enc, X_test_enc = prepare_inputs(X_train, X_test)
# prepare output data
y_train_enc, y_test_enc = prepare_targets(y_train, y_test)
# feature selection: reduce both splits to the features chosen on the training data
X_train_fs, X_test_fs = select_features(X_train_enc, y_train_enc, X_test_enc)
# fit the model on the selected training features
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train_enc)
# evaluate the model: predict labels for the held-out split
yhat = model.predict(X_test_fs)
# evaluate predictions: fraction of test labels predicted correctly, shown as a percentage
accuracy = accuracy_score(y_test_enc, yhat)
print('Accuracy: %.2f' % (accuracy*100))  # run multiple times; the average should be around 76% (presumably the score varies because the selection step is randomized — confirm)

# Personal testing

In [None]:
# read the raw file; its first row is used as the column header
data_file = pd.read_csv('/content/breast-cancer.data')

data = pd.DataFrame(data_file)
# expected columns, class label first:
#   'Class', 'age', 'menopause', 'tumor-size', 'inv-nodes', 'node-caps',
#   'deg-malig', 'breast', 'breat-quad', 'irradiat'

# preview the first five rows
print(data.iloc[0:5])

# The class label is the FIRST column, so every remaining column is an input
# feature. Slicing with 1:-1 silently dropped the last feature ('irradiat').
X = data.iloc[:, 1:]
# keep y as a single-column DataFrame (0:1 rather than 0) so its shape stays (n, 1)
y = data.iloc[:, 0:1]
print(X.iloc[0:5])
print(y.iloc[:5])

                  Class    age menopause  ... breast breat-quad irradiat
0  no-recurrence-events  30-39   premeno  ...   left   left_low       no
1  no-recurrence-events  40-49   premeno  ...  right   right_up       no
2  no-recurrence-events  40-49   premeno  ...   left   left_low       no
3  no-recurrence-events  60-69      ge40  ...  right    left_up       no
4  no-recurrence-events  40-49   premeno  ...  right  right_low       no

[5 rows x 10 columns]
     age menopause tumor-size inv-nodes node-caps  deg-malig breast breat-quad
0  30-39   premeno      30-34       0-2        no          3   left   left_low
1  40-49   premeno      20-24       0-2        no          2  right   right_up
2  40-49   premeno      20-24       0-2        no          2   left   left_low
3  60-69      ge40      15-19       0-2        no          2  right    left_up
4  40-49   premeno        0-4       0-2        no          2  right  right_low
                  Class
0  no-recurrence-events
1  no-recurrence-

In [None]:
# format all fields as string: cast every column of the feature frame X to str
X = X.astype(str)
# show the converted frame
print(X)

In [None]:
from sklearn.model_selection import train_test_split

# hold out 33% of the rows for testing; random_state=1 keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)

# summarize the shapes of the resulting splits
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

Train (191, 8) (191, 1)
Test (95, 8) (95, 1)
