# Logistic Regression

## Importing the libraries

In [1]:
import pandas as pd
import seaborn as sns

## Importing the dataset

In [2]:
dataset = pd.read_csv('breast_cancer.csv')

FileNotFoundError: [Errno 2] No such file or directory: 'breast_cancer.csv'

In [None]:
dataset.head()

In [None]:
dataset.info()

In [None]:
dataset.groupby('Class').describe()

In [None]:
dataset.describe()

In [None]:
#sns.distplot(a=None, bins=None, hist=True, kde=True, x=None)

In [None]:
sns.heatmap(dataset.drop('Sample code number', axis= 1).corr(), annot = True, fmt = ".2f")

In [None]:
 sns.countplot(data=dataset, x='Clump Thickness', palette="pastel", hue= 'Class')

In [None]:
dataset.groupby('Class').describe()

In [None]:
X = dataset.iloc[:, 1:-1].values 
y = dataset.iloc[:, -1].values

In [None]:
from sklearn.feature_selection import f_classif
from sklearn.feature_selection import SelectKBest
fs = SelectKBest(score_func=f_classif, k=2)
# apply feature selection
X_selected = fs.fit_transform(X, y)
print(X_selected.shape)

## Select Features With RFE (Recursive Feature Elimination)

In [None]:
# report which features were selected by RFE
from sklearn.feature_selection import RFE
from sklearn.tree import DecisionTreeClassifier
# define RFE
rfe = RFE(estimator=DecisionTreeClassifier(), n_features_to_select=6)
# fit RFE
rfe.fit(X, y)
for i in range(X.shape[1]):
    print('Column: %d, Selected=%s, Rank: %d' % (i, rfe.support_[i], rfe.ranking_[i]))

In [None]:
dataset.nunique()  #Columns that have a single observation or value are probably useless for modeling shoeld be removed

In [None]:
print(" the number of rows and columns", dataset.shape)
#The rows of duplicated data are identified and removed from the DataFrame.
dataset.drop_duplicates(inplace=True)
print(dataset.shape)

## Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.33, random_state = 1)
# summarize
print('Train', X_train.shape, y_train.shape)
print('Test', X_test.shape, y_test.shape)

## Training the Logistic Regression model on the Training set

In [None]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(random_state = 0)
model.fit(X_train, y_train)

## Predicting the Test set results

In [None]:
y_pred = model.predict(X_test)

## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)
print(cm)

## Computing the accuracy with k-Fold Cross Validation

In [None]:
from sklearn.model_selection import cross_val_score
accuracies_train = cross_val_score(estimator = model, X = X_train, y = y_train, cv = 10)  # 10 is the number of fold -- K-fold
print("Accuracy_Train: {:.2f} %".format(accuracies_train.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies_train.std()*100))

In [None]:
accuracies_test = cross_val_score(estimator = model, X = X_test, y = y_test, cv = 10)
print("Accuracy_Test: {:.2f} %".format(accuracies_test.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies_test.std()*100))

### Model evaluation
- Model is evaluated using 10-fold cross-validation.
- The evaluation procedure is repeated three times.
- The random seed for the cross-validation split is the repeat number (1, 2, or 3).

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import PowerTransformer
from sklearn.model_selection import RepeatedStratifiedKFold
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
model = SVC(kernel='sigmoid', gamma='scale', C=0.1)
steps = [('i',SimpleImputer(strategy='median')), ('p',PowerTransformer()), ('m',model)]
pipeline = Pipeline(steps=steps)
m_scores = cross_val_score(pipeline, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
print('Accuracy: %.3f (%.3f)' % (np.mean(m_scores)*100, np.std(m_scores)*100))

### Model Built Using Chi-Squared Features:
We can use the chi-squared test to score the features and select the four most relevant features.

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.metrics import accuracy_score
# feature selection
def select_features(X_train, y_train, X_test):
	fs = SelectKBest(score_func=chi2, k=4) 
	fs.fit(X_train, y_train)
	X_train_fs = fs.transform(X_train)
	X_test_fs = fs.transform(X_test)
	return X_train_fs, X_test_fs
 
	fs.fit(X_train, y_train)
	X_train_fs = fs.transform(X_train)
	X_test_fs = fs.transform(X_test)
	return X_train_fs, X_test_fs
# feature selection
X_train_fs, X_test_fs = select_features(X_train, y_train, X_test)
# fit the model
model = LogisticRegression(solver='lbfgs')
model.fit(X_train_fs, y_train)
# evaluate the model
y_pred = model.predict(X_test_fs)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy: %.2f' % (accuracy*100))

### Model Built Using Mutual Information Features:
We can repeat the experiment and select the top four features using a mutual information statistic.

In [None]:
# evaluation of a model fit using mutual information input features
from sklearn.feature_selection import mutual_info_classif
# feature selection
def select_features(X_train, y_train, X_test):
	fs = SelectKBest(score_func=mutual_info_classif, k=4)
	fs.fit(X_train, y_train)
	X_train_mic = fs.transform(X_train)
	X_test_mic = fs.transform(X_test)
	return X_train_fs, X_test_fs
X_train_mic, X_test_mic = select_features(X_train, y_train, X_test)
# fit the model
model.fit(X_train_mic, y_train)
# evaluate the model
y_pred_mic = model.predict(X_test_mic)
# evaluate predictions
accuracy = accuracy_score(y_test, y_pred_mic)
print('Accuracy_mic: %.2f' % (accuracy*100))

### Evaluate the model using Grid Search Technique;

In [None]:
# get modeling pipelines to evaluate using Grid Search Technique,
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_selection import RFE
from matplotlib import pyplot

def get_pipelines(model):
	pipelines = list()
	# standardize
	p = Pipeline([('s',StandardScaler()), ('r', RFE(estimator=LogisticRegression(solver='liblinear'), n_features_to_select=10)), ('m',model)])
	pipelines.append(('std', p))
	# scale and power
	p = Pipeline([('s',MinMaxScaler((1,2))), ('p', PowerTransformer()), ('m',model)])
	pipelines.append(('power', p))
	# quantile
	p = Pipeline([('s',QuantileTransformer(n_quantiles=100, output_distribution='normal')), ('m',model)])
	pipelines.append(('quan', p))
	# discretize
	p = Pipeline([('s',KBinsDiscretizer(n_bins=10, encode='ordinal', strategy='uniform')), ('m',model)])
	pipelines.append(('kbins', p))
	# pca
	p = Pipeline([('s',PCA(n_components=7)), ('m',model)])
	pipelines.append(('pca', p))
	# svd
	p = Pipeline([('s',TruncatedSVD(n_components=7)), ('m',model)])
	pipelines.append(('svd', p))
	return pipelines

# evaluate a model
def evaluate_model(X, y, model):
	# define the cross-validation procedure
	cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
	# evaluate model
	scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1)
	return scores

# get the modeling pipelines
pipelines = get_pipelines(model)
# evaluate each pipeline
results, names = list(), list()
for name, pipeline in pipelines:
	# evaluate
	scores = evaluate_model(X, y, pipeline)
	# summarize
	print('>%s: %.3f  (%.3f)' % (name, np.mean(scores)*100, np.std(scores)*100))
	# store
	results.append(scores)
	names.append(name)
# plot the result
pyplot.boxplot(results, labels=names, showmeans=True)
pyplot.show()

In this case, we can see that the standardization with feature selection (RFE) offers an additional lift in accuracy from 95.58 percent to 96.682 percent, although the data scaling and power transform offers an additional lift in accuracy to 97.368 percent 