**Initial Steps**

In [50]:
# Defining the file name from github
filename = 'https://raw.githubusercontent.com/PacktWorkshops/The-Data-Science-Workshop/master/Chapter14/Dataset/ad.data'

In [51]:
import pandas as pd
# Loading the data using pandas

adData = pd.read_csv(filename,sep=",",header = None,error_bad_lines=False)
adData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1549,1550,1551,1552,1553,1554,1555,1556,1557,1558
0,125,125,1.0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
1,57,468,8.2105,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
2,33,230,6.9696,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
3,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.
4,60,468,7.8,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,ad.


In [3]:
# Seperating the dependent and independent variables
# Preparing the X variables
X = adData.loc[:,0:1557]
print(X.shape)
# Preparing the Y variable
Y = adData[1558]
print(Y.shape)


(3279, 1558)
(3279,)


In [4]:
import numpy as np
# Replacing special characters in first 3 columns which are of type object
for i in range(0,3):
  X[i] = X[i].str.replace("?", 'NaN').values.astype(float)
# Replacing special characters in the remaining columns which are of type integer
for i in range(3,1557):
  X[i] = X[i].replace("?", 'NaN').values.astype(float)  
# Imputing the 'nan'  with mean of the values
for i in range(0,1557):
  X[i] = X[i].fillna(X[i].mean())

In [5]:
# Normalising the data sets
# Normalising data
from sklearn import preprocessing
# Creating the scaling function
minmaxScaler = preprocessing.MinMaxScaler()
X_tran = pd.DataFrame(minmaxScaler.fit_transform(X))
X_tran.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1548,1549,1550,1551,1552,1553,1554,1555,1556,1557
0,0.194053,0.194053,0.016642,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.087637,0.730829,0.13682,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.050078,0.358372,0.116138,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.092332,0.730829,0.129978,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.092332,0.730829,0.129978,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Creating a high dimension data set
X_hd = pd.DataFrame(pd.np.tile(X_tran, (1, 2)))

print(X_hd.shape)

(3279, 3116)


**Adding noise to the dataset**

In [7]:
# Defining the mean and standard deviation
mu, sigma = 0, 0.1 


In [8]:
# Generating samples from the distribution
noise = np.random.normal(mu, sigma, [3279,3116]) 
noise.shape

(3279, 3116)

In [9]:
# Creating a new data set by adding noise
X_new = X_hd + noise


In [10]:
# Splitting data set into train and test sets
from sklearn.model_selection import train_test_split
# Splitting the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_new, Y, test_size=0.3, random_state=123)

print('Training set shape',X_train.shape)

print('Test set shape',X_test.shape)

Training set shape (2295, 3116)
Test set shape (984, 3116)


**Backward Elimination Method**

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE

# Defining the Classification function
backModel = LogisticRegression()
# Reducing dimensionality to 300 features for backward elimination model
rfe = RFE(backModel, 300)


In [12]:
# Fitting the rfe for selecting the top 300 features
import time
t0 = time.time()
rfe = rfe.fit(X_train, y_train)
t1 = time.time()
print("Backward Elimination time:", round(t1-t0, 3), "s")

Backward Elimination time: 751.981 s


In [13]:
# Transforming both train and test sets

X_train_tran = rfe.transform(X_train)

X_test_tran = rfe.transform(X_test)

print("Training set shape",X_train_tran.shape)

print("Test set shape",X_test_tran.shape)

Training set shape (2295, 300)
Test set shape (984, 300)


In [14]:
# Fitting the logistic regression model 
import time
# Defining the LogisticRegression function
RfeModel = LogisticRegression()
# Starting a timing function
t0=time.time()
# Fitting the model
RfeModel.fit(X_train_tran, y_train)
# Finding the end time 

print("Total training time:", round(time.time()-t0, 3), "s")





Total training time: 0.048 s


In [15]:
# Predicting on the test set and getting the accuracy
pred = RfeModel.predict(X_test_tran)

print('Accuracy of Logistic regression model after backward elimination: {:.2f}'.format(RfeModel.score(X_test_tran, y_test)))



Accuracy of Logistic regression model after backward elimination: 0.97


In [16]:
# Printing the Confusion matrix
from sklearn.metrics import confusion_matrix
confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[ 95  31]
 [  3 855]]


In [17]:
from sklearn.metrics import classification_report
# Getting the Classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ad.       0.97      0.75      0.85       126
      nonad.       0.97      1.00      0.98       858

    accuracy                           0.97       984
   macro avg       0.97      0.88      0.91       984
weighted avg       0.97      0.97      0.96       984



**Forward Selection Method**

In [18]:
from sklearn.feature_selection import SelectKBest

# feature extraction
feats = SelectKBest(k=300)

In [19]:
 # Fitting the features for training set
import time
t0 = time.time()
fit = feats.fit(X_train, y_train)
t1 = time.time()
print("Forward selection fitting time:", round(t1-t0, 3), "s")

Forward selection fitting time: 0.165 s


In [20]:
# Creating new training set and test sets 

features_train = fit.transform(X_train)
features_test = fit.transform(X_test)

In [21]:
# Printing the shape of train and test sets before transformation
print('Train shape before transformation',X_train.shape)
print('Test shape before transformation',X_test.shape)

# Printing the shape of train and test sets after transformation
print('Train shape after transformation',features_train.shape)
print('Test shape after transformation',features_test.shape)

Train shape before transformation (2295, 3116)
Test shape before transformation (984, 3116)
Train shape after transformation (2295, 300)
Test shape after transformation (984, 300)


In [22]:
# Fitting a Logistic Regression Model
from sklearn.linear_model import LogisticRegression
import time

t0 = time.time()

forwardModel = LogisticRegression()
forwardModel.fit(features_train, y_train)

t1 = time.time()
print("Total training time:", round(t1-t0, 3), "s")

Total training time: 0.057 s


In [23]:
# Predicting with the forward model
pred = forwardModel.predict(features_test)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(forwardModel.score(features_test, y_test)))

Accuracy of Logistic regression model prediction on test set: 0.97


In [24]:
# Generating confusion matrix
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[ 95  31]
 [  3 855]]


In [25]:
from sklearn.metrics import classification_report
# Getting the Classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ad.       0.97      0.75      0.85       126
      nonad.       0.97      1.00      0.98       858

    accuracy                           0.97       984
   macro avg       0.97      0.88      0.91       984
weighted avg       0.97      0.97      0.96       984



**Principal Component Analysis**

In [26]:
from sklearn.decomposition import PCA
import time
t0 = time.time()
pca = PCA(n_components=300)
# Fitting the PCA on the training set
pca.fit(X_train)
t1 = time.time()
print("PCA fitting time:", round(t1-t0, 3), "s")

PCA fitting time: 1.723 s


In [27]:
# Transforming training set and test set
X_pca = pca.transform(X_train)
X_test_pca = pca.transform(X_test)

In [28]:
print("original shape of Training set:   ", X_train.shape)
print("original shape of Test set:   ", X_test.shape)
print("Transformed shape of training set:", X_pca.shape)
print("Transformed shape of test set:", X_test_pca.shape)

original shape of Training set:    (2295, 3116)
original shape of Test set:    (984, 3116)
Transformed shape of training set: (2295, 300)
Transformed shape of test set: (984, 300)


In [29]:
from sklearn.linear_model import LogisticRegression
import time

pcaModel = LogisticRegression()

t0 = time.time()
pcaModel.fit(X_pca, y_train)
t1 = time.time()

print("Total training time:", round(t1-t0, 3), "s")

Total training time: 0.046 s


In [30]:
# Predicting with the pca model
pred = pcaModel.predict(X_test_pca)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(pcaModel.score(X_test_pca, y_test)))

Accuracy of Logistic regression model prediction on test set: 0.97


In [31]:
# Generating confusion matrix
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[100  26]
 [  3 855]]


In [32]:
from sklearn.metrics import classification_report
# Getting the Classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ad.       0.97      0.79      0.87       126
      nonad.       0.97      1.00      0.98       858

    accuracy                           0.97       984
   macro avg       0.97      0.90      0.93       984
weighted avg       0.97      0.97      0.97       984



**Independent Component Analysis**

In [33]:
# Defining the ICA with number of components
from sklearn.decomposition import FastICA 
ICA = FastICA(n_components=300, random_state=123) 

In [34]:
# Fitting the ICA method and transforming the training set and noting the time
import time
t0 = time.time()
X_ica=ICA.fit_transform(X_train)
t1 = time.time()
print("ICA fitting time:", round(t1-t0, 3), "s")

ICA fitting time: 19.487 s


In [35]:
# Transfroming the test set 
X_test_ica=ICA.transform(X_test)

In [36]:
print("original shape of Training set:   ", X_train.shape)
print("original shape of Test set:   ", X_test.shape)
print("Transformed shape of training set:", X_ica.shape)
print("Transformed shape of test set:", X_test_ica.shape)

original shape of Training set:    (2295, 3116)
original shape of Test set:    (984, 3116)
Transformed shape of training set: (2295, 300)
Transformed shape of test set: (984, 300)


In [37]:
from sklearn.linear_model import LogisticRegression
import time

icaModel = LogisticRegression()

t0 = time.time()
icaModel.fit(X_ica, y_train)
t1 = time.time()

print("Total training time:", round(t1-t0, 3), "s")

Total training time: 0.043 s


In [38]:
# Predicting with the ica model
pred = icaModel.predict(X_test_ica)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(icaModel.score(X_test_ica, y_test)))

Accuracy of Logistic regression model prediction on test set: 0.87


In [39]:
# Generating confusion matrix
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[  0 126]
 [  0 858]]


In [52]:
from sklearn.metrics import classification_report
# Getting the Classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ad.       0.98      0.71      0.82       126
      nonad.       0.96      1.00      0.98       858

    accuracy                           0.96       984
   macro avg       0.97      0.85      0.90       984
weighted avg       0.96      0.96      0.96       984



**Factor Analysis**

In [41]:
# Defining the number of factors
from sklearn.decomposition import FactorAnalysis
fa = FactorAnalysis(n_components = 30,random_state=123)

In [42]:
# Fitting the Factor analysis method and transforming the training set
import time
t0 = time.time()
X_fac=fa.fit_transform(X_train)
t1 = time.time()
print("Factor analysis fitting time:", round(t1-t0, 3), "s")

Factor analysis fitting time: 1.485 s


In [43]:
# Transfroming the test set 
X_test_fac=fa.transform(X_test)

In [44]:
print("original shape of Training set:   ", X_train.shape)
print("original shape of Test set:   ", X_test.shape)
print("Transformed shape of training set:", X_fac.shape)
print("Transformed shape of test set:", X_test_fac.shape)

original shape of Training set:    (2295, 3116)
original shape of Test set:    (984, 3116)
Transformed shape of training set: (2295, 30)
Transformed shape of test set: (984, 30)


In [45]:
from sklearn.linear_model import LogisticRegression
import time

facModel = LogisticRegression()

t0 = time.time()
facModel.fit(X_fac, y_train)
t1 = time.time()

print("Total training time:", round(t1-t0, 3), "s")

Total training time: 0.025 s


In [46]:
# Predicting with the factor analysis model
pred = facModel.predict(X_test_fac)
print('Accuracy of Logistic regression model prediction on test set: {:.2f}'.format(facModel.score(X_test_fac, y_test)))

Accuracy of Logistic regression model prediction on test set: 0.96


In [47]:
# Generating confusion matrix
from sklearn.metrics import confusion_matrix

confusionMatrix = confusion_matrix(y_test, pred)
print(confusionMatrix)

[[ 89  37]
 [  2 856]]


In [48]:
from sklearn.metrics import classification_report
# Getting the Classification_report
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

         ad.       0.98      0.71      0.82       126
      nonad.       0.96      1.00      0.98       858

    accuracy                           0.96       984
   macro avg       0.97      0.85      0.90       984
weighted avg       0.96      0.96      0.96       984

