# Data Preprocessing and Feature Selection

Data preprocessing: Enables one to improve the quality of training sets.

Involves handling missing data, encoding categorical data including categorical classes, partitioning the data into training and test sets, scaling or standardizing the data, and selecting meaningful features.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [None]:
heart_df=pd.read_csv('python-data/heart_failure_clinical_records_dataset.csv')
heart_df.shape

In [None]:
heart_df.info()

In [None]:
heart_df['time'].sample(12)

In [None]:
heart_df.drop(['time'], inplace=True, axis=1)

In [None]:
heart_df.describe()

In [None]:
def scale_features(X): #similar to what MinMaxScaler does.
    """Rescale a 1-D numeric vector linearly onto the [0, 1] interval.

    parameters:
    X: 1-D pandas Series or numpy array of numbers.

    returns:
    Object of the same kind as ``X`` holding (X - min) / (max - min).
    A constant vector (max == min) is mapped to all zeros instead of
    producing 0/0 = NaN, which is how MinMaxScaler treats zero-range features.
    """
    lowest=X.min()          #vectorized reduction, computed once
    spread=X.max()-lowest
    if spread==0:
        spread=1            #constant feature: avoid division by zero
    return (X-lowest)/spread
    

In [None]:
age=scale_features(heart_df.age)

In [None]:
age[:5]

In [None]:
def standardize_features(X): #similar to what StandardScaler does.
    """Centre a 1-D numeric vector on zero mean and scale it to unit variance.

    The population standard deviation (ddof=0) is requested explicitly.
    Without it a pandas Series would use ddof=1 while a numpy array would use
    ddof=0, so the same data would give different results depending on its
    container, and neither StandardScaler nor a unit np.var would be matched.

    parameters:
    X: 1-D pandas Series or numpy array of numbers.

    returns:
    Object of the same kind as ``X`` holding (X - mean) / std.
    A constant vector (std == 0) is mapped to all zeros.
    """
    centred=X-X.mean()
    spread=X.std(ddof=0)
    if spread==0:
        spread=1            #constant feature: avoid division by zero
    return centred/spread
    

In [None]:
age=standardize_features(heart_df.age)
age[:5]

In [None]:
np.max(age), np.min(age),np.mean(age),np.var(age)

In [None]:
X=heart_df.iloc[:,:11]
y=heart_df.iloc[:,11]
X.shape, heart_df.shape, y.shape

In [None]:
y[:4]

In [None]:
type(X)

In [None]:
X_copy=heart_df.iloc[:,:12]
X_copy.shape

In [None]:
#use apply() to standardize the features.
X=X.apply(standardize_features)

In [None]:
X_copy.apply(scale_features)

# Select meaningful features

Reducing complexity of models through dimensionality reduction using feature selection
a) <b>Feature selection dimensionality reduction</b>: selects a subset of the original features.
b) <b>Feature extraction dimensionality reduction</b>: derives information from the original features to create new ones.

Consider sequential feature selection algorithms: reduce initial n-dimensional feature space to k-dimensional feature subspace, k<n.

The goal is to keep the features that are most relevant to the problem and remove irrelevant ones, thereby reducing noise.


##### a) Sequential Backward Selection(SBS)

Seeks to reduce dimensionality with minimum classifier performance degradation.
Sequentially removes features until the desired features are found. Uses a function to determine
the feature to remove.

In [None]:
#Implementation of SBS
from sklearn.base import clone
from itertools import combinations
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

class SBS():
    """Sequential Backward Selection (SBS) of features.

    Starting from all features, ``fit`` repeatedly drops the one feature whose
    removal gives the best validation score, until ``k_features`` remain.

    parameters: #set at the start
    estimator: object with fit/predict; a clone is stored (same parameters,
               no fitted data) so the caller's object is not fitted here.
    k_features: int (>=1)---number of features to keep.
    scoring: callable(y_true, y_pred)---score to maximise.
    test_size: forwarded to train_test_split for the internal validation split.
    random_state: forwarded to train_test_split.
    verbose: bool (default: True)---print the progress trace; pass False to silence it.

    attributes: #set by fit
    indices_: tuple with the column indices of the final subset.
    subsets_: list of index tuples, one per step (the full set first).
    scores_: list of validation scores, aligned with subsets_.
    k_score_: validation score of the final subset.
    """
    def __init__(self,estimator,k_features, scoring=accuracy_score, \
                 test_size=0.25, random_state=1, verbose=True):
        self.scoring=scoring
        self.estimator=clone(estimator) #creates a copy of the estimator with same 
                                        #parameters but not the fitted data
        self.k_features=k_features
        self.test_size=test_size
        self.random_state=random_state
        self.verbose=verbose
        
    def fit(self, X,y):
        """Run the backward elimination on (X, y) and return self.

        X and y are converted with np.asarray, so DataFrames/Series are accepted.
        Raises ValueError when k_features is smaller than 1.
        """
        if self.k_features<1:
            raise ValueError("k_features must be at least 1, got {}".format(self.k_features))
        #Positional column indexing below needs plain arrays.
        X=np.asarray(X)
        y=np.asarray(y)
        X_train, X_test, y_train, y_test=\
            train_test_split(X,y, test_size=self.test_size,random_state=self.random_state)
        dim=X_train.shape[1]
        self.indices_=tuple(range(dim))
        self._log("---indices",self.indices_)
        self._log("Data::",X_train.shape, X_test.shape, y_train.shape, y_test.shape)
        self._log("Data type::",type(X_train), type(y_train))
        self.subsets_=[self.indices_]
        self._log('---subsets_',self.subsets_)
        #Baseline: score with every feature present.
        score=self._calc_score(X_train, y_train,X_test, y_test, self.indices_)
        self.scores_=[score]
        self._log("---scores_:",self.scores_)
        
        while dim>self.k_features:
            scores=[]
            subsets=[]
            
            #Try every subset that leaves out exactly one of the current features.
            for p in combinations(self.indices_, r=dim-1):
                self._log("p:",p)
                self._log(self.indices_)
                score=self._calc_score(X_train, y_train,X_test, y_test,p)
                scores.append(score)
                subsets.append(p)
                
            #Keep the best-scoring subset (first one wins on ties).
            best=np.argmax(scores)
            self.indices_=subsets[best]
            self.subsets_.append(self.indices_)
            dim-=1
            
            self.scores_.append(scores[best])
        self.k_score_=self.scores_[-1]
        
        return self
    
    def transform(self, X):
        """Return only the selected columns of X (as a numpy array)."""
        return np.asarray(X)[:,list(self.indices_)]
    
    def _calc_score(self, X_train, y_train, X_test, y_test, indices):
        """Fit the estimator on the given columns and score it on the validation part."""
        columns=list(indices)
        self.estimator.fit(X_train[:,columns], y_train)
        y_pred=self.estimator.predict(X_test[:,columns])
        score=self.scoring(y_test,y_pred)
        self._log('---a score::',score)
        return score

    def _log(self,*args):
        """Print the trace message only when verbose is enabled."""
        if self.verbose:
            print(*args)
                

In [None]:
#Detach plain numpy copies of the features and the target.
X, y=np.array(X, copy=True), np.array(y, copy=True)

#70/30 split that preserves the class proportions of y in both parts.
split_parts=train_test_split(X, y, test_size=0.3, random_state=0, stratify=y)
X_train, X_test, y_train, y_test=split_parts

In [None]:
from sklearn.neighbors import KNeighborsClassifier
knn=KNeighborsClassifier(n_neighbors=5)

sbs=SBS(knn, k_features=1)
sbs.fit(X_train,y_train) #why pass X_train and y_train
#we want SBS to create new training subsets for testing (validation) and training.
#Approach is used to avoid  original test data from being used as training set.

In [None]:
#Accuracy recorded by SBS for every subset size it visited.
k_feat=list(map(len, sbs.subsets_))

plt.plot(k_feat, sbs.scores_,marker='+')
plt.xlabel('Number of features')
plt.ylabel('Accuracy')
plt.ylim([0.4,1.0])
plt.grid()
plt.show()

In [None]:
#Which smallest feature set yielded the good scores (k=3)
#print(sbs.subsets_)
k3=list(sbs.subsets_[8])
print("sub sets:",k3)
print(heart_df.columns[:-1][k3])

In [None]:
#Evaluate the performance of the model on the original test set---this uses all feature sets
knn.fit(X_train,y_train)
print("Training accuracy: ",knn.score(X_train,y_train))
print("Test accuracy: ",knn.score(X_test,y_test))

In [None]:
#Evaluate the performance of the model on the original test set---this uses only the three
#significant features
knn.fit(X_train[:,k3],y_train)
print("Training accuracy: ",knn.score(X_train[:,k3],y_train))
print("Test accuracy: ",knn.score(X_test[:,k3],y_test))

In [None]:
#Which smallest feature set yielded the good scores (k=4)
#print(sbs.subsets_)
k4=list(sbs.subsets_[7])
print("sub sets:",k4)
print(heart_df.columns[:-1][k4])

In [None]:
#Evaluate the performance of the model on the original test set---this uses only the four
#selected features (k4)
knn.fit(X_train[:,k4],y_train)
print("Training accuracy: ",knn.score(X_train[:,k4],y_train))
print("Test accuracy: ",knn.score(X_test[:,k4],y_test))

<b>Observation</b>: Clearly, the performance is better with the reduced feature set for our heart failure dataset.

# Assessing feature importance with random forests

In [None]:
from sklearn.ensemble import RandomForestClassifier

#Names of every column except the last one (the target).
feat_labels=heart_df.columns[:-1]
rf=RandomForestClassifier(n_estimators=500,random_state=1)
rf.fit(X_train,y_train)
importances=rf.feature_importances_
#Column positions ordered from the most to the least important feature.
indices=np.argsort(importances)[::-1]
for f, col in enumerate(indices):
    print("{:2d} {:25s} {:.3f}".format(f+1, feat_labels[col], importances[col]))

n_feats=X_train.shape[1]
positions=range(n_feats)
plt.title('Feature Importance')
plt.bar(positions, importances[indices], align='center')
plt.xticks(positions,feat_labels[indices], rotation=90)
plt.xlim([-1,n_feats])
plt.tight_layout()
plt.show()

<b>Observation</b>: ejection_fraction, and serum_creatinine are the top two predictors. 

# Select features with SelectFromModel

Useful when we need to use a Pipeline object. Based on a predefined threshold set by the user.

In [None]:
from sklearn.feature_selection import SelectFromModel
#Keep only the features whose random-forest importance reaches the threshold.
sfm=SelectFromModel(rf,threshold=0.15, prefit=True)
X_selected=sfm.transform(X_train)
X_test_selected=sfm.transform(X_test)
#shape[1] is the number of columns kept (features); shape[0] is just the sample count.
print("Number of features that meet this threshold criterion: ", X_selected.shape[1])

#Thresholding on importance keeps exactly the top-ranked features, so the first
#shape[1] entries of the importance-sorted index list describe them.
for f in range(X_selected.shape[1]):
    print("{:2d} {:25s} {:.3f}".format(f+1, feat_labels[indices[f]], importances[indices[f]]))

More on feature selection: https://scikit-learn.org/stable/modules/feature_selection.html 

# Debugging Algorithms

Uses <b>learning</b> and <b>validation</b> curves.

Diagnose <b>high variance (overfitting)</b> or <b>high bias (underfitting)</b> using <b>learning curves</b>.

Diagnose common issues with learning algorithms using <b>validation curves</b>.


# a) Learning Curves

Plots model training and validation accuracies as functions of the training set size.

Model with high bias (underfitting): Low training and cross-validation accuracy. Can be addressed by increasing the number of model parameters/features (either by more data collection or feature construction) or decreasing the degree of regularization.

Model with high variance (overfitting): Large gap between training and cross-validation accuracy. May be addressed by collecting more training data, reducing model complexity, or increasing the regularization parameter (for regularized models). For unregularized models, you may reduce the number of features through feature selection or feature extraction to decrease overfitting. Be careful if the training examples are very noisy: in that case you may also need to reduce the noise.

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import learning_curve, StratifiedKFold,cross_val_score

#Logistic regression scored on ten growing fractions (0.1 ... 1.0) of the training data.
lr_model=LogisticRegression(random_state=1)
#learning_curve uses stratified k-fold cross validation
train_sizes, train_scores, test_scores=learning_curve(
    estimator=lr_model, X=X_train, y=y_train,
    train_sizes=np.linspace(0.1,1.0,10), cv=10, n_jobs=1)

#Average over the folds; the std is used to indicate the variance of the estimate.
train_mean, train_std=train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std=test_scores.mean(axis=1), test_scores.std(axis=1)

#One line plus a +/- one-std band per curve: training first, then validation.
curve_specs=(
    (train_mean, train_std,
     dict(color='red', marker='+', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
)
for curve_mean, curve_std, line_style in curve_specs:
    plt.plot(train_sizes, curve_mean, **line_style)
    plt.fill_between(train_sizes, curve_mean+curve_std, curve_mean-curve_std,
                     alpha=0.15, color=line_style['color'])

plt.grid()
plt.title("Learning curves")
plt.xlabel('Number of training examples')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.5,1.0])
plt.show()

Question: How well does our model perform?

Good performance from around 100 samples. With fewer examples we have overfitting--the gap between training and validation accuracy increases.

# b) Validation curves

Used to improve model performance by addressing overfitting and underfitting.

Plots accuracy against variations in a model parameter.

Consider varying the parameter C of logistic regression, which is the inverse of the regularization strength (smaller C means stronger regularization).

In [None]:
from sklearn.model_selection import validation_curve

#Candidate values for C, spaced by factors of ten.
param_range=[0.001, 0.01, 0.1,1.0,10.0, 100.0]
lr_model2=LogisticRegression(random_state=1)

print("Params::",lr_model2.get_params().keys())
#Uses stratified k-fold cross-validation.
train_scores, test_scores=validation_curve(
    estimator=lr_model2, X=X_train, y=y_train,
    param_name='C', param_range=param_range, cv=10)

#Average over the folds; the std is used to indicate the variance of the estimate.
train_mean, train_std=train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std=test_scores.mean(axis=1), test_scores.std(axis=1)

#One line plus a +/- one-std band per curve: training first, then validation.
curve_specs=(
    (train_mean, train_std,
     dict(color='red', marker='+', markersize=5, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=5, label='validation accuracy')),
)
for curve_mean, curve_std, line_style in curve_specs:
    plt.plot(param_range, curve_mean, **line_style)
    plt.fill_between(param_range, curve_mean+curve_std, curve_mean-curve_std,
                     alpha=0.15, color=line_style['color'])

plt.grid()
plt.xscale('log')
plt.title("Validation curves---parameter:C")
plt.xlabel('C')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.ylim([0.65,0.9])
plt.show()

Observation: Towards C=100 there seems to be overfitting. The range from C=0.001 to C=0.01 seems to be the better choice. 

In [None]:
from sklearn.model_selection import validation_curve

#Solvers to compare; each one is cross-validated with otherwise default settings.
param_range=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
lr_model2=LogisticRegression(random_state=1)

print("Params::",lr_model2.get_params().keys())
#Uses stratified k-fold cross-validation.
train_scores, test_scores=validation_curve(
    estimator=lr_model2, X=X_train, y=y_train,
    param_name='solver', param_range=param_range, cv=10)

#Average over the folds; the std is used to indicate the variance of the estimate.
train_mean, train_std=train_scores.mean(axis=1), train_scores.std(axis=1)
test_mean, test_std=test_scores.mean(axis=1), test_scores.std(axis=1)

#One line plus a +/- one-std band per curve: training first, then validation.
curve_specs=(
    (train_mean, train_std,
     dict(color='red', marker='x', markersize=6, label='training accuracy')),
    (test_mean, test_std,
     dict(color='green', linestyle='--', marker='s', markersize=6, label='validation accuracy')),
)
for curve_mean, curve_std, line_style in curve_specs:
    plt.plot(param_range, curve_mean, **line_style)
    plt.fill_between(param_range, curve_mean+curve_std, curve_mean-curve_std,
                     alpha=0.15, color=line_style['color'])

plt.grid()
plt.xlabel('solver')
plt.ylabel('Accuracy')
plt.title("Validation curves:::for solver parameter")
tick_positions=range(len(param_range))
plt.xticks(tick_positions,param_range, rotation=90)
plt.legend(loc='upper right')
plt.show()

# Multilayer Perceptron: With or without feature selection.

#### Initial Model with All Features

In [None]:
from sklearn.neural_network import MLPClassifier

#Model with all the features
mlp = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes=(10, 4), \
                    solver='lbfgs', activation='relu', batch_size=32)
#solver{‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’
#activation{‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’

#Working with the original dataset after scaling
mlp.fit(X_train, y_train)
predictions=mlp.predict(X_test)
score=mlp.score(X_test, y_test)
score

#### Another Model with Selected Features

In [None]:
#Model with selected features
mlp = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes=(10,4), solver='lbfgs', activation='relu', batch_size=32)
#solver{‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’
#activation{‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’

#Working with selected features.
mlp.fit(X_selected, y_train)
predictions=mlp.predict(X_test_selected)
score=mlp.score(X_test_selected, y_test)
score

# Other Feature Selection

Remove features with low variance

In [None]:
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=(.65 * (1 - .65)))
X_train_var_thresh=sel.fit_transform(X_train)
X_test_var_thresh=sel.transform(X_test)
X_train_var_thresh.shape, X_test_var_thresh.shape

In [None]:
mlp = MLPClassifier(random_state=1, max_iter=300, hidden_layer_sizes=(10, 4), solver='lbfgs', activation='relu', batch_size=32)
#solver{‘lbfgs’, ‘sgd’, ‘adam’}, default=’adam’
#activation{‘identity’, ‘logistic’, ‘tanh’, ‘relu’}, default=’relu’

#Working with selected features.
mlp.fit(X_train_var_thresh, y_train)
predictions=mlp.predict(X_test_var_thresh)
score=mlp.score(X_test_var_thresh, y_test)
score

# Compare Algorithms in Batch Mode 
This aims to identify promising candidate algorithms that should be explored further.

In [None]:
#pip install lazypredict

In [None]:
#pip install pycaret

# Implementing Perceptron and ADALINE learning strategies

In [None]:
from matplotlib.colors import ListedColormap

The following examples are adapted (with minor adjustments) from:

Sebastian Raschka & Vahid Mirjalili (2017), Python Machine Learning, 2nd Edition- Machine Learning and Deep Learning with Python,scikit-learn, and TensorFlow. Packt.

Link: https://cmu.primo.exlibrisgroup.com/permalink/01CMU_INST/6lpsnm/alma991019579188304436

# Implementing Perceptron classifier---problem should be linearly separable

In [None]:
class Perceptron(object):
    '''Perceptron classifier for two classes labelled 1 and -1.
    
    parameters: #set at the start
    miu: float (ranges 0 to 1)---learning rate
    n_iter: int---number of epochs, i.e. how many times to pass through the dataset.
    random_state: int (random number generator seed for random weight initialization)
    
    attributes: #set at training
    w_: 1d-array (weights after fitting; w_[0] belongs to the bias unit)
    errors_: list (number of misclassifications (updates) in each epoch)
    '''
    
    def __init__(self, miu=0.01,n_iter=5, random_state=1):
        self.miu, self.n_iter, self.random_state=miu, n_iter, random_state
        
    def fit(self, X, y):
        '''
        Learn the weights from the training data, one sample at a time.
        
        parameters:
        X:{array_like}, shape={n_samples, n_features}--training vectors
        y:array-like, shape={n_samples}---target values (ground truth labels)
        
        returns:
        self:object
        '''
        #A seeded generator makes the small random start weights reproducible.
        seeded=np.random.RandomState(self.random_state)
        n_weights=X.shape[1]+1 #one weight per feature plus the bias unit
        self.w_=seeded.normal(loc=0.0, scale=0.01, size=n_weights)
        print("----weights----", self.w_)
        self.errors_=[]
        
        for _epoch in range(self.n_iter):
            n_updates=0
            for sample, label in zip(X,y):
                #The step is zero whenever the sample is already classified correctly.
                delta=self.miu*(label-self.predict(sample))
                self.w_[1:]+=delta*sample
                self.w_[0]+=delta #bias unit: its input is the constant 1
                if delta!=0.0:
                    n_updates+=1
            self.errors_.append(n_updates)
        return self
    
    def net_input(self,X):
        '''Return the weighted sum of the inputs plus the bias weight.'''
        feature_weights=self.w_[1:]
        bias=self.w_[0]
        return np.dot(X,feature_weights)+bias
    
    def predict(self, X):
        '''Return the class label (1 or -1) given by a unit step on the net input.'''
        z=self.net_input(X)
        return np.where(z>=0.0, 1, -1)

In [None]:
#Two class classification---setosa and versicolor.
df=pd.read_csv('python-data/iris_dataset.csv')
df.sample(6)

In [None]:
#Work with the first 100 rows only---the setosas and versicolors.
first_hundred=df.iloc[0:100]

#Class labels from column 4, encoded as -1 (setosa) and 1 (everything else here: versicolor).
y=np.where(first_hundred.iloc[:,4].values=='Iris-setosa',-1,1)

#Features: the first two columns.
X=first_hundred.iloc[:,0:2].values

#Exploratory scatter: rows 0-49 in red, rows 50-99 in blue.
setosa_rows, versicolor_rows=X[:50], X[50:100]
plt.scatter(setosa_rows[:,0], setosa_rows[:,1], color='red', marker='o', label='setosa')
plt.scatter(versicolor_rows[:,0], versicolor_rows[:,1], color='blue', marker='+', label='versicolor')
plt.xlabel('sepal length(cm)')
plt.ylabel('sepal width(cm)')
plt.legend(loc='upper right')
plt.show()

The plot above shows that the two classes are linearly separable using these two columns. Therefore, we can use a linear classifier such as the perceptron.

In [None]:
X.shape

In [None]:
df.describe()

In [None]:
#Train the perceptron, then plot how many weight updates each epoch needed.
ppn=Perceptron(miu=0.1, n_iter=150)
ppn.fit(X,y)
epoch_numbers=range(1,len(ppn.errors_)+1)
plt.plot(epoch_numbers,ppn.errors_, marker='+')
plt.ylabel('Number of updates')
plt.xlabel('Epochs')
plt.show()

In [None]:
#Visualize the decision boundary
def plot_decision_regions(X,y,classifier, step=0.01):
    '''Draw the classifier's decision regions over the first two columns of X.

    parameters:
    X: 2d-array; only columns 0 and 1 are used.
    y: 1d-array of class labels (at most five distinct values, one colour/marker each).
    classifier: fitted object whose predict() accepts an (n, 2) array.
    step: float---grid resolution used to evaluate the classifier.

    Draws on the current matplotlib figure; the caller adds labels and calls plt.show().
    '''
    #set up marker generator and color map
    #Filled markers only, so that edgecolor='black' below applies to every class.
    markers=['s','o','^','v','D']
    colors=['red','blue', 'lightgreen','gray','cyan']
    classes=np.unique(y)
    cmap=ListedColormap(colors[:len(classes)])
    
    #plot the decision surface on a grid padded by one unit on each side
    x1_min,x1_max=X[:,0].min()-1,X[:,0].max()+1
    x2_min,x2_max=X[:,1].min()-1,X[:,1].max()+1
    
    xx1,xx2=np.meshgrid(np.arange(x1_min,x1_max,step),np.arange(x2_min,x2_max,step))
    Z=classifier.predict(np.array([xx1.ravel(),xx2.ravel()]).T) #ravel flattens the array
    Z=Z.reshape(xx1.shape)
    
    plt.contourf(xx1,xx2,Z,alpha=0.1, cmap=cmap) #alpha: between 0(transparent) and 1(opaque)--change it and see the variations
    plt.xlim(xx1.min(),xx1.max())
    plt.ylim(xx2.min(),xx2.max())
    
    #Plot class samples, one colour AND one marker per class
    for idx,cl in enumerate(classes): #idx: index, cl: class (setosa, versicolor, etc)
        plt.scatter(x=X[y==cl,0], y=X[y==cl,1], \
                    alpha=0.8, c=colors[idx], marker=markers[idx], \
                    label=cl, edgecolor='black')

In [None]:
plot_decision_regions(X,y,classifier=ppn)
plt.xlabel('sepal length(cm)')
plt.ylabel('sepal width(cm)')
plt.legend(loc='upper left')
plt.show()

# Implementing ADAptive LInear NEuron(ADALINE) using Batch Gradient Descent

In [None]:
class AdalineGD(object):
    '''ADAptive LInear NEuron classifier trained with batch gradient descent.
    
    parameters: #set at the start
    miu: float (ranges 0 to 1)---learning rate
    
    n_iter: int---number of epochs, i.e. how many times to pass through the dataset.
    
    random_state: int (random number generator seed for random weight initialization)
    
    attributes: #set at training
    w_: 1d-array (weights after fitting; w_[0] belongs to the bias unit)
    
    cost_: list (sum of squares cost function value in each epoch)
    '''
    
    def __init__(self, miu=0.01,n_iter=5, random_state=1):
        self.miu, self.n_iter, self.random_state=miu, n_iter, random_state
        
    def fit(self, X, y):
        '''
        Learn the weights from the whole training set at once.
        
        parameters:
        X:{array_like}, shape={n_samples, n_features}--training vectors
        y:array-like, shape={n_samples}---target values (ground truth values)
        
        returns:
        self:object
        '''
        #A seeded generator makes the small random start weights reproducible.
        seeded=np.random.RandomState(self.random_state)
        self.w_=seeded.normal(loc=0.0, scale=0.01, size=1+X.shape[1])
        self.cost_=[]
        
        for _epoch in range(self.n_iter):
            #Residuals of the linear output over all samples.
            residuals=y-self.activation(self.net_input(X))
            #One gradient step using every sample.
            self.w_[1:]+=self.miu*X.T.dot(residuals)
            self.w_[0]+=self.miu*residuals.sum() #bias unit
            self.cost_.append((residuals**2).sum()/2.0)
        return self
    
    def net_input(self,X):
        '''Return the weighted sum of the inputs plus the bias weight.'''
        feature_weights=self.w_[1:]
        bias=self.w_[0]
        return np.dot(X,feature_weights)+bias
    
    def activation(self,X):
        '''Linear (identity) activation: the input is returned unchanged.'''
        return X
    
    def predict(self, X):
        '''Return the class label (1 or -1) given by a unit step on the activation.'''
        activated=self.activation(self.net_input(X))
        return np.where(activated>=0.0, 1, -1)

In [None]:
#Cost per epoch for two learning rates, side by side (raw features).
fig,ax=plt.subplots(nrows=1,ncols=2, figsize=(10,4))

ada1=AdalineGD(n_iter=15, miu=0.01).fit(X,y)
ada2=AdalineGD(n_iter=15, miu=0.0001).fit(X,y)

panel_titles=('(a) Adaline-Learning rate=0.01', '(b) Adaline-Learning rate=0.0001')
for panel, model, heading in zip(ax, (ada1, ada2), panel_titles):
    epoch_numbers=range(1,len(model.cost_)+1) #one cost value per epoch
    panel.plot(epoch_numbers,np.log10(model.cost_), marker='x')
    panel.set_xlabel('Epochs')
    panel.set_ylabel('log(Sum-squared-error)')
    panel.set_title(heading)

plt.show()

#In a) the errors are increasing because the learning rate is so large that it potentially overshoots the global minimum.
#In b), the learning rate is so small that it may require many epochs to converge to the global cost minimum.

#What happens if we scale the data? Let's use standardization x_new=(x_old-mean(x_values))/sdv(x_values)

In [None]:
X_std=np.copy(X) #work on a copy so X keeps the raw values
for col in range(2):
    #this is what StandardScaler does: subtract the column mean, divide by the column std
    column=X[:,col]
    X_std[:,col]=(column-column.mean())/column.std()

In [None]:
X_std.shape

In [None]:
X_std[:4,:]

In [None]:
#Cost per epoch for two learning rates, side by side (standardized features).
fig,ax=plt.subplots(nrows=1,ncols=2, figsize=(10,4))

ada1=AdalineGD(n_iter=15, miu=0.01).fit(X_std,y)
ada2=AdalineGD(n_iter=15, miu=0.0001).fit(X_std,y)

panel_titles=('(a) Adaline-Learning rate=0.01', '(b) Adaline-Learning rate=0.0001')
for panel, model, heading in zip(ax, (ada1, ada2), panel_titles):
    epoch_numbers=range(1,len(model.cost_)+1) #one cost value per epoch
    panel.plot(epoch_numbers,np.log10(model.cost_), marker='o')
    panel.set_xlabel('Epochs')
    panel.set_ylabel('log(Sum-squared-error)')
    panel.set_title(heading)

plt.show()

#See the behavior of a) and b). Also notice that standardized values have helped with the early convergence with 0.01 learning rate.

In [None]:
#Now let's plot the decision boundary
ada=AdalineGD(n_iter=15, miu=0.01).fit(X_std,y)

plot_decision_regions(X_std,y,classifier=ada)
plt.xlabel('sepal length(cm)')
plt.ylabel('sepal width(cm)')
plt.legend(loc='upper left')
plt.tight_layout()
plt.show()

In [None]:
#Plot the errors per epoch
plt.plot(range(1,len(ada.cost_)+1), ada.cost_,marker='s')
plt.xlabel('Epochs')
plt.ylabel('Sum-squared-error')
plt.title('Adaline-Learning rate=0.01, standardized data')
plt.show()

# Implementing ADAptive LInear NEuron(ADALINE) using Stochastic Gradient Descent

In [None]:
class AdalineSGD(object):
    '''ADAptive LInear NEuron classifier
    
    This solution uses stochastic gradient descent.
    
    parameters: #set at the start
    miu: float (ranges 0 to 1)---learning rate
    
    n_iter: int---number of epochs, i.e. how many times to pass through the dataset.
    
    random_state: int (random number generator seed for random weight initialization)
    
    shuffle: bool (default:True) (shuffles the training data every epoch if True) to prevent cycles.
    
    verbose: bool (default:True) (prints sample, output and error on every weight update;
             pass False to silence the trace).
    
    attributes: #set at training
    w_: 1d-array (weights after fitting; w_[0] belongs to the bias unit)
    
    cost_: list (sum of squares cost function value averaged over all training samples in each epoch)
    '''
    
    def __init__(self, miu=0.01,n_iter=5, shuffle=True, random_state=1, verbose=True):
        self.miu=miu
        self.n_iter=n_iter
        self.random_state=random_state
        self.shuffle=shuffle
        self.verbose=verbose
        self.w_initialized=False
        
    def fit(self, X, y):
        '''
        Learn from the training data (weights are re-initialized first).
        
        parameters:
        X:{array_like}, shape={n_samples, n_features}--training vectors
        y:array-like, shape={n_samples}---target values---correct/ground truth values
        
        returns:
        self:object
        '''
        #Shuffling indexes with a permutation array, which needs numpy arrays.
        X=np.asarray(X)
        y=np.asarray(y)
        self._initialize_weights(X.shape[1])
        self.cost_=[]
        
        for _ in  range(self.n_iter):
            if self.shuffle:
                X,y=self._shuffle(X,y)
            #One weight update per sample; collect the per-sample costs.
            cost=[self._update_weights(xi,target) for xi, target in zip(X,y)]
            avg_cost=sum(cost)/len(y)
            self.cost_.append(avg_cost)
        return self
    
    def partial_fit(self, X, y):
        '''
        Can be used in online learning scenarios with streaming data.
        
        Fits data without re-initializing the weights (they are initialized
        only if no fit/partial_fit has happened yet).
        
        parameters:
        X:{array_like}, shape={n_samples, n_features}, or a single sample of
          shape={n_features}--training vectors
        y:array-like, shape={n_samples}, or a single scalar---target values
        
        returns:
        self:object
        '''
        #Normalise the input so that a lone 1-D sample, a single-row 2-D array
        #and a plain Python scalar target all follow the same per-sample path.
        X=np.atleast_2d(X)
        y=np.ravel(y)
        if not self.w_initialized:
            self._initialize_weights(X.shape[1])
        for xi,target in zip(X,y):
            self._update_weights(xi,target)
        
        return self
    
    def _shuffle(self,X,y):
        '''
        Shuffles the training data (same permutation for X and y)
        '''
        r=self.rgen.permutation(len(y))
        return X[r],y[r]
        
        
    def _initialize_weights(self,m):
        '''
        Initializes the 1+m weights to small random numbers
        '''
        self.rgen=np.random.RandomState(self.random_state) #seeding allows producing previous results if needed.
        self.w_=self.rgen.normal(loc=0.0, scale=0.01, size=1+m) #we just want small random values.
                    #weights from normal distribution with stdev=0.01
        self.w_initialized=True
    
    def _update_weights(self,xi,target):
        '''
        Apply Adaline learning rule to update the weights for one sample.
        
        returns: the squared-error cost of this sample (before the update).
        '''
        output=self.activation(self.net_input(xi))
        error=(target-output) 
        if self.verbose:
            print("----xi: ",xi, "----output: ",output,"----error: ",error)
        self.w_[1:]+=self.miu*xi.dot(error)
        self.w_[0]+=self.miu*error #self.w_[0] is weight of bias unit.
        cost=(error**2)/2.0
        return cost
       
    def net_input(self,X): #result of wTx---the summation
        '''Calculate net input'''
        return np.dot(X,self.w_[1:])+self.w_[0]
    
    def activation(self,X): #uses activation function
        '''Calculate linear activation (identity)'''
        return X
    
    def predict(self, X):
        '''Return class label after unit step'''
        return np.where(self.activation(self.net_input(X))>=0.0, 1, -1)

In [None]:
#Train the stochastic-gradient-descent Adaline on the standardized data.
adaSGD=AdalineSGD(n_iter=15, miu=0.01, random_state=1)
adaSGD.fit(X_std,y)

#plot the decision boundary
plot_decision_regions(X_std,y,classifier=adaSGD)
plt.xlabel('sepal length(cm)')
plt.ylabel('sepal width(cm)')
plt.legend(loc='upper left')
plt.title('Adaline SGD--standardized data')
plt.show()

#Plot the average cost per epoch of the SGD model itself (adaSGD, not the batch model ada).
plt.plot(range(1,len(adaSGD.cost_)+1), adaSGD.cost_,marker='+')
plt.xlabel('Epochs')
plt.ylabel('average cost')
plt.title('Adaline SGD: Learning rate=0.01, standardized data')
plt.show()


In [None]:
adaSGD.partial_fit(X_std[0,:],y[0]) #Online learning demo