# Using Ridge Regression for Classification on Breast Cancer and Wine Datasets

I will be using Ridge Regression on two classification problems: the Breast Cancer and Wine datasets. First I will solve them "by hand" (without using the Ridge Regression implementation of any library). Later, I will solve the same problems using the scikit-learn library and compare the results of my "by hand" solution with the results obtained from scikit-learn.

# By Hand

### Defining functions to be used

In [1]:
from sklearn.datasets import load_breast_cancer # taking included data set from Sklearn http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
from sklearn import preprocessing # preprocessing is what we do with the data before we run the learning algorithm
from sklearn.model_selection import train_test_split 
import numpy as np

def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)) of z.

    Parameters
    ----------
    z : scalar or array-like
        A Python/NumPy number, or an array of numbers.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        A scalar for scalar input. A 1-D input of length n comes back as a
        column of shape (n, 1); inputs that already have two or more
        dimensions keep their shape.
    """
    z = np.asarray(z, dtype=float)
    result = 1.0 / (1.0 + np.exp(-z))

    if z.ndim == 0:
        # Any scalar (int, float or NumPy scalar) yields a scalar, not only int.
        return np.float64(result)
    if z.ndim == 1:
        # Callers index the result as a column vector.
        return result.reshape(-1, 1)
    return result


# Hypothesis function
def hypothesis(X , w):
    """Return the predicted probabilities sigmoid(X . w).

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix with one row per example and one column per weight.
    w : numpy.ndarray
        Weights; the callers in this file pass a column of shape
        (X.shape[1], 1).

    Returns
    -------
    numpy.ndarray
        One probability per row of X, as a column of shape (X.shape[0], 1)
        when w is a single column.
    """
    # A single matrix product replaces the per-row, per-feature Python loops.
    return sigmoid(np.dot(X, w))
    
# Log likelihood function now with lambda hyperparameter
def log_likelihood(X , y , w, lamb):
    """Return the L2-penalised Bernoulli log-likelihood of the weights w.

    Computes sum_i [y_i*log(h_i) + (1 - y_i)*log(1 - h_i)] - lamb * w^T w,
    where h = hypothesis(X, w).

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix, one row per example.
    y : array-like
        Binary targets (0 or 1), one per row of X.
    w : numpy.ndarray
        Weight column of shape (X.shape[1], 1).
    lamb : float
        Strength of the squared-norm penalty. The penalty covers every
        entry of w.

    Returns
    -------
    numpy.ndarray
        Array of shape (1, 1) holding the penalised log-likelihood.
    """
    h = hypothesis(X, w)

    # Keep probabilities strictly inside (0, 1): a saturated prediction would
    # otherwise produce log(0) = -inf and 0 * -inf = NaN in the sum below.
    eps = np.finfo(float).eps
    h = np.clip(h, eps, 1 - eps)

    targets = np.asarray(y).reshape(-1, 1)
    data_term = np.sum(targets * np.log(h) + (1 - targets) * np.log(1 - h))

    reg = lamb * np.dot(np.transpose(w), w)

    return data_term - reg

# Gradient ascent function (maximises the log likelihood)
def Logistic_Regresion_Gradient_Ascent(X, y, learning_rate, num_iters, lamb):
    """Fit logistic-regression weights by gradient ascent with an L2 penalty.

    Maximises the objective reported by log_likelihood, i.e.
    sum_i [y_i*log(h_i) + (1 - y_i)*log(1 - h_i)] - lamb * w^T w.
    Each step moves w by learning_rate / N times the gradient of that
    objective: X^T (y - h) - 2 * lamb * w.  With lamb = 0 this is the plain,
    unpenalised update.

    Parameters
    ----------
    X : numpy.ndarray
        Design matrix, one row per training example.
    y : array-like
        Binary targets (0 or 1), one per row of X.
    learning_rate : float
        Step size applied to the gradient averaged over the N examples.
    num_iters : int
        Number of ascent steps to run.
    lamb : float
        Strength of the squared-norm penalty on w.

    Returns
    -------
    w : numpy.ndarray
        Fitted weights, shape (X.shape[1], 1).
    log_likelihood_values : list
        Penalised log-likelihood of the updated w, recorded at every
        iteration whose index is a multiple of 100.
    """
    log_likelihood_values = []
    # Start from the zero vector, one weight per column of X.
    w = np.zeros((X.shape[1], 1))
    N = X.shape[0]
    targets = np.asarray(y).reshape(-1, 1)

    for i in range(num_iters):
        h = hypothesis(X, w)

        # The penalty term was previously missing from the update, so lamb
        # had no influence on the fitted weights.
        gradient = np.dot(np.transpose(X), targets - h) - 2 * lamb * w
        w = w + learning_rate * gradient / N

        if i % 100 == 0:
            log_likelihood_values.append(log_likelihood(X, y, w, lamb))

    return w, log_likelihood_values

### Breast Cancer Dataset

In [2]:
# Fetch the breast-cancer features and binary targets
cancer = load_breast_cancer()
X, y = cancer.data, cancer.target

# Hold out a test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardise both splits with the statistics fitted on the training split
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Put a column of ones in front of each split so the first weight acts as the intercept
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

# Hyperparameters
learning_rate, num_iters, lamb = 0.5, 5000, 10

# Train; the weights start from zero inside the training routine
w, log_likelihood_values = Logistic_Regresion_Gradient_Ascent(
    X_train, y_train, learning_rate, num_iters, lamb
)

# Predicted probabilities on the held-out data
h = hypothesis(X_test, w)

# Threshold at 0.5 to obtain the class labels
predicted = np.where(h >= 0.50, 1.0, 0.0)

# Count the test examples whose predicted label matches the true label
correct = int(np.count_nonzero(predicted[:, 0] == y_test))

# Reporting accuracy
accuracy = correct / h.shape[0]

print("Accuracy:", accuracy*100)

Accuracy: 97.2027972027972


### Wine Dataset

In [3]:
from numpy import genfromtxt
# Read the wine data matrix (first column is the class label)
data = genfromtxt('datasets/wine.csv', delimiter=',')

# Keep only the rows whose class label is below 3, then relabel: <2 -> 0, otherwise 1
data = data[data[:, 0] < 3]
X = data[:, 1:]
y = np.where(data[:, 0] < 2, 0, 1)

# Hold out a test split (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Standardise both splits with the statistics fitted on the training split
scaler = preprocessing.StandardScaler().fit(X_train)
X_train, X_test = scaler.transform(X_train), scaler.transform(X_test)

# Put a column of ones in front of each split so the first weight acts as the intercept
X_train = np.hstack((np.ones((X_train.shape[0], 1)), X_train))
X_test = np.hstack((np.ones((X_test.shape[0], 1)), X_test))

# Hyperparameters
learning_rate, num_iters, lamb = 0.5, 5000, 10

# Train; the weights start from zero inside the training routine
w, log_likelihood_values = Logistic_Regresion_Gradient_Ascent(
    X_train, y_train, learning_rate, num_iters, lamb
)

# Predicted probabilities on the held-out data
h = hypothesis(X_test, w)

# Threshold at 0.5 to obtain the class labels
predicted = np.where(h >= 0.50, 1.0, 0.0)

# Count the test examples whose predicted label matches the true label
correct = int(np.count_nonzero(predicted[:, 0] == y_test))

# Reporting accuracy
accuracy = correct / h.shape[0]

print("Accuracy:", accuracy*100)

Accuracy: 96.96969696969697


# Scikit Implementation

###  Breast Cancer Dataset

In [4]:
# Importing important libraries
from sklearn.datasets import load_breast_cancer # taking included data set from Sklearn http://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html
from sklearn import preprocessing
from sklearn.model_selection import train_test_split 
import numpy as np
from sklearn.linear_model import SGDClassifier

# Loading the dataset
cancer = load_breast_cancer()
y = cancer.target
X = cancer.data

# train_test_split to split the data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

# Scaling data with the statistics fitted on the training split
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic loss fitted by SGD; alpha is the regularisation strength.
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
# 1.3 removed the old spelling, so try the current name first and fall back
# to the old one on releases that reject it.
sgd_options = dict(alpha=0.5, max_iter=5000, shuffle=False)
try:
    ridge_reg = SGDClassifier(loss='log_loss', **sgd_options).fit(X_train, y_train)
except ValueError:
    ridge_reg = SGDClassifier(loss='log', **sgd_options).fit(X_train, y_train)

# Mean accuracy on the held-out test data
score = ridge_reg.score(X_test, y_test)
print("Accuracy:", score*100)

Accuracy: 97.2027972027972


### Wine Dataset

In [5]:
# Importing data
from numpy import genfromtxt
data = genfromtxt('datasets/wine.csv', delimiter=',') # reading in the data matrix

# data preprocessing (Only using 2 classes of wine)
# Rows whose first column (the class label) is below 3 are kept, then the
# label is recoded: <2 -> 0, otherwise 1.
n =  data.shape[0]
feat = data.shape[1]
data = data[data[:,0]<3]
X = data[0:n,1:feat]
y = data[0:n,0]
y = np.where(y<2,0,1)

# Splitting data (fixed seed so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(X, y,random_state=42)

# Scaling Data with the statistics fitted on the training split
scaler = preprocessing.StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Logistic loss fitted by SGD; alpha is the regularisation strength.
# scikit-learn 1.1 renamed the logistic loss from 'log' to 'log_loss' and
# 1.3 removed the old spelling, so try the current name first and fall back
# to the old one on releases that reject it.
sgd_options = dict(alpha=0.5, max_iter=5000, shuffle=False)
try:
    ridge_reg = SGDClassifier(loss='log_loss', **sgd_options).fit(X_train, y_train)
except ValueError:
    ridge_reg = SGDClassifier(loss='log', **sgd_options).fit(X_train, y_train)

# Mean accuracy on the held-out test data
score = ridge_reg.score(X_test, y_test)
print("Accuracy:", score*100)

Accuracy: 96.96969696969697


### Results

I obtained the same accuracies with both implementations ("by hand" and scikit-learn) on both datasets (Breast Cancer and Wine).