# Import required libraries

In [12]:
import numpy as np
import math
import pandas
import scipy
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Question/Problem description

In [None]:
'''
Problem description:

For this problem, you need to download the Breast Cancer dataset from course webpage. 
The description of this dataset is in https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original). 
I have removed the records with missing values for you. Here, you will obtain the learning curves (accuracy vs. 
training data size). Implement a logistic regression classifier with the assumption that each attribute value 
for a particular record is independently generated. You should submit the code electronically to iCollege.

1.	(10 points) Briefly describe how you implement it by giving the pseudocode. The pseudocode must include 
equations for estimating the classification parameters and for classifying a new example. Remember, this 
should not be a printout of your code, but a high-level outline.

2.  (15 points) Plot a learning curve: the accuracy vs. the size of the training data. Generate six points on 
the curve, using [.01 .02 .03 .125 .625 1] fractions of your training set and testing on the full test set each 
time. Average your results over 5 random splits of the data into a training and test set (always keep 2/3 of 
the data for training and 1/3 for testing, but randomize over which points go to training set and which to testing). 
This averaging will make your results less dependent on the order of records in the file. Specify your choice of 
regularization parameters and keep those parameters constant for these tests. A typical choice of constants would 
be λ = 0 (no regularization).

Attribute Information:

1. Sample code number: id number
2. Clump Thickness: 1 - 10
3. Uniformity of Cell Size: 1 - 10
4. Uniformity of Cell Shape: 1 - 10
5. Marginal Adhesion: 1 - 10
6. Single Epithelial Cell Size: 1 - 10
7. Bare Nuclei: 1 - 10
8. Bland Chromatin: 1 - 10
9. Normal Nucleoli: 1 - 10
10. Mitoses: 1 - 10
11. Class: (2 for benign, 4 for malignant)


In summary, what we need to get:

    - Obtain learning curves (accuracy vs training data size)
    - Implement Logistic Regression Classifier
	        Assume -> attribute values for particular record = independently generated
    - Equations for estimating the classification parameters
    - Equations for classifying a new example

Please note, code development was based on tutorial at https://realpython.com/logistic-regression-python/

'''

# 1. Access data including number of attributes, total number of samples - X values, total number of Y values, all in the form of a numpy array

In [13]:
# Load the MATLAB file; the record of interest is stored under the 'data' key
mat = scipy.io.loadmat('data_breastcancer.mat')
print("keys: ", mat.keys())

# Number of samples. Cast to a plain int: loadmat hands back NumPy scalars
# (X below prints as uint8), and fixed-width integers can overflow when they
# are later used in arithmetic.
n = int(mat['data']['n'][0][0][0][0])
print("sample#: ", n)

# Number of attributes (same cast, same reason)
d = int(mat['data']['d'][0][0][0][0])
print("attributes#: ", d)

# Input data - independent variables, one row per sample
X = mat['data']['X'][0][0]
print("input data", X)
print("shape: ", X.shape)
print("type: ", X.dtype, " and: ", type(X[0]))

# Output labels - dependent variable
Y = mat['data']['Y'][0][0]
# Report the row type of Y itself (the original inspected X[0] here by mistake)
print("type: ", Y.dtype, " and: ", type(Y[0]))

keys:  dict_keys(['__header__', '__version__', '__globals__', 'data'])
sample#:  683
attributes#:  9
input data [[ 5  1  1 ...  3  1  1]
 [ 5  4  4 ...  3  2  1]
 [ 3  1  1 ...  3  1  1]
 ...
 [ 5 10 10 ...  8 10  2]
 [ 4  8  6 ... 10  6  1]
 [ 4  8  8 ... 10  4  1]]
shape:  (683, 9)
type:  uint8  and:  <class 'numpy.ndarray'>
type:  uint8  and:  <class 'numpy.ndarray'>


# 2. Split test and training data

In [14]:
# Split with sklearn's train_test_split. The problem statement asks to keep
# 2/3 of the records for training and 1/3 for testing, so test_size is 1/3.
x_train, x_test, y_train, y_test = train_test_split(X, Y, random_state=104, test_size=1/3, shuffle=True)

In [None]:
# Display the training feature rows returned by train_test_split
x_train

In [None]:
# Display the held-out feature rows returned by train_test_split
x_test

In [None]:
# Display the labels that go with x_train
y_train

In [None]:
# Display the labels that go with x_test
y_test

# 3. Create necessary functions

## Linear rule - Vector notation function representing the weights, inputs and bias

In [59]:
'''
Linear function f(x) = b0 + b1x1 + ... + brxr, ALSO CALLED THE LOGIT
Variables b0, b1, ..., br are the estimators of the regression 
    coefficients -> predicted weights (w) or just coefficients

Returns:
    Predicted Y values as an np.array in the form of linear shape
'''
def linear_rule(weights:np.array, inputs: np.array, b:float, n:int, d:int) -> np.array:
    """Compute the logit f(x) = b + w1*x1 + ... + wd*xd for each sample.

    Args:
        weights: coefficient vector; shape (d,) or (d, 1) are both accepted.
        inputs: 2-D feature matrix with one sample per row.
        b: intercept added to every sample's weighted sum.
        n: number of samples to score (leading rows of ``inputs``).
        d: number of attributes (leading columns of ``inputs`` and leading
            entries of ``weights``).

    Returns:
        Array of shape (n, 1) whose row i is the logit of sample i. When
        ``inputs`` has fewer than ``n`` rows a RuntimeWarning is issued and
        the rows without a sample are left at 0.
    """
    import warnings

    # Flatten so (d,) and (d, 1) weights behave the same; work in float so
    # small integer feature dtypes (e.g. uint8) cannot overflow.
    coeffs = np.asarray(weights, dtype=float).reshape(-1)[:d]
    features = np.asarray(inputs, dtype=float)[:n, :d]

    rows = features.shape[0]
    if rows < n:
        warnings.warn(
            f"linear_rule: n={n} but inputs only has {rows} rows; "
            "remaining predictions are left at 0",
            RuntimeWarning,
            stacklevel=2,
        )

    y_pred_arr = np.zeros((n, 1))
    # One dot product per sample: each row of features against the weights
    y_pred_arr[:rows, 0] = features @ coeffs + b
    return y_pred_arr

## Activation fn -> sigmoid function for your binary logistic regression

In [65]:
'''
The logistic regression function p(x) is the sigmoid function of f(x): p(x) = 1 / (1 + exp(-f(x)))
Result = Close to either 1 or 0.
Interpreted as the predicted probability that the output for a given x is 1. So, 1-p(x) = probability the output is 0.

Returns:
    Sigmoid value
'''
def sigmoid_fn(fn_vals:np.array, n:int, d:int)->np.array:
    """Apply the logistic function p = 1 / (1 + exp(-f)) to the first n logits.

    Args:
        fn_vals: logits, shape (n,) or (n, 1).
        n: number of logits to transform.
        d: unused; kept so existing calls keep working.

    Returns:
        Array of shape (n, 1) with values in [0, 1].
    """
    logits = np.asarray(fn_vals, dtype=float)[:n].reshape(n, 1)
    # exp(-|z|) never exceeds 1, so very negative logits cannot overflow the
    # way exp(-z) does; the two branches are algebraically the same sigmoid.
    decay = np.exp(-np.abs(logits))
    return np.where(logits >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

## MSE - Mean squared error function

In [72]:
''' 
MSE = average squared difference between the estimated values and the actual values of y.
Given as MSE = (1/n) * Summation(i=1..n) of (predicted_yi - actual_yi)^2

Returns:
    MSE float value for all predicted Ys
    
'''
def mse_fn(y_predic_arr:np.array, y_vals:np.array, n:int) -> float:
    """Return the mean squared error over the first n prediction/label pairs.

    Args:
        y_predic_arr: predicted values, shape (n,) or (n, 1).
        y_vals: actual values, shape (n,) or (n, 1).
        n: number of pairs to average over; must be positive.

    Returns:
        The MSE as a plain Python float.

    Raises:
        ValueError: if ``n`` is not positive.
        IndexError: if fewer than ``n`` predictions or labels are supplied.
    """
    if n <= 0:
        raise ValueError("n must be a positive number of samples")

    # Convert to float before subtracting: unsigned integer inputs (such as
    # uint8 labels) would otherwise wrap around instead of going negative.
    predicted = np.asarray(y_predic_arr, dtype=float).reshape(-1)[:n]
    actual = np.asarray(y_vals, dtype=float).reshape(-1)[:n]
    if predicted.size < n or actual.size < n:
        raise IndexError("fewer than n predictions or labels were supplied")

    return float(np.sum((predicted - actual) ** 2) / n)

# Update weights function

In [77]:
def update_weights(weights:np.array, mse_val:float, d:int)-> np.array:
    """Return a new (d, 1) column vector of zeros.

    Placeholder for the weight-update step: neither ``weights`` nor
    ``mse_val`` is read yet, so the result depends only on ``d``.
    """
    return np.zeros(shape=(d, 1))

## Gradient descent learning

In [None]:
''' 
Function that implements the first order iterative algorithm for finding
a local minimum of differentiable function -> gradient descent

Returns:
    Next theta parameter value
'''
def gradient_descent(x_input:np.array, learn_rate) -> np.array:
    """Placeholder for one gradient-descent step (not implemented yet).

    Args:
        x_input: input array the step is meant to operate on.
        learn_rate: step size for the update.

    Raises:
        NotImplementedError: always. The definition previously had no body,
            which is a syntax error; failing loudly keeps the file parseable
            until the update rule is written.
    """
    raise NotImplementedError("gradient_descent has not been implemented yet")


# 5. Get best weights for all observations via MLE

In [None]:
''' 
Use the arg max of the log-likelihood function l(w) = log L(w) for all observations i = 1 through n
AKA: Get the MLE for our observations -> summation of the log-probabilities of each observation
MLE function = l(w) = Summation(i=1..n) [ Yi * ln (P(Y=1 | Xi, w)) + (1-Yi) * ln (P(Y=0 | Xi, w)) ]
MLE function = l(w) = Summation(i=1..n) [ Yi*(w0 + Summ(j=1..d) wj*xj^i) - ln (1 + exp (w0 + Summ(j=1..d) wj*xj^i)) ], where xj^i is attribute j of observation i (an index, not a power)

returns:
    MLE
'''
def mle_fn(Yn, Xn, weights, X, w0, labels=None):
    """Return the logistic-regression log-likelihood of the observations.

    Computes  l(w) = sum_i [ y_i * z_i - ln(1 + exp(z_i)) ]  with
    z_i = w0 + sum_j w_j * x_ij, where x_ij is attribute j of observation i.

    Args:
        Yn: number of observations to include (leading rows of ``X``).
        Xn: number of attributes to include (leading columns of ``X`` and
            leading entries of ``weights``).
        weights: coefficient vector, shape (Xn,) or (Xn, 1).
        X: 2-D feature matrix with one observation per row.
        w0: intercept.
        labels: 0/1 class labels, shape (Yn,) or (Yn, 1). When omitted, the
            module-level ``Y`` is used, as the original implementation did.

    Returns:
        The log-likelihood as a plain Python float.
    """
    targets = Y if labels is None else labels

    coeffs = np.asarray(weights, dtype=float).reshape(-1)[:Xn]
    features = np.asarray(X, dtype=float)[:Yn, :Xn]
    y = np.asarray(targets, dtype=float).reshape(-1)[:Yn]

    # The logit is computed afresh for every observation (no running total
    # carried from one row to the next).
    z = w0 + features @ coeffs
    # logaddexp(0, z) == ln(1 + exp(z)) but does not overflow for large z
    return float(np.sum(y * z - np.logaddexp(0.0, z)))


# Initiate values for parameters that need adjustment as well as others

In [35]:
# Gradient-descent settings: step size (can be set between 0.1 and 0.0001)
# and the number of passes to run
learn_r = 0.1
iterations = 150

# One coefficient per attribute, all starting at zero so training can adjust them
weights = np.zeros(d)
print("weights: ", weights)

# Intercept term, also starting at zero
b = 0
print("b: ", b)

# Buffer with one prediction slot per sample
y_pred = np.zeros(n)

weights:  [0. 0. 0. 0. 0. 0. 0. 0. 0.]
b:  0


# Proceed to use functions

In [68]:
# Use linear classification function
z = linear_rule(weights, x_train, b, n, d)
# print(z)

In [69]:
# Then make it a classification problem by using sigmoid function
h = sigmoid_fn(z, n, d)
# print(h)

In [71]:
# Calculate mean square error
mse = mse_fn(h, Y, n)
print(mse)

[0.25]


In [78]:
# Feed the current error into the weight-update step and show the result
weights = update_weights(weights=weights, mse_val=mse, d=d)
print(weights)

[[0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]
 [0.]]


In [None]:
# Each pass re-scores the training rows with the current weights and then
# updates them. update_weights needs the error and attribute count as well as
# the weights; calling it with weights alone raises a TypeError.
train_rows = x_train.shape[0]
for _ in range(iterations):
    z = linear_rule(weights, x_train, b, train_rows, d)
    h = sigmoid_fn(z, train_rows, d)
    mse = mse_fn(h, y_train, train_rows)
    weights = update_weights(weights, mse, d)

In [None]:


# # recall functions
# z = linear_classification_rule(weights, X, w0)
# h = sigmoid_fn(z)
# c = mle_fn(len(Y), len(X), weights, X, w0)

# cost = y

In [None]:
'''
Logistic regression determines the best predicted weights 𝑏₀, 𝑏₁, …, 𝑏ᵣ such that 
the function 𝑝(𝐱) is as close as possible to all actual responses 𝑦ᵢ, 𝑖 = 1, …, 𝑛, 
where 𝑛 is the number of observations. 

The process of calculating the best weights using available observations is called 
model training or fitting.

Proceed to fit the X and Y values into the model by using .fit() function, which 
takes x and y. The returned value is the model instance.
'''

In [18]:

# Print the first attribute value of the first sample in X
print("input data", X[0][0])

input data 1


In [None]:
# To get the best weights, you usually maximize the log-likelihood function (LLF) 
# for all observations 𝑖 = 1, …, 𝑛. This method is called the maximum likelihood 
# estimation and is represented by the equation LLF = Σᵢ(𝑦ᵢ log(𝑝(𝐱ᵢ)) + (1 − 𝑦ᵢ) log(1 − 𝑝(𝐱ᵢ))).