# Import required libraries

In [1]:
import numpy as np
import pandas
import scipy
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix

# Question/Problem description

In [None]:
'''
Problem description:

For this problem, you need to download the Breast Cancer dataset from course webpage. 
The description of this dataset is in https://archive.ics.uci.edu/ml/datasets/breast+cancer+wisconsin+(original). 
I have removed the records with missing values for you. Here, you will obtain the learning curves (accuracy vs. 
training data size). Implement a logistic regression classifier with the assumption that each attribute value 
for a particular record is independently generated. You should submit the code electronically to iCollege.

1.	(10 points) Briefly describe how you implement it by giving the pseudocode. The pseudocode must include 
equations for estimating the classification parameters and for classifying a new example. Re- member, this 
should not be a printout of your code, but a high-level outline.

2.  (15 points) Plot a learning curve: the accuracy vs. the size of the training data. Generate six points on 
the curve, using [.01 .02 .03 .125 .625 1] fractions of your training set and testing on the full test set each 
time. Average your results over 5 random splits of the data into a training and test set (always keep 2/3 of 
the data for training and 1/3 for testing, but randomize over which points go to training set and which to testing). 
This averaging will make your results less dependent on the order of records in the file. Specify your choice of 
regularization parameters and keep those parameters constant for these tests. A typical choice of constants would 
be λ = 0 (no regularization).

Attribute Information:

1. Sample code number: id number
2. Clump Thickness: 1 - 10
3. Uniformity of Cell Size: 1 - 10
4. Uniformity of Cell Shape: 1 - 10
5. Marginal Adhesion: 1 - 10
6. Single Epithelial Cell Size: 1 - 10
7. Bare Nuclei: 1 - 10
8. Bland Chromatin: 1 - 10
9. Normal Nucleoli: 1 - 10
10. Mitoses: 1 - 10
11. Class: (2 for benign, 4 for malignant)


In summary, what we need to get:

    - Obtain learning curves (accuracy vs training data size)
    - Implement Logistic Regression Classifier
	        Assume -> attribute values for particular record = independently generated
    - Equations for estimating the classification parameters
    - Equations for classifying a new example

Please note, code development was based on tutorial at https://realpython.com/logistic-regression-python/

'''

# 1. Access data including number of attributes, total number of samples - X values, total number of Y values, all in the form of a numpy array

In [1]:
#Load the data file
mat=scipy.io.loadmat('data_breastcancer.mat')
# print("full data: ", mat['data'])
print("keys: ", mat.keys())

#Number of samples
n = mat['data']['n'][0][0][0][0]
print("sample#: ", n)

#Number of attributes
d = mat['data']['d'][0][0][0][0]
print("attributes#: ", d)

#Input data
X = mat['data']['X'][0][0]
print("input data", X)
print("shape: ", X.shape)
print("type: ", X.dtype, " and: ", type(X[0]))

#Output labels
Y = mat['data']['Y'][0][0]
# print("output labels: ", Y)
print("type: ", Y.dtype, " and: ", type(X[0]))

NameError: name 'scipy' is not defined

# 2. Create the sigmoid function for your binary logistic regression

In [None]:
# The logistic regression function 𝑝(𝐱) is the sigmoid function of 𝑓(𝐱): 𝑝(𝐱) = 1 / (1 + exp(−𝑓(𝐱))

# 2. Create and define the desired classification model

In [None]:
'''
Create an instance of logistic regression and bind references to the variable model 

Solver is a string ('liblinear' by default) that decides what solver to use for fitting 
the model. Other options are 'newton-cg', 'lbfgs', 'sag', and 'saga'.

Random_state is an integer, an instance of numpy. RandomState, or None (default) that defines 
what pseudo-random number generator to use.

'''

In [8]:
model = LogisticRegression(solver='liblinear', random_state=0)

# 3. Train the model

In [None]:
'''
Logistic regression determines the best predicted weights 𝑏₀, 𝑏₁, …, 𝑏ᵣ such that 
the function 𝑝(𝐱) is as close as possible to all actual responses 𝑦ᵢ, 𝑖 = 1, …, 𝑛, 
where 𝑛 is the number of observations. 

The process of calculating the best weights using available observations is called 
model training or fitting.

Proceed to fit the X and Y values into the model by using .fit() function, which 
takes x and y. The returned value is the model instance.
'''

In [9]:
model.fit(X,Y)

  y = column_or_1d(y, warn=True)


# 4. Get the attributes of your model

In [12]:
# Use the .classes_ function to check for the array of distint values that y takes:
# aka. is the result binary - binary classification
print("y values: ", model.classes_)

# Use .intercept_ to check value of slope b1 and the intercept b0 of the linear function:
print(model.intercept_, model.coef_)

y values:  [0 1]
[-6.25906995] [[ 0.24515516  0.16142571  0.27161639  0.17946675 -0.04158784  0.34698954
   0.1815928   0.19130598  0.20634236]]


# 5. Evaluate the model

In [None]:
'''
.predict_proba(x) allows you to check the performance of the model. 
It returns a matrix of probabilities where each row corresponds to a single
observation, and the first column indicates whether the output is 0 [1-p(x)]
and the second if the output is 1 [p(x)]

'''

In [14]:
print(model.predict_proba(X))

# .predict(x) -> Get actual predictions, based on probability matrix and the values of p(x).

[[0.96137797 0.03862203]
 [0.12912459 0.87087541]
 [0.96636146 0.03363854]
 ...
 [0.01256907 0.98743093]
 [0.0576589  0.9423411 ]
 [0.03108921 0.96891079]]


In [18]:

print("input data", X[0][0])

input data 1


In [None]:
# To get the best weights, you usually maximize the log-likelihood function (LLF) 
# for all observations 𝑖 = 1, …, 𝑛. This method is called the maximum likelihood 
# estimation and is represented by the equation LLF = Σᵢ(𝑦ᵢ log(𝑝(𝐱ᵢ)) + (1 − 𝑦ᵢ) log(1 − 𝑝(𝐱ᵢ))).