In [None]:
import numpy as np
import pandas as pd

In [None]:
from sklearn import datasets
wine = datasets.load_wine()
# print(wine)
# print(wine.data)
# print(wine.target)
print(wine.data.shape)
print(wine.DESCR)

(178, 13)
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:       

In [None]:
# preprocessing
x = wine.data
y = wine.target
from sklearn import preprocessing
nor_x = preprocessing.normalize(x)
print(nor_x)

[[1.32644724e-02 1.59397384e-03 2.26512072e-03 ... 9.69434383e-04
  3.65402190e-03 9.92738094e-01]
 [1.25128005e-02 1.68733218e-03 2.02859038e-03 ... 9.95336401e-04
  3.22299406e-03 9.95336401e-01]
 [1.10630135e-02 1.98394467e-03 2.24454758e-03 ... 8.65874158e-04
  2.66487484e-03 9.96175609e-01]
 ...
 [1.57227449e-02 5.07108879e-03 2.67772446e-03 ... 6.99051960e-04
  1.84834078e-03 9.89336248e-01]
 [1.55136606e-02 3.05090212e-03 2.79175213e-03 ... 7.06772691e-04
  1.90828627e-03 9.89481768e-01]
 [2.48340486e-02 7.20591644e-03 4.81566123e-03 ... 1.07209976e-03
  2.81206495e-03 9.84222734e-01]]


In [None]:
# splinting data    train = 30%, test = 50%, validation = 20%
from sklearn.model_selection import train_test_split
x_total = nor_x
y_total = y

X_train,X_test,y_train,y_test = train_test_split(x_total,y_total,test_size = 0.5,random_state = 1)
X_train,X_validation,y_train,y_validation = train_test_split(X_train,y_train,test_size = 0.4,random_state = 1)

print(X_train.shape)
print(X_validation.shape)
print(X_test.shape)


(53, 13)
(36, 13)
(89, 13)


In [None]:
# to check classes in dataset
def num_Of_Classes(y):
  l = [0 for i in range(0,100)]
  for i in y:
    l[i] = l[i]+1
  count=0
  classes=[]
  for i in range(0,100):
    if l[i]!=0:
      count=count+1
      classes.append(i)
  return count,classes

In [None]:
numberClasses,classes = num_Of_Classes(wine.target)
print(numberClasses)
print(classes)

3
[0, 1, 2]


In [None]:
# set y for classifiers 
def set_y(y,targetClass):
  y_new=[]
  for i in y:
    if i == targetClass:
      y_new.append(1)
    else:
      y_new.append(0)
  return y_new

In [None]:
def getYs(y,classes):
  ys=[]
  for x in classes:
    ys.append(set_y(y,x))
  return np.array(ys)

In [None]:
# activation fucntion 
def sigmoid(x):
  return (1 / (1 + np.exp(-x)))



In [None]:
# weight intialization   
def getInitialWeights(numClasses,numFeatures):
  import random
  weights=[]
  for x in range(numClasses):
    w=[]
    for i in range(0,numFeatures+1):
      random.seed(i)
      w.append(random.uniform(0,1))
    weights.append(w)
  return np.array(weights)

# print(getInitialWeights(3,13))

In [None]:
# intercept 1
def add_intercept(X):
  X= np.hstack([np.ones((X.shape[0],1)), X])
  return X

In [None]:
# Cost function
def J(X,y,weights):
  m = X.shape[0]
  h= sigmoid(np.matmul(X,weights))
  cost= (-1/m)*sum((y * np.log(h) + (1 - y) * np.log(1 - h)))
  return cost

In [None]:
# weight updation  after each epoch
def updateWeights(X,y,weights,alpha,epochs,rho):

  cost_array=[]
  epochs_array=[0]
  cost_array.append(J(X,y,weights))
  m = X.shape[0]
  for i in range(epochs):

    h = sigmoid(np.matmul(X,weights))

    grad = (1/m)*(np.matmul(np.transpose(X),(h- y)))

    weights = weights-alpha*grad
    cost_array.append(J(X,y,weights))
    epochs_array.append(i+1)
    if abs(cost_array[i-1]- cost_array[i]) < rho:
      break      
    cost= cost_array[len(cost_array)-1]    
  return cost,cost_array,weights,epochs_array
  

In [None]:
# traing model
def LogisticRegression(X,Y,alpha=1,epochs=100,rho=0.001):
  X_new = add_intercept(X)
  # no of classess
  numClasses,classes = num_Of_Classes(Y)
  numFeatures = X.shape[1]

  # w0,w1,w2---initial 
  weights = getInitialWeights(numClasses,numFeatures)
  Ys = getYs(Y,classes)
  final_weights=[]
  cost_array=[]
  epochs_array=[]

  for x in range(numClasses):
    cost,cA,W,e = updateWeights(X_new,Ys[x],weights[x],alpha,epochs,rho)
    final_weights.append(W)
    cost_array.append(cA)
    epochs_array.append(e)
  
  cost_array = np.array(cost_array)
  # print(cost_array.shape)
  epochs_array = np.array(epochs_array)
  # print(epochs_array.shape)
  import matplotlib.pyplot as plt
  # %matplotlib inline
  # plt.plot(epochs_array[0],cost_array[0])

  return final_weights

In [None]:
# testing model
weights = LogisticRegression(X_test,y_test,alpha = 1, rho = 0.01)
print(weights)

[array([ 0.00248687,  0.10447535,  0.94978207,  0.23245695,  0.18415467,
        0.39856005,  0.78898154,  0.3208177 ,  0.22564718,  0.45960118,
        0.56093841,  0.45026121,  0.46915408, -0.555107  ]), array([ 0.02510229,  0.12871606,  0.95418351,  0.23691077,  0.23142322,
        0.57836515,  0.79275549,  0.32391397,  0.22659997,  0.46300194,
        0.56304069,  0.45246971,  0.47478536, -0.56311231]), array([-0.28127299,  0.11410389,  0.95417978,  0.23435378,  0.20830169,
        0.46231774,  0.7883793 ,  0.3177584 ,  0.22627458,  0.45911746,
        0.57052401,  0.45022843,  0.46802712, -0.85100121])]




In [None]:
# k fold 


from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import KFold,train_test_split
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

 
#Loading the data 
X = wine.data
y = wine.target
 
#Implementing cross validation
 
k = 5
kf = KFold(n_splits=k, random_state=None)
clf = linear_model.LogisticRegression()
 
acc_score = []

# spliting in folds
for train_index , test_index in kf.split(X):
  X_train,X_test,y_train,y_test = train_test_split(x_total,y_total,test_size = 0.5,random_state = 1)
  X_train,X_validation,y_train,y_validation = train_test_split(X_train,y_train,test_size = 0.4,random_state = 1)

 
  clf.fit(X_train,y_train)
  y_pred = clf.predict(X_test)
  acc = accuracy_score(y_pred , y_test)
  acc_score.append(acc)
        
  avg_acc_score = sum(acc_score)/k
 
print('accuracy of each fold - {}'.format(acc_score))
print('Avg accuracy : {}'.format(avg_acc_score))

accuracy of each fold - [0.38202247191011235, 0.38202247191011235, 0.38202247191011235, 0.38202247191011235, 0.38202247191011235]
Avg accuracy : 0.38202247191011235
