# Libraries

In [1]:
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
# Load the bundled Iris dataset and list the fields the returned object exposes.
iris = datasets.load_iris()
list(iris.keys())

['data', 'feature_names', 'target', 'target_names', 'DESCR']

In [3]:
# Print the free-text description of the dataset that ships with it.
print(iris.DESCR)

Iris Plants Database

Notes
-----
Data Set Characteristics:
    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20  0.76     0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :Date: July, 1988

This is a copy of UCI ML iris d

In [4]:
print(iris.target) # integer class label (0, 1 or 2) of every sample, in dataset order

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


# Merging class labels with data features

In [5]:
# Build a single array per sample: column 0 is the class label, the remaining
# columns are the feature measurements in their original order.
data = np.array(iris['data'])
data_with_labels = np.column_stack((iris['target'], data))

# Creation of Test and Train dataset

In [6]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffled split reproducible across runs.
train_set, test_set=train_test_split(data_with_labels,test_size=0.2,random_state=42)

# Binary Classification (Iris-Virginica or not)

In [7]:
# Column 0 of every row is the class label (prepended when the arrays were
# merged), so the four measurements sit in columns 1-4; columns 3 and 4 are
# petal length and petal width.
X=train_set[:,(3,4)]
# Binary target: 1 for Iris-Virginica (label 2), 0 for the other two species.
# The builtin int is used because the np.int alias was deprecated in
# NumPy 1.20 and removed in NumPy 1.24.
Y=(train_set[:,0]==2).astype(int)
# Same feature/label extraction for the held-out rows.
test_data=test_set[:,(3,4)]
test_labels=(test_set[:,0]==2).astype(int)

In [8]:
# Y=(Y==2).astype(np.int) # to map true and false to 1 and 0 respectively
# print(Y)

# Nearest Neighbours

In [9]:
# 1-nearest-neighbour: each test point takes the label of the closest
# training point under Euclidean distance.
predicted_labels=[]
for test_point in test_data:
    # Squared distance to every training point, built from the coordinate
    # differences. Expanding it as a.a - 2a.b + b.b cancels badly for nearby
    # points and can produce a small negative number whose square root is NaN;
    # a sum of squares is never negative.
    offsets=X-test_point
    squared_distances=np.sum(offsets*offsets,axis=1)
    # The square root is skipped because it does not change which point is
    # closest. argmin returns the first minimum, so ties go to the earliest
    # training point.
    predicted_labels.append(Y[np.argmin(squared_distances)])

# Accuracy score - Nearest Neighbours

In [10]:
# Fraction of held-out samples whose nearest-neighbour prediction matches the true label.
accuracy_score(test_labels,predicted_labels)

1.0

# Naive Bayes Classifier

In [11]:
# Assuming each feature follows a Gaussian within a class
def probability(mean, std, x):
    """Gaussian probability density of ``x`` for the given mean and std.

    Evaluates  exp(-(x - mean)^2 / (2 std^2)) / (std * sqrt(2 pi)).
    Arguments may be scalars or NumPy arrays; arrays are combined element-wise.

    Parameters
    ----------
    mean : float or ndarray
        Mean of the distribution.
    std : float or ndarray
        Standard deviation of the distribution (must be non-zero).
    x : float or ndarray
        Point(s) at which to evaluate the density.

    Returns
    -------
    float or ndarray
        The density value(s).
    """
    exponential=np.exp(-((x-mean)**2)/(2*(std**2)))
    # Normalising constant is std * sqrt(2*pi), so the density integrates to 1.
    return exponential/(std*np.sqrt(2*np.pi))

In [12]:
# Fitting a Gaussian to each feature
def gaussian_parameters(X):
    """Return the per-column mean and standard deviation of ``X``.

    Parameters
    ----------
    X : array-like
        Samples stacked along the first axis (one row per sample).

    Returns
    -------
    tuple
        ``(mean, std)``, each reduced over the first axis. The standard
        deviation is NumPy's default (population) form.
    """
    samples=np.asarray(X)
    return samples.mean(axis=0), samples.std(axis=0)

The following code is to get data points corresponding to each class

In [13]:
# Partition the training features by their binary label.
data_class1=list(X[Y==1]) # class1: samples labelled Iris-Virginica
data_class2=list(X[Y==0]) # class2: samples that are not Iris-Virginica

In [14]:
# Per-feature Gaussian parameters, estimated separately for each class.
mean_class1,std_class1=gaussian_parameters(data_class1)
mean_class2,std_class2=gaussian_parameters(data_class2)
print(mean_class1,std_class1)
print(mean_class2,std_class2)
# Class priors: the share of training samples carrying each label.
total_class1=sum(1 for label in Y if label==1)
class1_probability=float(total_class1)/len(Y)
class2_probability=1-class1_probability

[ 5.52051282  2.        ] [ 0.53454005  0.28644595]
[ 2.86419753  0.78888889] [ 1.4404482   0.56371782]


In [15]:
# Naive Bayes decision: score each class as prior * product of the per-feature
# Gaussian densities (features treated as independent) and keep the larger one.
predicted_labels=[]
for sample in test_data:
    score_class1=np.prod(probability(mean_class1,std_class1,sample))*class1_probability
    score_class2=np.prod(probability(mean_class2,std_class2,sample))*class2_probability
    # Ties fall to class 0, since class 1 needs a strictly larger score.
    predicted_labels.append(1 if score_class1>score_class2 else 0)

# Accuracy score - Naive Bayes Classifier

In [16]:
# Fraction of held-out samples whose Naive Bayes prediction matches the true label.
accuracy_score(test_labels,predicted_labels)

0.96666666666666667

In [17]:
# from sklearn.naive_bayes import GaussianNB
# gnb = GaussianNB()
# gnb.fit(X, Y)
 
# # making predictions on the testing set
# y_pred = gnb.predict(test_data)
# print(y_pred)
 
# # comparing actual response values (y_test) with predicted response values (y_pred)
# from sklearn import metrics
# print("Gaussian Naive Bayes model accuracy(in %):", metrics.accuracy_score(test_labels, y_pred)*100)

# Logistic Regression - Gradient Descent

Create a copy of the training features `X` and insert the constant value 1 as the first feature of every data point, so that the first weight acts as the intercept (bias) term.

In [18]:
# Prepend a constant-1 column so the first weight acts as the intercept.
# np.insert returns a new array, so X itself is left unchanged.
X_data=np.insert(X, 0, 1, axis=1)

In [19]:
def sigmoid(z):
    """Logistic function 1 / (1 + exp(-z)), evaluated without overflow.

    Uses the identity  1 / (1 + exp(-z)) = exp(-log(1 + exp(-z))), with
    log(1 + exp(-z)) computed by ``np.logaddexp(0, -z)``. Evaluating
    ``np.exp(-z)`` directly overflows to ``inf`` (with a RuntimeWarning) for
    large negative ``z``.

    Parameters
    ----------
    z : float or ndarray
        Input value(s).

    Returns
    -------
    float or ndarray
        Value(s) in [0, 1], element-wise for array input.
    """
    return np.exp(-np.logaddexp(0, -z))
def gradient_descent_logistic_regression(X_data,Y,learning_rate,number_iterations):
    """Fit logistic-regression weights with fixed-step batch gradient descent.

    Starts from an all-zero weight vector and repeats
    ``theta <- theta - learning_rate * X^T (sigmoid(X theta) - Y) / n``
    exactly ``number_iterations`` times; there is no convergence check.

    Parameters
    ----------
    X_data : ndarray
        2-D design matrix, one row per sample. Include a constant column if an
        intercept is wanted.
    Y : ndarray
        Target value for each row of ``X_data``.
    learning_rate : float
        Step size applied to the averaged gradient.
    number_iterations : int
        Number of updates to perform.

    Returns
    -------
    ndarray
        Weight vector with one entry per column of ``X_data``.
    """
    sample_count=Y.size
    theta=np.zeros(X_data.shape[1])
    for _ in range(number_iterations):
        residual=sigmoid(X_data.dot(theta))-Y
        # Gradient averaged over all samples
        gradient=X_data.T.dot(residual)/sample_count
        theta=theta-learning_rate*gradient
    return theta

In [20]:
# Step size and iteration count for batch gradient descent. number_iterations
# is also passed to the Newton's-method fit further down, so changing it here
# affects both.
learning_rate=0.1
number_iterations=300000
theta=gradient_descent_logistic_regression(X_data,Y,learning_rate,number_iterations)
print(theta)

[-38.18213454   4.80744687   8.97914439]


In [21]:
# Give the test features the same leading constant-1 column the model was
# trained with. np.insert returns a new array, so test_data is left unchanged.
test_data_new=np.insert(test_data, 0, 1, axis=1)
# Label a sample 1 when its predicted probability exceeds 0.5, otherwise 0.
predicted_labels=[1 if sigmoid(np.dot(sample,theta))>0.5 else 0 for sample in test_data_new]

# Accuracy score - Logistic Regression (Gradient Descent)

In [22]:
# Fraction of held-out samples whose gradient-descent logistic prediction matches the true label.
accuracy_score(test_labels,predicted_labels)

1.0

# Logistic Regression - Newton's method

In [23]:
def newton_method_logistic_regression(X_data,Y,number_iterations,tolerance=1e-10):
    """Fit logistic-regression weights with Newton's method.

    Starting from an all-zero weight vector, each iteration applies
    ``theta <- theta - H^+ g`` where, with ``p = sigmoid(X theta)`` and ``n``
    samples,

        g = X^T (p - Y) / n                  (gradient of the mean log-loss)
        H = X^T diag(p * (1 - p)) X / n      (Hessian of the mean log-loss)

    The ``p * (1 - p)`` weights are what make this a Newton step; using
    ``X^T X / n`` alone is only a fixed approximation of the curvature and
    needs far more iterations. Steps are undamped (no line search).

    Parameters
    ----------
    X_data : ndarray
        2-D design matrix, one row per sample. Include a constant column if an
        intercept is wanted.
    Y : ndarray
        Binary (0/1) target for each row of ``X_data``.
    number_iterations : int
        Maximum number of Newton updates.
    tolerance : float, optional
        Stop early once the Euclidean norm of an update drops below this value.

    Returns
    -------
    ndarray
        Weight vector with one entry per column of ``X_data``.
    """
    theta=np.zeros(X_data.shape[1])
    for _ in range(number_iterations):
        p=sigmoid(np.dot(X_data,theta))
        gradient=np.dot(X_data.T, (p - Y)) / Y.size
        # Per-sample curvature of the log-loss; scales each sample's column of X^T.
        curvature=p*(1-p)
        hessian=np.dot(X_data.T*curvature, X_data) / Y.size
        # The pseudo-inverse keeps the update finite if the Hessian becomes
        # singular (e.g. every probability has saturated at 0 or 1).
        step=np.dot(np.linalg.pinv(hessian),gradient)
        theta=theta-step
        if np.dot(step,step) < tolerance**2:
            break
    return theta

In [24]:
# number_iterations is the value set in the gradient-descent cell (300000).
# This overwrites the theta fitted by gradient descent.
theta=newton_method_logistic_regression(X_data,Y,number_iterations)
print(theta)

[-41.57478717   5.31187227   9.53893649]


In [25]:
# Threshold the predicted probability at 0.5 for every bias-augmented test row
# (test_data_new was built in the gradient-descent prediction cell).
predicted_labels=[int(sigmoid(np.dot(sample,theta))>0.5) for sample in test_data_new]

# Accuracy score - Logistic Regression (Newton's method)

In [26]:
# Fraction of held-out samples whose Newton's-method logistic prediction matches the true label.
accuracy_score(test_labels,predicted_labels)

1.0

# Logistic Regression (Library)

In [27]:
# Fit scikit-learn's LogisticRegression with default settings on the raw
# two-column petal features; fit_intercept=True in the repr shown below means
# no explicit constant column is needed. The defaults also apply an L2 penalty
# (C=1.0), which the hand-written fits above do not, so the coefficients are
# not directly comparable.
logistic_regression=LogisticRegression()
logistic_regression.fit(X,Y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [28]:
# Class probabilities for one flower with petal length 5 cm and petal width 2 cm;
# columns are ordered by class label: [P(not Virginica), P(Virginica)].
logistic_regression.predict_proba([[5,2]])

array([[ 0.29438456,  0.70561544]])

In [29]:
# Hard 0/1 label predictions for the held-out petal features (no bias column
# needed here; the library model handles the intercept itself).
predicted_labels=logistic_regression.predict(test_data)

# Accuracy score - Logistic Regression (library)

In [30]:
# Fraction of held-out samples whose scikit-learn logistic prediction matches the true label.
accuracy_score(test_labels,predicted_labels)

1.0