In [1]:
import scipy.io
import math
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

Importing input file

In [2]:
data = scipy.io.loadmat('fashion_mnist.mat') 
Xtrain = data['trX']
Ytrain = data['trY']
xtrain_df = pd.DataFrame(Xtrain)

Feature extraction for training data

In [3]:
x1_data=np.mean(xtrain_df,axis=1)
x1_data.name='mean'
x2_data=np.std(xtrain_df,axis=1)
x2_data.name='std'
feature_data=pd.merge(x1_data,x2_data,right_index = True, left_index = True)
feature_data

Unnamed: 0,mean,std
0,0.423159,0.395417
1,0.143367,0.194897
2,0.306057,0.407228
3,0.347949,0.332871
4,0.369848,0.374355
...,...,...
11995,0.168517,0.255543
11996,0.328006,0.362793
11997,0.179372,0.274561
11998,0.189001,0.325558


Adding corresponding class labels for train data

In [4]:
labels=pd.DataFrame(Ytrain)
feature_data['label']=pd.DataFrame(labels.T)
feature_data

Unnamed: 0,mean,std,label
0,0.423159,0.395417,0.0
1,0.143367,0.194897,0.0
2,0.306057,0.407228,0.0
3,0.347949,0.332871,0.0
4,0.369848,0.374355,0.0
...,...,...,...
11995,0.168517,0.255543,1.0
11996,0.328006,0.362793,1.0
11997,0.179372,0.274561,1.0
11998,0.189001,0.325558,1.0


Finding mean for each class

In [5]:
class0=feature_data.loc[feature_data['label']==0]
class1=feature_data.loc[feature_data['label']==1]
class0_means=[0,0]
class1_means=[0,0]
class0_means[0]=class0['mean'].mean()
class0_means[1]=class0['std'].mean()
class0_means
class1_means[0]=class1['mean'].mean()
class1_means[1]=class1['std'].mean()
class1_means


[0.22290531462584984, 0.33394171202721934]

Finding variance for each class

In [6]:
class0_var=[0,0]
class1_var=[0,0]
class0_var[0]=class0['mean'].var()
class0_var[1]=class0['std'].var()
class0_var
class1_var[0]=class1['mean'].var()
class1_var[1]=class1['std'].var()
class1_var

[0.003243958057139923, 0.0032532239122850352]

Setting prior value

In [7]:
prior=0.5

PDF function

In [8]:
def pdf(x , mean,var):
    return (1.0 / (math.sqrt(2*math.pi*var)) * math.exp(-0.5*((x - mean)** 2 / var) ))

Applying PDF function to predict label for train data

In [15]:
pred_df = pd.DataFrame(columns = ['Pred_label'])
for i in range(feature_data.shape[0]):
    class0_prob = pdf(feature_data['mean'][i],class0_means[0],class0_var[0])*pdf(feature_data['std'][i],class0_means[1],class0_var[1])*prior
    class1_prob = pdf(feature_data['mean'][i],class1_means[0],class1_var[0])*pdf(feature_data['std'][i],class1_means[1],class1_var[1])*prior
    
    if class0_prob > class1_prob:
        pred_df.loc[i, 'Pred_label'] = 0.0
    else:
        pred_df.loc[i, 'Pred_label'] = 1.0

In [13]:
pred_df['Pred_label']= pred_df.Pred_label.astype(float)

Checking accuracy on train data

In [14]:
accuracy_score(feature_data['label'], pred_df['Pred_label'])


0.8238333333333333

Transforming Test data

In [16]:
Xtest = data['tsX']
Ytest = data['tsY']
xtest_df = pd.DataFrame(Xtest)
x1_test=np.mean(xtest_df,axis=1)
x1_test.name='mean'
x2_test=np.std(xtest_df,axis=1)
x2_test.name='std'
test_data=pd.merge(x1_test,x2_test,right_index = True, left_index = True)
test_labels=pd.DataFrame(Ytest)
test_data['label']=pd.DataFrame(test_labels.T)
test_data


Unnamed: 0,mean,std,label
0,0.225665,0.222569,0.0
1,0.114816,0.221261,1.0
2,0.164591,0.263742,1.0
3,0.358593,0.424441,1.0
4,0.190886,0.293430,1.0
...,...,...,...
1995,0.096439,0.170104,1.0
1996,0.233683,0.383631,1.0
1997,0.217972,0.341390,1.0
1998,0.175030,0.166916,0.0


Applying naive bayes on test data

In [17]:
testpred_df = pd.DataFrame(columns = ['Pred_label'])
for i in range(test_data.shape[0]):
    class0_prob = pdf(test_data['mean'][i],class0_means[0],class0_var[0])*pdf(test_data['std'][i],class0_means[1],class0_var[1])*prior
    class1_prob = pdf(test_data['mean'][i],class1_means[0],class1_var[0])*pdf(test_data['std'][i],class1_means[1],class1_var[1])*prior
    
    if class0_prob > class1_prob:
        testpred_df.loc[i, 'Pred_label'] = 0.0
    else:
        testpred_df.loc[i, 'Pred_label'] = 1.0


In [18]:
testpred_df['Pred_label']= testpred_df.Pred_label.astype(float)

Checking accuracy on test data

In [19]:
accuracy_score(test_data['label'], testpred_df['Pred_label'])

0.8315

Confusion matrix on test data

In [20]:
confusion_matrix(test_data['label'], testpred_df['Pred_label'])

array([[784, 216],
       [121, 879]])

Logistic Regression implementation


In [26]:
def train( Xdata, y, lr, itr ):
  #intializing weights and bias
    wt = np.zeros(Xdata.shape[1])
    bias = 0

    for i in range(itr):
      
      #sigmoid calculation
      ypred = np.dot(wt,Xdata.T) + bias
      ypred = 1/(1 + np.exp(-ypred))

      pred_error = y-ypred
      dl = np.average(pred_error*Xdata.T,1)

      #updating weights
      wt = wt + dl*lr

      #cost function
      loglikelihood = np.sum(y * np.log(ypred) + (1 - y) * np.log(1 - ypred))
      
      #bias update
      bias  = bias + np.average(pred_error)

    return wt,bias 

Prediction function

In [22]:
def predict(Xs, w, bias):

  ypred = np.dot(w,Xs.T) + bias

  #greater than 0.5 label it to 1, else label it to 0
  ypred[np.where(ypred>=0.5)]=1
  ypred[np.where(ypred<0.5)]=0

  return ypred

calculating accuracy for logistic regression

In [23]:
def accuracy(ypred, ygiven):
  count=0
  x = ypred == ygiven
  accuracy = np.sum(x)/ypred.shape[0]
  accuracy= accuracy*100    
  return accuracy

Preparing test and training data for logistic regression

In [24]:
xtrain_wl = feature_data[['mean','std']]
xtest_wl = test_data[['mean','std']]

Calculating Accuracy using Logistic regression

In [56]:
wt,bias = train(xtrain_wl.to_numpy(), labels.to_numpy(), 3.5, 10000)
pred_classes = predict(xtest_wl.to_numpy(),wt,bias)
print(accuracy(pred_classes,test_labels.to_numpy() ))

92.25
