In [26]:
#Flow:
#DataSet->Noramlise->Oversample(two techniques)->train,val,test Split-> train model ->Test accuracy(individual model) -> Combine the models and test accuracy

In [27]:
#Create a Multi Class Classifier using Linear regression.
#One-vs-Rest strategy

In [28]:
import numpy as np
import pandas as pd

In [29]:
path="Iris.csv"

In [30]:
df=pd.read_csv(path)
df.drop('Id',axis=1)

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [31]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             150 non-null    int64  
 1   SepalLengthCm  150 non-null    float64
 2   SepalWidthCm   150 non-null    float64
 3   PetalLengthCm  150 non-null    float64
 4   PetalWidthCm   150 non-null    float64
 5   Species        150 non-null    object 
dtypes: float64(4), int64(1), object(1)
memory usage: 7.2+ KB


In [32]:
#Standard  Scaler
def standard_scaler(df):
    for c in df.columns:
        if df[c].dtype != 'O':
            mean=df[c].mean()
            std=df[c].std()
            if mean!=std:
                df[c]=(df[c]-mean)/(std)
    return df

In [33]:
df=standard_scaler(df)

In [34]:
#Creating new Datasets
classes = [x for x in set(df['Species'])]
for i in range(len(classes)):
    file_name = f'iris_class_{i}.csv'
    df_copy = df.copy()
    df_copy['Species'] = np.where(df_copy['Species'] == classes[i], 1, 0)
    df_copy.to_csv(file_name, index=False)


In [35]:
#Split the Total Rows into train,val,test
def get_split(n):
    train=int(0.8*n)
    val=int(0.1*n)+train
    return train,val

In [36]:
#Split The Data into Train,Validate,Test Data
# 80,10,10 Percentages
def create_split(df):
    df_positive_samples=df[df['Species']==1].sample(frac=1)
    df_negative_samples=df[df['Species']==0].sample(frac=1)
    pos_samples=len(df_positive_samples)
    neg_samples=len(df_negative_samples)
    ptrain,pval=get_split(pos_samples)
    ntrain,nval=get_split(neg_samples)
    df_pos_train=df_positive_samples.iloc[:ptrain]
    df_pos_val=df_positive_samples.iloc[ptrain:pval]
    df_pos_test=df_positive_samples.iloc[pval:]
    df_neg_train=df_negative_samples.iloc[:ntrain]
    df_neg_val=df_negative_samples.iloc[ntrain:nval]
    df_neg_test=df_negative_samples.iloc[nval:]
    df_train=pd.concat([df_pos_train,df_neg_train],ignore_index=True)
    df_val=pd.concat([df_pos_val,df_neg_val],ignore_index=True)
    df_test=pd.concat([df_pos_test,df_neg_test],ignore_index=True)
    return df_train,df_val,df_test

In [37]:
# Two Methods to Duplicate the DataPoints
# 1 Replicate the Data
# 2 SMOTE (Synthetic Minority Over-Sampling Technique)

def replicate_data(df):
    df_samples=df[df['Species']==1]
    df=pd.concat([df,df_samples],ignore_index=True)
    return df

In [38]:
#SMOTE -> Synthetic Minority Over-Sampling Technique
# Take Two points (x1,x2)
#New point xnew=l*x1+(1-l)*x2(Where 0<l<1)
# There are many Strategies from which you can choose x2 for a point x1(Lets Say the Nearest Point to X1)
#Consider the strategy of Nearest point as x2 and random probability lambda(Fixed lambda can create duplicate points if x1 and x2 are mutually near to each other)

#How to find the nearest point ? Find the Euclidean distance betweeen each point and find which point has the smallest Euclidean distance(L2-Norm)

#Time Complexity ->O(n**2)
def find_nearest(idx,df):
  dist=1e100
  ans_idx=-1
  for i in range(len(df)):
    if i == idx:
      continue
    d=0
    for c in df.columns:
      d+=(df.iloc[i][c]-df.iloc[idx][c])**2
    d=d**(0.5)
    if dist>d:
      d=dist
      ans_idx=i
  
  return ans_idx
    

In [39]:
import random as rd

In [40]:
#Find the Synthetic point for SMOTE
def syn_data(idx,df):
  ans_idx=find_nearest(idx,df)
  lam=rd.uniform(0,1)
  new_data_point={}
  for c in df.columns:
    new_data_point[c]=lam*df.iloc[idx][c]+(1-lam)*(df.iloc[ans_idx][c])

  return pd.Series(new_data_point)


In [41]:
#SMOTE
def SMOTE(df):
  df_minority_samples=df[df['Species']==1]
  df_copy=df.copy()
  for i in range(len(df_minority_samples)):
    new_point=syn_data(i,df_minority_samples)
    df_copy=pd.concat([df_copy,new_point.to_frame().T],ignore_index=True)
  return df_copy

In [42]:
#Linear Regression Function
from regression import linear_regression

In [43]:

#Replication
models_replication=[]
for i in range(3):
  path=f"iris_class_{i}.csv"
  data=pd.read_csv(path)
  data=replicate_data(data)
  train,val,test=create_split(data)
  train_x = train.iloc[:,:-1].to_numpy() 
  train_y = train.iloc[:,-1].to_numpy()
  val_x   = val.iloc[:,:-1].to_numpy()
  val_y   = val.iloc[:,-1].to_numpy()
  test_x  = test.iloc[:,:-1].to_numpy()
  test_y  = test.iloc[:,-1].to_numpy()
  wt=linear_regression(train_x,train_y,val_x,val_y)
  models_replication.append(wt)


In [45]:

#SMOTE
models_smote=[]
for i in range(3):
  path=f"iris_class_{i}.csv"
  data=pd.read_csv(path)
  data=SMOTE(data)
  train,val,test=create_split(data)
  train_x = train.iloc[:,:-1].to_numpy() 
  train_y = train.iloc[:,-1].to_numpy()
  val_x   = val.iloc[:,:-1].to_numpy()
  val_y   = val.iloc[:,-1].to_numpy()
  test_x  = test.iloc[:,:-1].to_numpy()
  test_y  = test.iloc[:,-1].to_numpy()
  wt=linear_regression(train_x,train_y,val_x,val_y)
  models_smote.append(wt)


In [46]:
# Accuracy of Data set
y_lables=[]
df=pd.read_csv("Iris.csv")
for data in df['Species']:
  for i in range(3):
    if data==classes[i]:
      y_lables.append(i)
      break


In [47]:
df=standard_scaler(df)
x=np.array(df.drop('Species',axis=1))
x=x.T
x= np.vstack((np.ones((1, x.shape[1])),x))


In [48]:
#Replication
rep_outputs=[]
for i in range(3):
  rep_outputs.append(x.T@models_replication[i])
rep_labels=[]
for i in range(x.shape[1]):
  label=0
  val=rep_outputs[0][i]
  for j in range(1,3):
    if rep_outputs[j][i]>val:
      label=j
      val=rep_outputs[j][i]
  rep_labels.append(label)

In [49]:
n=len(rep_labels)
correct=0
for i in range(n):
  if(rep_labels[i]==y_lables[i]):
    correct+=1
print((correct/n)*100)

88.66666666666667


In [50]:
#SMOTE
smote_outputs=[]
for i in range(3):
  smote_outputs.append(x.T@models_smote[i])
smote_labels=[]
for i in range(x.shape[1]):
  label=0
  val=smote_outputs[0][i]
  for j in range(1,3):
    if smote_outputs[j][i]>val:
      label=j
      val=smote_outputs[j][i]
  smote_labels.append(label)

In [51]:
n=len(smote_labels)
correct=0
for i in range(n):
  if(smote_labels[i]==y_lables[i]):
    correct+=1
print((correct/n)*100)

92.0
