In [1]:
import numpy as np
from pandas import read_csv
from scipy.stats import f
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import metrics

In [3]:
data=np.array(read_csv("/content/drive/MyDrive/Colab Notebooks/Data Modelling Lab II/winequality-red.csv"))
X=data[:,:11]
Y=data[:,11]
alpha=0.05
k=8

In [5]:
def anova(X,Y):
  labels=list(set(Y))
  n_c=len(labels)
  N,n=X.shape
  f_score=np.zeros(n)
  for k in range(n):
    feature=X[:,k]
    SSB,SSE=0,0
    grand_mean=sum(feature)/N
    for idx,c in enumerate(labels):
      samples=[feature[i] for i in range(N) if Y[i]==c]
      n_samples=len(samples)
      sample_mean=sum(samples)/n_samples
      SSB+=n_samples*(sample_mean-grand_mean)**2
      SSE+=sum([(samples[p]-sample_mean)**2 for p in range(n_samples)])
    f_score[k]=(SSB*(N-n_c))/((n_c-1)*SSE)
  return f_score,n_c-1,N-n_c

def k_best(X,f_score,k,dof_Nr,dof_Dr,alpha=0.05):
  index=np.argsort(-f_score)
  red_ind=[]
  f_critical=f.ppf(1-alpha,dof_Nr,dof_Dr)
  for idx in range(k):
    if f_score[index[idx]]>f_critical:
      red_ind.append(index[idx])
    else:
      print("Not enough features")
      break
  red_ind=np.sort(red_ind)
  X_reduced=X[:,[red_ind]].reshape(X.shape[0],len(red_ind))
  print("The f-critical value is: ",f_critical)
  print("The f-score for the sysytem is: ",f_score)
  return X_reduced

In [6]:
f_score,dof_Nr,dof_Dr=anova(X,Y)
X_reduced=k_best(X,f_score,k,dof_Nr,dof_Dr,alpha)

The f-critical value is:  2.2197153772719145
The f-score for the sysytem is:  [  6.28308116  60.91399283  19.69066447   1.05337358   6.03563859
   4.7542331   25.47850952  13.3963569    4.3417643   22.27337609
 115.85479747]


In [19]:
x_train,x_test,y_train,y_test=train_test_split(X,Y,test_size=0.3)
x_train_red,x_test_red,y_train_red,y_test_red=train_test_split(X_reduced,Y,test_size=0.3)

In [20]:
clf=DecisionTreeClassifier()
clf.fit(x_train,y_train)
y_pred=clf.predict(x_test)
print("Accuracy with original dataset: ",metrics.accuracy_score(y_test,y_pred))

Accuracy with original dataset:  0.6104166666666667


In [21]:
clf_red=DecisionTreeClassifier()
clf_red.fit(x_train_red,y_train_red)
y_pred_red=clf_red.predict(x_test_red)
print("Accuracy with reduced dataset: ",metrics.accuracy_score(y_test_red,y_pred_red))

Accuracy with reduced dataset:  0.5958333333333333
