In [3]:
import pandas as pd
import numpy as np
from pprint import pprint

In [4]:
col_names=['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol','quality']
data = pd.read_csv("winequality-red.csv",sep=';')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
data.loc[data["quality"] < 5,"quality"]=0
data.loc[data["quality"] == 5,"quality"]=1
data.loc[data["quality"] == 6,"quality"]=1
data.loc[data["quality"] >6,"quality"]=2
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,1
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,1
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,1
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,1


In [6]:
def normalize(x):
    new_x=(x-np.mean(x))/np.std(x)
    return new_x

In [7]:
data.iloc[:,0:11]=data.iloc[:,0:11].apply(normalize)
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,1
1,-0.298547,1.967442,-1.391472,0.043416,0.223875,0.872638,0.624363,0.028261,-0.719933,0.12895,-0.584777,1
2,-0.298547,1.297065,-1.18607,-0.169427,0.096353,-0.083669,0.229047,0.134264,-0.331177,-0.048089,-0.584777,1
3,1.654856,-1.384443,1.484154,-0.453218,-0.26496,0.107592,0.4115,0.664277,-0.979104,-0.46118,-0.584777,1
4,-0.52836,0.961877,-1.391472,-0.453218,-0.243707,-0.466193,-0.379133,0.558274,1.288643,-0.579207,-0.960246,1


In [8]:
bin_labels_4=[0,1,2,3]
for i in range(0,11):
    data.iloc[:,i]=pd.qcut(data.iloc[:,i],q=4,labels=bin_labels_4)
data.head()


Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,1,3,0,0,1,1,1,2,3,1,0,1
1,1,3,0,2,3,3,3,2,0,2,1,1
2,1,3,0,2,3,2,2,2,1,2,1,1
3,3,0,3,0,1,2,2,3,0,1,1,1
4,1,3,0,0,1,1,1,2,3,1,0,1


In [9]:
traincount=int(data.shape[0]*0.8)
traincount

1279

In [10]:
def train_test_split(data):
    training_data=data.iloc[:traincount].reset_index(drop=True)
    testing_data=data.iloc[traincount:].reset_index(drop=True)
    return training_data,testing_data
training_data=train_test_split(data)[0]
testing_data=train_test_split(data)[1]
testing_data.shape

(320, 12)

In [11]:
training_data.shape

(1279, 12)

In [12]:
#Entropy calculation
def entropy(class_label):
    values,counts=np.unique(class_label,return_counts=True)
 
    for i in range(len(values)):
        entropy=np.sum([(-counts[i]/np.sum(counts))*np.log2(counts[i]/np.sum(counts))])
    return entropy

In [13]:
#info gain
def InfoGain(data,split_attribute_name,class_label="equality"):
    total_entropy=entropy(data[class_label])
    vals,counts=np.unique(data[split_attribute_name],return_counts=True)
    #Calculate the weighted entropy
    for i in range(len(vals)):
        Weighted_Entropy=np.sum([(counts[i]/np.sum(counts))*entropy(data.where(data[split_attribute_name]==vals[i]).dropna()[class_label])])
 
    #formula for information gain
    Information_Gain=total_entropy-Weighted_Entropy
    return Information_Gain


In [14]:
def ID3(data,originaldata,features,class_label="quality",parent_node_class=None):
    #if all class_label values are same, return that value
    if len(np.unique(data[class_label]))<=1:
        return np.unique(data[class_label])[0]
    
    #if the dataset is empty or below some threshold value,terminate recursion
    elif len(data)==0:
        #find the counts of distinct values of class_label, then find the maximum count of them--> majority class label
        return np.unique(originaldata[class_label])[np.argmax(np.unique(originaldata[class_label],return_counts=True)[1])]
    
    #if the feature space is empty,terminate recursion
    elif len(features)==0:
        return parent_node_class
    
    #If none of the above condition holds true form the subtrees
    
    else:
        #Find the counts of distinct values of class_label, then find the maximum count of them-->majority class label
        parent_node_class=np.unique(data[class_label])[np.argmax(np.unique(data[class_label],return_counts=True)[1])]
        
    #Select the feature which best splits the dataset, feature having maximum information gain
    
    for feature in features:
        item_values=[InfoGain(data,feature,class_label)] #Return the infogain values
    best_feature_index=np.argmax(item_values)
    best_feature=features[best_feature_index]
    
    #Create the tree structure as a nested dictionary
    tree={best_feature:{}}
    
    #Remove the feature with the best info gain
    features=[i for i in features if i!=best_feature]
    
    #Form subtrees down the root node by calling ID3 recursively
    
    for value in np.unique(data[best_feature]):
        value=value
        sub_data=data.where(data[best_feature]==value).dropna()
        #call the ID3 algorthm
        subtree=ID3(sub_data,data,features,class_label,parent_node_class)
        #Add the subtree
        tree[best_feature][value]=subtree
    return(tree)

In [15]:

tree = ID3(training_data,training_data,training_data.columns[:-1])
pprint(tree)

{'fixed acidity': {0: {'volatile acidity': {0: {'citric acid': {0: 1.0,
                                                                1: {'residual sugar': {0: 1.0,
                                                                                       1: {'chlorides': {0: 1.0,
                                                                                                         3: {'free sulfur dioxide': {0: 0.0,
                                                                                                                                     2: 1.0}}}},
                                                                                       2: 1.0,
                                                                                       3: 1.0}},
                                                                2: {'residual sugar': {0: {'chlorides': {0: {'free sulfur dioxide': {0: 2.0,
                                                                                                     

                                                                                       3: {'chlorides': {2: {'free sulfur dioxide': {0: 1.0,
                                                                                                                                     1: 2.0}},
                                                                                                         3: 1.0}}}}}},
                                            1: {'citric acid': {0: {'residual sugar': {0: 1.0,
                                                                                       2: 2.0}},
                                                                1: 1.0,
                                                                2: {'residual sugar': {0: {'chlorides': {0: {'free sulfur dioxide': {0: 2.0,
                                                                                                                                     1: 1.0,
                                                   

                                                                                                         3: {'free sulfur dioxide': {0: 1.0,
                                                                                                                                     1: {'total sulfur dioxide': {1: 1.0,
                                                                                                                                                                  2: 2.0}},
                                                                                                                                     2: 1.0}}}},
                                                                                       3: 1.0}},
                                                                3: {'residual sugar': {0: {'chlorides': {0: 1.0,
                                                                                                         1: {'free sulfur dioxide': {0: 1.0,
                     

In [16]:
#Predict the Result
def predict(query,tree,default=1):
    for key in list(query.keys()):
        if key in list(tree.keys()):
            try:
                result=tree[key][query[key]]
            except:
                return default
            result = tree[key][query[key]]
 
            if isinstance(result,dict):
               return predict(query,result)
            else:
                return result

In [17]:
def test(data,tree):
 
    queries = data.iloc[:,:-1].to_dict(orient = "records")
 
    predicted = pd.DataFrame(columns=["predicted"])
    
    for i in range(len(data)):
        predicted.loc[i,"predicted"] = predict(queries[i],tree,1.0) 
    print('The prediction accuracy is: ',(np.sum(predicted["predicted"] == data["quality"])/len(data))*100,'%')
    



In [18]:
test(testing_data,tree)


The prediction accuracy is:  83.75 %


In [19]:
from sklearn.tree import DecisionTreeClassifier


In [20]:
train_features = data.iloc[:traincount,:-1]
test_features = data.iloc[traincount:,:-1]
train_targets = data.iloc[:traincount,-1]
test_targets = data.iloc[traincount:,-1]


In [21]:
tree = DecisionTreeClassifier(criterion = 'entropy').fit(train_features,train_targets)

In [22]:
prediction = tree.predict(test_features)
prediction


array([2, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0,
       1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 2, 1, 1, 1, 1, 2,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 2, 0, 2, 0, 1, 1, 1, 1, 1, 2, 1,
       2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 1, 1, 2, 0,
       2, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 1, 2,
       1, 2, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 0, 1,

In [23]:
print("The prediction accuracy is: ",tree.score(test_features,test_targets)*100,"%")


The prediction accuracy is:  75.3125 %


In [24]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report



In [25]:
x = data.drop('quality',axis=1)
y = data['quality']

In [26]:
x_train,x_test,y_train,y_test = train_test_split(x,y, test_size=0.20)

In [27]:
classifier=DecisionTreeClassifier()
classifier.fit(x_train,y_train)

DecisionTreeClassifier()

In [28]:
y_pred=classifier.predict(x_test)

In [31]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.16      0.14      0.15        21
           1       0.85      0.85      0.85       255
           2       0.49      0.50      0.49        44

    accuracy                           0.76       320
   macro avg       0.50      0.50      0.50       320
weighted avg       0.76      0.76      0.76       320



In [32]:
target=list(data['quality'].unique())
feature_names=list(x.columns)
from sklearn.tree import export_text
r=export_text(classifier,feature_names=feature_names)
print(r)

|--- alcohol <= 2.50
|   |--- volatile acidity <= 0.50
|   |   |--- sulphates <= 1.50
|   |   |   |--- chlorides <= 0.50
|   |   |   |   |--- residual sugar <= 0.50
|   |   |   |   |   |--- pH <= 1.50
|   |   |   |   |   |   |--- sulphates <= 0.50
|   |   |   |   |   |   |   |--- class: 1
|   |   |   |   |   |   |--- sulphates >  0.50
|   |   |   |   |   |   |   |--- class: 2
|   |   |   |   |   |--- pH >  1.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |   |--- residual sugar >  0.50
|   |   |   |   |   |--- residual sugar <= 1.50
|   |   |   |   |   |   |--- class: 0
|   |   |   |   |   |--- residual sugar >  1.50
|   |   |   |   |   |   |--- class: 1
|   |   |   |--- chlorides >  0.50
|   |   |   |   |--- class: 1
|   |   |--- sulphates >  1.50
|   |   |   |--- alcohol <= 1.50
|   |   |   |   |--- citric acid <= 0.50
|   |   |   |   |   |--- class: 2
|   |   |   |   |--- citric acid >  0.50
|   |   |   |   |   |--- chlorides <= 1.50
|   |   |   |   |   |   |--- chlorides <= 0