# Bagging

In [3]:
import numpy as np
from sklearn import model_selection
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris

  return f(*args, **kwds)


In [4]:
iris = load_iris()
X = iris.data
Y = iris.target

In [5]:
X_fit, X_eval, Y_fit, Y_test = model_selection.train_test_split(X,Y,test_size = 0.3,random_state=1)

In [6]:
seed = 7
kfold = model_selection.KFold(n_splits=5,random_state=7)
kfold

KFold(n_splits=5, random_state=7, shuffle=False)

In [7]:
cart = DecisionTreeClassifier()
num_trees = 100

In [8]:
model = BaggingClassifier(base_estimator=cart, n_estimators=num_trees,random_state=seed)
model

BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=1.0, n_estimators=100, n_jobs=1, oob_score=False,
         random_state=7, verbose=0, warm_start=False)

In [9]:
results = model_selection.cross_val_score(model, X_fit, Y_fit, cv=kfold)
results

array([1.        , 0.95238095, 1.        , 0.9047619 , 0.85714286])

In [10]:
x = [print("model ",i," accuracy ",results[i]) for i in range(0,len(results))]

model  0  accuracy  1.0
model  1  accuracy  0.9523809523809523
model  2  accuracy  1.0
model  3  accuracy  0.9047619047619048
model  4  accuracy  0.8571428571428571


In [11]:
print("Average accuracy ",results.mean())

Average accuracy  0.9428571428571428


# Boosting

In [12]:
from sklearn.ensemble import AdaBoostClassifier

In [13]:
iris = load_iris()
X = iris.data
Y = iris.target

In [15]:
X_fit, X_eval, Y_fit, Y_test = model_selection.train_test_split(X,Y,test_size = 0.20, random_state=1)

In [16]:
cart = DecisionTreeClassifier()
num_trees = 25

In [17]:
model = AdaBoostClassifier(base_estimator=cart, n_estimators=num_trees, learning_rate=0.1)
model

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best'),
          learning_rate=0.1, n_estimators=25, random_state=None)

In [30]:
model.staged_score(X_fit,Y_fit)

<generator object BaseWeightBoosting.staged_score at 0x107c5c620>

In [34]:
pred_label = model.predict(X_eval)
nnz = np.float(np.shape(Y_test)[0] - np.count_nonzero(pred_label - Y_test))
acc = 100*nnz/np.shape(Y_test)[0]

In [36]:
acc

96.66666666666667

# Stacking

In [37]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [40]:
iris = load_iris()

In [42]:
X,y = iris.data[:,1:3], iris.target

In [43]:
def CalculateAccuracy(y_test,pred_label):
    nnz = np.shape(y_test)[0] - np.count_nonzero(pred_label - y_test)
    acc = 100*nnz/float(np.shape(y_test)[0])
    return acc

In [44]:
clf1 = KNeighborsClassifier(n_neighbors=2)
clf2 = RandomForestClassifier(n_estimators=2, random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()

In [45]:
clf1.fit(X,y)
clf2.fit(X,y)
clf3.fit(X,y)

GaussianNB(priors=None)

In [51]:
f1 = clf1.predict(X)
acc1 = CalculateAccuracy(y,f1)
acc1

96.66666666666667

In [52]:
f2 = clf2.predict(X)
acc2 = CalculateAccuracy(y,f2)
acc2

94.66666666666667

In [53]:
f3 = clf3.predict(X)
acc3 = CalculateAccuracy(y,f3)
acc3

92.0

In [57]:
f = [f1,f2,f3]
f = np.transpose(f)

In [59]:
lr.fit(f,y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [60]:
final = lr.predict(f)

In [61]:
acc4 = CalculateAccuracy(y,final)

In [62]:
acc4

97.33333333333333

# Decision Trees

In [1]:
dataset = {'Name':['Person 1','Person 2','Person 3','Person 4','Person 5','Person 6','Person 7','Person 8','Person 9','Person 10'],
           'Salary':['Low','Med','Med','Med','Med','High','Low','High','Med','Low'],
           'Sex':['Male','Male','Male','Female','Male','Female','Female','Male','Female','Male'],
           'Marital':['Unmarried','Unmarried','Married','Married','Married','Unmarried','Unmarried','Unmarried','Unmarried','Married'],
           'Class':['No','No','Yes','No','Yes','Yes','No','Yes','Yes','Yes']}

In [2]:
import pandas as pd
df = pd.DataFrame(dataset)

In [3]:
df

Unnamed: 0,Name,Salary,Sex,Marital,Class
0,Person 1,Low,Male,Unmarried,No
1,Person 2,Med,Male,Unmarried,No
2,Person 3,Med,Male,Married,Yes
3,Person 4,Med,Female,Married,No
4,Person 5,Med,Male,Married,Yes
5,Person 6,High,Female,Unmarried,Yes
6,Person 7,Low,Female,Unmarried,No
7,Person 8,High,Male,Unmarried,Yes
8,Person 9,Med,Female,Unmarried,Yes
9,Person 10,Low,Male,Married,Yes


### Entropy of classes
1. Degree of randomness or uncertainity
2. Degree of variance
3. Class variance
4. Measure of impurity (if all instance are same class then entropy is 0, pure dataset)

### Mutual Information
1. Its a measure to select the most informative attribute
2. Select an attribute that is used to reduce entropy

In [7]:
import numpy as np

In [35]:
def getClassEntropy(classAttributes):
    #get disctinct classes and how many time they occur
    _,counts = np.unique(classAttributes, return_counts=True)
    denom = len(classAttributes)
    entropy = 0
    #Loop to calculate entropy for dataset
    for count in counts:
        fraction = count/denom
        entropy += -fraction*np.log2(fraction)
    return entropy

In [40]:
#Entropy for our target variable - Class
for i in list(df.columns):
    print(i,"->",getClassEntropy(df[i]))

Name -> 3.321928094887362
Salary -> 1.4854752972273344
Sex -> 0.9709505944546686
Marital -> 0.9709505944546686
Class -> 0.9709505944546686


### What is Entropy?

What is entropy, fundamentally?

Entropy deals with the number of "microstates" that are consistent with a given "macrostate". For example, if I tell you only macroscopic properties of a container of gas (e.g., volume, pressure, energy, number of molecules, etc.), there are still lots of different ways of arranging the individual molecules (in terms of position, momentum, angular momentum, vibrations, etc.) that would be "allowed". Entropy talks about how many such states there are.

What happens when we talk about the combination of two systems?

Say we have two containers of gas, and we know the macroscopic properties of both. There are N1 possible "microstates" for the first container, and N2 for the second. What are the possible microstates for the system that consists of both containers? Well, for each microstate of one container, all of the microstates of the other are still allowed. We can literally pick one of the allowed microstates for each container, and that combination will be a valid microstate for the whole system. So, the total number of microstates is given by the product:

Ntot=N1⋅N2.

How would we like to be able to talk about combinations of systems?

Clearly, when you include more things in your system, the number of microstates increases. There are lots of things that act like this: total mass, total volume, etc. There's a broad class of these called extensive properties (see: Intensive and extensive properties), which are very convenient to work with. They're convenient, because to consider the combination of two sub-systems, you just have to add the relevant quantities together (like for mass, volume, energy, etc.) Unfortunately, number of microstates doesn't behave that way. As shown above, the relevant quantities need to be multiplied, not added, which is harder to work with.

How do we solve this problem?

It's a basic property of logarithms that

log(ab)=log(a)+log(b),

i.e., it turns multiplication into addition. Applying this to our microstates example, we get that

log(N1N2)=log(N1)+log(N2).

So, if we define entropy to be proportional to the log (in any base) of the number of microstates, then entropy suddenly becomes an extensive property of a system, and thus much easier to work with.

### Why is entropy measured in log?

It's because entropy is a type of information, and the easiest way to measure information is in bits and bytes, rather than by the total number of possible states they can represent.

Entropy is the amount of information contained in the microscopic state of a system which is missing in the approximate representation of that system using macroscopic thermodynamics.

The basic unit of information is the bit, which represents 2 possible states.  If you have n bits, then that information represents 2^n possible states.  For example, a byte is 8 bits, therefore the number of states it represents is 28=256.  That means that a byte can store any number between 0 and 255.  If you are given the total number of states, then you just take the log of that number to get the amount of information, measured in bits:  log2256=8.  

So entropy is defined as the log of the number of total microscopic states corresponding to a particular macro state of thermodynamics.  This is the additional information you'd need to know in order to completely specify the microstate, given knowledge of the macrostate.  

Ok, but then you might ask:  why is information measured with logarithms instead of just by the total number of states?  Mostly because it makes it additive.  It's true that if you really wanted to, you could choose to measure information or entropy by the total number of states (usually called the "multiplicity"),  instead of by the log of the multiplicity.  But then it would be multiplicative instead of additive.  If you have 10 bits and then you obtain another 10 bits of information, then you have 20 bits.  Saying the same thing in terms of multiplicity:  if you have 2^10 = 1024 states and then you add another 1024 independent states then you have 1024*1024 = 1048576 states (2^20) when they are combined.  Multiplicity is multiplicative instead of additative, which means that the numbers you need in order to keep track of it get very large very quickly!  This is really inconvenient, hence why we usually stick with using information/entropy as the unit instead of multiplicity.

The only funny thing you might notice here is that in computer science, information is usually measured in units defined by the log base 2 of the number of states, whereas in physics entropy is usually measured in units defined by the natural log (log base e).  This is purely a difference of convention.  Physicists like to use natural logs because they're used to using them for many other things and they have nice mathematical properties.  But there's a good case to be made that bits (log base 2) is the more natural unit to measure entropy in.  I wouldn't be too surprised if, in the future, it becomes more common for physicists to switch to this convention.  (In some areas of physics, such as quantum computing and quantum information, this new convention has already started being adopted.)

In [52]:
def getHistTable(df, attribute):
    value = df[attribute]
    classes = df['Class']
    classunique = df['Class'].unique()
    valunique = df[attribute].unique()
    temp = np.zeros((len(classunique),len(valunique)))
    histTable = pd.DataFrame(temp, index = classunique, columns = valunique)
    
    for i in range(len(classes)):
        histTable[value[i]][classes[i]]+=1
    return histTable

In [53]:
getHistTable(df,'Salary')

Unnamed: 0,Low,Med,High
No,2.0,2.0,0.0
Yes,1.0,3.0,2.0
