In [1]:
import numpy as np
import pandas as pd
import sklearn
import matplotlib.pyplot as plt
import os
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import mean_squared_error

In [2]:
# Load data
SD1 = pd.read_csv("SD1.txt", sep="\t", header = 0)
SD2 = pd.read_csv("SD2.txt", sep="\t", header = 0)

# Clean up- here remove some unwanted whitespaces
SD1['Gene'] = SD1['Gene'].replace(' ','')
SD2['Gene'] = SD2['Gene'].replace(' ','')

In [3]:
def makeClass(a):
    A = a['ESC_RPKM']
    B = a['EB_RPKM']
    y = []
    for i in range(len(A)):
        if float(A[i]) < 10 and float(B[i]) < 10:  
            y.append('N')
        elif float(A[i]) > float(B[i]) * 2:
            y.append('E')
        elif float(A[i]) * 2 < float(B[i]):
            y.append('D')
        else :
            y.append('X')
    return y

In [4]:
y = pd.DataFrame(makeClass(SD1))
SD2['y'] = y
SD2_table = SD2[SD2['y'] != 'N']
for i in SD2_table.columns[2:-2]:
    SD2_table[pd.to_numeric(SD2_table[i], errors='coerce').notnull()]


In [5]:
x = SD2_table.drop("y", axis=1).iloc[:,2:].astype(float)
x = x[x.applymap(np.isreal)]
y = SD2_table["y"]
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=0)

# Set a learning rate
l_rate = 0.01

# Initialise a Boosting model using sklearn's boosting model 
# Use 1000 estimators, depth of 1 and learning rate as defined above
boosted_model  = GradientBoostingClassifier(n_estimators=1000, max_depth=1, learning_rate = l_rate)

# Fit on the train data
boosted_model.fit(x_train, y_train)

# Predict on the test data
y_pred = boosted_model.predict(x_test)


In [6]:
# Specify the number of bootstraps
num_bootstraps = 30

# Specify the maximum depth of the decision tree
max_depth = 100

# Define the Bagging Model
# Use Decision Tree as your base estimator with depth as mentioned in max_depth
# Initialise number of estimators using the num_bootstraps value
# Set max_samples as 1 and random_state as 3
model = BaggingClassifier(base_estimator=tree.DecisionTreeClassifier(), n_estimators=num_bootstraps, max_samples=1, random_state=3)
                        

# Fit the model on the train data
model.fit(x_train, y_train)


BaggingClassifier(base_estimator=DecisionTreeClassifier(), max_samples=1,
                  n_estimators=30, random_state=3)

In [9]:
boosted_model.score(x_test, y_test)
model.score(x_test, y_test)

0.6593285635663181

In [178]:
from sklearn.ensemble import AdaBoostClassifier
from matplotlib.colors import ListedColormap
from sklearn.tree import DecisionTreeClassifier
import seaborn as sns
%matplotlib inline
sns.set_style('white')

In [180]:
def plot_decision_boundary(classifier, X, y, N = 10, scatter_weights = np.ones(len(y)) , ax = None,counter=0,label=False):
    '''Utility function to plot decision boundary and scatter plot of data'''
    x_min, x_max = X[:, 0].min() - .1, X[:, 0].max() + .1
    y_min, y_max = X[:, 1].min() - .1, X[:, 1].max() + .1
    xx, yy = np.meshgrid( np.linspace(x_min, x_max, N), np.linspace(y_min, y_max, N))
    cmap = ListedColormap(["#ABCCE3","#50AEA4"])
    
    #Check what methods are available
    if hasattr(classifier, "decision_function"):
        zz = np.array( [classifier.decision_function(np.array([xi,yi]).reshape(1,-1)) for  xi, yi in zip(np.ravel(xx), np.ravel(yy)) ] )
    elif hasattr(classifier, "predict_proba"):
        zz = np.array( [classifier.predict_proba(np.array([xi,yi]).reshape(1,-1))[:,1] for  xi, yi in zip(np.ravel(xx), np.ravel(yy)) ] )
    else :
        zz = np.array( [classifier(np.array([xi,yi]).reshape(1,-1)) for  xi, yi in zip(np.ravel(xx), np.ravel(yy)) ] )
            
    # reshape result and plot
    Z = zz.reshape(xx.shape)
    cm_bright = ListedColormap(["#EFAEA4","#F6345E"])
    
    #Get current axis and plot
    if ax is None:
        ax = plt.gca()
    ax.contourf(xx, yy, Z,cmap=cmap,alpha=0.4)
    if label:
        ax.scatter(X[:,0],X[:,1], c = y, cmap = cm_bright, s = scatter_weights * 40, edgecolors='k',label = f'Stump {counter}')
        ax.legend(fontsize=16)
    else:
        ax.scatter(X[:,0],X[:,1], c = y, cmap = cm_bright, s = scatter_weights * 40, edgecolors='k',label = f'Stump {counter}')
    ax.set_xlabel('$Latitude$', fontsize=14)
    ax.set_ylabel('$Longitude$', fontsize=14)
    ax.set_title(f'Stump {counter+1} decision boundary',fontsize=16)

In [182]:
boost = AdaBoostClassifier( base_estimator = DecisionTreeClassifier(max_depth = 1), 
                            algorithm = 'SAMME', n_estimators=9)
# x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.75, random_state=0)
# Fit on the entire data
boost.fit(x, y)

# Call the plot_decision_boundary function to plot the decision boundary of the model 
plot_decision_boundary(boost, x, y, N = 50)

plt.title('AdaBoost Decision Boundary', fontsize=16)
plt.show()

TypeError: '(slice(None, None, None), 0)' is an invalid key