In [2]:
# Import the libraries we will be using
import sys
sys.path.append('/usr/local/lib/python2.7/site-packages')
import os
import numpy as np
import pandas as pd
import math
import matplotlib.patches as patches
import matplotlib.pylab as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn import metrics
from sklearn import datasets
from IPython.display import Image

%matplotlib inline

# Task 1: Build a classifier using decision tree

In [3]:
# A function that gives a visual representation of the decision tree
def Decision_Tree_Image(decision_tree, feature_names, name="temp"):
    # Export our decision tree to graphviz format
    dot_file = tree.export_graphviz(decision_tree, out_file='images/' + name + '.dot', feature_names=feature_names)
    
    # Call graphviz to make an image file from our decision tree
    os.system("dot -Tpng images/" + name + ".dot -o images/" + name + ".png")
    
    # Return the .png image so we can see it
    return Image(filename='images/' + name + '.png')

# A function to plot the data
def Plot_Data(data, v1, v2, tv):
    # Make the plot square
    plt.rcParams['figure.figsize'] = [12.0, 8.0]
    
    # Color
    color = ["red" if x == 0 else "blue" for x in data[tv]]
    
    # Plot and label
    plt.scatter(data[v1], data[v2], c=color, s=50)
    plt.xlabel(v1)
    plt.ylabel(v2)
    plt.xlim([min(data[v1]) - 1, max(data[v1]) + 1])
    plt.ylim([min(data[v2]) - .05, max(data[v2]) + .05])
    
def Decision_Surface(x, y, model, cell_size=.01):
    # Get blob sizes for shading
    x = (min(x), max(x))
    y = (min(y), max(y))
    x_step = (x[1] - x[0]) * cell_size
    y_step = (y[1] - y[0]) * cell_size

    # Create blobs
    x_values = []
    y_values = []
    
    for i in np.arange(x[0], x[1], x_step):
        for j in np.arange(y[0], y[1], y_step):
            y_values.append(float(i))
            x_values.append(float(j))
    
    data_blob = pd.DataFrame({"x": x_values, "y": y_values})

    # Predict the blob labels
    label= decision_tree.predict(data_blob)
    
    # Color and plot them
    color = ["red" if l == 0 else "blue" for l in label]
    plt.scatter(data_blob['y'], data_blob['x'], marker='o', edgecolor='black', linewidth='0', c=color, alpha=0.3)
    
    # Get the raw decision tree rules
    decision_tree_raw = []
    for feature, left_c, right_c, threshold, value in zip(decision_tree.tree_.feature, 
                                                          decision_tree.tree_.children_left, 
                                                          decision_tree.tree_.children_right, 
                                                          decision_tree.tree_.threshold, 
                                                          decision_tree.tree_.value):
        decision_tree_raw.append([feature, left_c, right_c, threshold, value])

    # Plot the data
    Plot_Data(data, "humor", "number_pets", "success")

    # Used for formatting the boundry lines
    currentAxis = plt.gca()
    line_color = "black"
    line_width = 3

    # For each rule
    for row in decision_tree_raw:
        feature, left_c, right_c, threshold, value = row

        if threshold != -2:
            if feature == 0:
                plt.plot([20, 100], [threshold, threshold], c=line_color, linewidth=line_width)
            else:
                plt.plot([threshold, threshold], [0, 5], c=line_color, linewidth=line_width)

    plt.xlim([min(x) - 1, max(x) + 1])
    plt.ylim([min(y) - .05, max(y) + .05])
    plt.show()

In [4]:
vec=pd.read_csv("homework3.csv")
vec.head()

Unnamed: 0,check_sum,compile_date,datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size,datadir_IMAGE_DIRECTORY_ENTRY_EXPORT_size,datadir_IMAGE_DIRECTORY_ENTRY_IAT_size,datadir_IMAGE_DIRECTORY_ENTRY_IMPORT_size,datadir_IMAGE_DIRECTORY_ENTRY_RESOURCE_size,debug_size,export_size,generated_check_sum,...,sec_vasize_upx3,size_code,size_image,size_initdata,size_uninit,std_section_names,total_size_pe,virtual_address,virtual_size,virtual_size_2
0,0,585810474,0,0,44,40,0,0,0,98624,...,0.0,13824,180224,43008,65536,0,85504,4096,13352,65536
1,0,1218437803,0,0,468,100,1048,0,0,53913,...,0.0,4096,20480,12288,0,1,20480,4096,3346,2182
2,98299,1297813288,0,0,372,40,1660,28,0,113512,...,0.0,36864,53248,12288,0,1,64512,4096,33504,5156
3,104924,708992537,4612,0,0,2842,6144,0,0,104924,...,0.0,67072,114688,13824,0,0,81920,4096,66596,456
4,150326,1276781438,188,154,308,180,84152,0,154,150326,...,0.0,6656,110592,89088,0,0,97280,4096,6532,2074


In [5]:
col_names = []
for s in vec.columns:
    if s != "label":
        col_names.append(s)
print col_names[0]

check_sum


In [6]:
X = vec[col_names]
Y = vec["label"]

In [30]:
from sklearn.cross_validation import train_test_split
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=2)

In [31]:
decision_tree = DecisionTreeClassifier(max_depth=3, criterion="entropy")
decision_tree.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=3,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [32]:
print "Accuracy = %.3f" % (metrics.accuracy_score(decision_tree.predict(X_test), Y_test))

Accuracy = 0.920


# Task2: 10 Most important features

In [33]:
def entropy(target):
    # Get the number of users
    n = len(target)
    # Count how frequently each unique value occurs
    counts = np.bincount(target).astype(float)
    # Initialize entropy
    entropy = 0
    # If the split is perfect, return 0
    if len(counts) <= 1 or 0 in counts:
        return entropy
    # Otherwise, for each possible value, update entropy
    for count in counts:
        entropy += math.log(count/n, len(counts)) * count/n
    # Return entropy
    return -1 * entropy

def information_gain(feature, threshold, target):
    # Dealing with numpy arrays makes this slightly easier
    target = np.array(target)
    feature = np.array(feature)
    # Cut the feature vector on the threshold
    feature = (feature < threshold)
    # Initialize information gain with the parent entropy
    ig = entropy(target)
    # For both sides of the threshold, update information gain
    for level, count in zip([0, 1], np.bincount(feature).astype(float)):
        ig -= count/len(feature) * entropy(target[feature == level])
    # Return information gain
    return ig

In [52]:
target_label = []
for i in Y:
    if i == "good":
        target_label.append(0)
    else:
        target_label.append(1)

In [55]:
feature_ranking = {}
threshold = 3.2
for i in X.columns:
    score = information_gain(X[i], threshold, target_label)
    feature_ranking[i] = score

223

In [62]:
ranked = sorted(feature_ranking.items(), lambda x, y: cmp(x[1], y[1]), reverse=True)

In [77]:
for i in range(10):
    print(ranked[i])

('debug_size', 0.46844462813835852)
('sec_rawsize_reloc', 0.3226822647613829)
('sec_rawptr_reloc', 0.30291347056088036)
('sec_vasize_reloc', 0.30291347056088036)
('datadir_IMAGE_DIRECTORY_ENTRY_BASERELOC_size', 0.28406893289906648)
('check_sum', 0.25760895169765585)
('sec_entropy_reloc', 0.1983440151760999)
('pe_minorlink', 0.19124714854527902)
('sec_vasize_text', 0.17378379914476594)
('sec_rawptr_text', 0.17378379914476594)


# Task3: Two logistic models