In [34]:
import numpy as np  
import matplotlib.pyplot as plt  
import pandas as pd  
from sklearn.preprocessing import Binarizer, LabelEncoder, OneHotEncoder
from sklearn.cross_validation import train_test_split
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

In [2]:
data = pd.read_csv('sales_loss_win_data.csv')

In [3]:
data = data.drop('Opportunity Number', axis =1 )

In [8]:
le = LabelEncoder()

In [9]:
class MultiColumnLabelEncoder:
    def __init__(self,columns = None):
        self.columns = columns # array of column names to encode

    def fit(self,X,y=None):
        return self # not relevant here

    def transform(self,X):
        '''
        Transforms columns of X specified in self.columns using
        LabelEncoder(). If no columns specified, transforms all
        columns in X.
        '''
        output = X.copy()
        if self.columns is not None:
            for col in self.columns:
                output[col] = LabelEncoder().fit_transform(output[col])
        else:
            for colname,col in output.iteritems():
                output[colname] = LabelEncoder().fit_transform(col)
        return output

    def fit_transform(self,X,y=None):
        return self.fit(X,y).transform(X)

In [10]:
final_data = MultiColumnLabelEncoder(columns = ['Supplies Subgroup', 'Supplies Group', 'Region', 'Route To Market', 
                                  'Opportunity Result', 'Competitor Type']).fit_transform(data)

## k-nearest neighbors

The principle behind nearest neighbor methods is to find a predefined number of training samples closest in distance to the new point, and predict the label from these. The number of samples can be a user-defined constant (k-nearest neighbor learning), or vary based on the local density of points (radius-based neighbor learning). The distance can, in general, be any metric measure: standard Euclidean distance is the most common choice. Neighbors-based methods are known as non-generalizing machine learning methods, since they simply “remember” all of its training data.

Source: https://scikit-learn.org/stable/modules/neighbors.html#neighbors

In [12]:
yVar = final_data['Opportunity Result']
xVar = final_data.loc[:, final_data.columns != 'Opportunity Result']

In [13]:
X_train, X_test, y_train, y_test = train_test_split(xVar, yVar, test_size=0.2)
print (X_train.shape, y_train.shape)
print (X_test.shape, y_test.shape)

(62420, 17) (62420,)
(15605, 17) (15605,)


In [26]:
classifier = KNeighborsClassifier(n_neighbors=5)  
model = classifier.fit(X_train, y_train)  

In [36]:
y_pred = model.predict(X_test)  

In [38]:
pd.crosstab(y_test, y_pred, rownames=['Actual Result'], colnames=['Predicted Result'])

Predicted Result,0,1
Actual Result,Unnamed: 1_level_1,Unnamed: 2_level_1
0,11025,1035
1,1823,1722


In [37]:
print ("Accuracy is ", accuracy_score(y_test,y_pred)*100)

Accuracy is  81.6853572573


## Bagging

In ensemble algorithms, bagging methods form a class of algorithms which build several instances of a black-box estimator on random subsets of the original training set and then aggregate their individual predictions to form a final prediction. These methods are used as a way to reduce the variance of a base estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it. In many cases, bagging methods constitute a very simple way to improve with respect to a single model, without making it necessary to adapt the underlying base algorithm. As they provide a way to reduce overfitting, bagging methods work best with strong and complex models (e.g., fully developed decision trees), in contrast with boosting methods which usually work best with weak models (e.g., shallow decision trees).

In [24]:
bagging = BaggingClassifier(classifier, max_samples=0.5, max_features=0.5)

In [29]:
bm = bagging.fit(X_train, y_train)  

In [31]:
y_pred_bm = bm.predict(X_test) 

In [32]:
print(pd.crosstab(y_test, y_pred_bm, rownames=['Actual Result'], colnames=['Predicted Result']))  

Predicted Result      0     1
Actual Result                
0                 11786   274
1                  2182  1363


In [35]:
print ("Accuracy is ", accuracy_score(y_test,y_pred_bm)*100)

Accuracy is  84.261454662
