In [200]:
import csv
import sys

from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
import pandas as pd
import numpy as np

TEST_SIZE = 0.4

def main():

    # Check command-line arguments
    if len(sys.argv) != 2:
        sys.exit("Usage: python shopping.py data")

    # Load data from spreadsheet and split into train and test sets
    evidence, labels = load_data(sys.argv[1])
    X_train, X_test, y_train, y_test = train_test_split(
        evidence, labels, test_size=TEST_SIZE
    )

    # Train model and make predictions
    model = train_model(X_train, y_train)
    predictions = model.predict(X_test)
    sensitivity, specificity = evaluate(y_test, predictions)

    # Print results
    print(f"Correct: {(y_test == predictions).sum()}")
    print(f"Incorrect: {(y_test != predictions).sum()}")
    print(f"True Positive Rate: {100 * sensitivity:.2f}%")
    print(f"True Negative Rate: {100 * specificity:.2f}%")


def load_data(filename):
    """
    Load shopping data from a CSV file `filename` and convert into a list of
    evidence lists and a list of labels. Return a tuple (evidence, labels).
    evidence should be a list of lists, where each list contains the
    following values, in order:
        - Administrative, an integer
        - Administrative_Duration, a floating point number
        - Informational, an integer
        - Informational_Duration, a floating point number
        - ProductRelated, an integer
        - ProductRelated_Duration, a floating point number
        - BounceRates, a floating point number
        - ExitRates, a floating point number
        - PageValues, a floating point number
        - SpecialDay, a floating point number
        - Month, an index from 0 (January) to 11 (December)
        - OperatingSystems, an integer
        - Browser, an integer
        - Region, an integer
        - TrafficType, an integer
        - VisitorType, an integer 0 (not returning) or 1 (returning)
        - Weekend, an integer 0 (if false) or 1 (if true)

    labels should be the corresponding list of labels, where each label
    is 1 if Revenue is true, and 0 otherwise.
    """
    df = pd.read_csv(filename)
    #
    df.Administrative = df.Administrative.astype('int')
    df.Informational = df.Informational.astype('int')
    df.ProductRelated = df.ProductRelated.astype('int')
    df['Month'] = df['Month'].map({'Feb':2, 'Mar':3, 'May':5, 'Oct':10, 'June':6, 'Jul':7, 'Aug':8, \
                                   'Nov':11, 'Sep':10,'Dec':12})
    df.OperatingSystems = df.OperatingSystems.astype('int')
    df.Browser = df.Browser.astype('int')
    df.Region = df.Region.astype('int')
    df.TrafficType = df.TrafficType.astype('int')
    df.TrafficType = df.TrafficType.astype('int')
    df['VisitorType'] = df['VisitorType'].map({'Returning_Visitor':1, 'New_Visitor':2, 'Other':3})
    df['Weekend'] = df['Weekend'].map({True:1, False:0})
    df['Revenue'] = df['Revenue'].map({True:1, False:0})
    
    evidence = df.drop('Revenue', axis=1).values
    labels = df.Revenue
    
    return evidence, labels


def train_model(evidence, labels):
    """
    Given a list of evidence lists and a list of labels, return a
    fitted k-nearest neighbor model (k=1) trained on the data.
    """    
    return KNeighborsClassifier(n_neighbors=1).fit(evidence, labels)
    

def evaluate(labels, predictions):
    """
    Given a list of actual labels and a list of predicted labels,
    return a tuple (sensitivity, specificty).

    Assume each label is either a 1 (positive) or 0 (negative).

    `sensitivity` should be a floating-point value from 0 to 1
    representing the "true positive rate": the proportion of
    actual positive labels that were accurately identified.

    `specificity` should be a floating-point value from 0 to 1
    representing the "true negative rate": the proportion of
    actual negative labels that were accurately identified.
    """
    tpr = sum(labels == 1)
    tnr = sum(labels == 0)
    
    return (tpr / sum(labels == predictions)), (tnr / sum(labels == predictions))


filename = 'shopping.csv'
evidence, labels = load_data(filename)

cls = train_model(evidence, labels)

pred = cls.predict(evidence)

evaluate(labels, pred)

(0.15474452554744525, 0.8452554744525548)

In [201]:
true_positive = 0.000000001
true_negative = 0.000000001
false_negative = 0.000000001
false_positive = 0.000000001

for i in zip(labels, pred):
    if sum(i) == 2:
        true_positive += 1
    elif sum(i) == 0:
        false_negative += 1
    elif i[0] == 1 or sum(i) == 1:
        true_negative += 1
    elif i[0] == 0 or sum(i) == 1:
        false_positive += 1
    else:
        print('no values')
    
        
sensivity = true_positive / (true_positive + false_negative)

specificity = true_negative / (true_negative + false_positive)

        
true_positive, true_negative, false_negative, false_positive, sensivity, specificity

(1908.000000001, 1e-09, 10422.000000001, 1e-09, 0.15474452554750123, 0.5)

In [202]:
confusion_matrix(labels, pred)

array([[10422,     0],
       [    0,  1908]])