In [1]:
import sys

import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import classification_report

In [2]:
def build_model(classifier_fn,
                features,
                label,
                dataset,
                test_frac=0.2,
                random_state=None):
    """Train a classifier on a random train/test split and print its test metrics.

    Parameters
    ----------
    classifier_fn : callable
        Invoked as ``classifier_fn(x_train, y_train)``; must return an object
        that exposes ``predict``.
    features : list of str
        Column names selected from ``dataset`` as model inputs.
    label : str
        Column name selected from ``dataset`` as the target.
    dataset : object indexable by column name
        In this notebook, the DataFrame returned by ``pd.read_csv``.
    test_frac : float, default 0.2
        Forwarded to ``train_test_split`` as ``test_size``.
    random_state : int or None, default None
        Forwarded to ``train_test_split``. Pass an integer to get the same
        split (and therefore comparable metrics) on every run; ``None`` keeps
        the earlier behaviour of drawing a fresh split each call.

    Returns
    -------
    dict
        Keys ``'model'``, ``'x_train'``, ``'y_train'``, ``'x_test'``,
        ``'y_test'`` and ``'y_pred'``.
    """
    X = dataset[features]
    Y = dataset[label]

    x_train, x_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_frac, random_state=random_state)

    model = classifier_fn(x_train, y_train)

    # Predictions are made on the held-out portion only.
    y_pred = model.predict(x_test)

    print("Features used: ", features)
    summarize_classification(y_test, y_pred)

    return {'model': model,
            'x_train': x_train,
            'y_train': y_train,
            'x_test': x_test,
            'y_test': y_test,
            'y_pred': y_pred}

In [3]:
def summarize_classification(y_test, y_pred):
    """Print a titled classification report for the given labels.

    ``y_test`` and ``y_pred`` are passed, in that order, to
    ``classification_report``; its result is printed beneath a heading and a
    dashed rule. Nothing is returned.
    """
    rule = "------" * 10
    metrics_text = classification_report(y_test, y_pred)

    # One print per line, in display order: heading, rule, report body.
    for line in ('Classification report', rule, metrics_text):
        print(line)

In [11]:
def logistic_fn(x_train, y_train, penalty='l2', C=1.0, max_iter=1000):
    """Fit and return a ``LogisticRegression`` using the ``lbfgs`` solver.

    ``penalty``, ``C`` and ``max_iter`` are forwarded unchanged to the
    estimator's constructor; the estimator is then fitted on
    ``(x_train, y_train)``.
    """
    hyperparams = dict(penalty=penalty,
                       C=C,
                       max_iter=max_iter,
                       solver='lbfgs')

    classifier = LogisticRegression(**hyperparams)
    classifier.fit(x_train, y_train)

    return classifier

In [5]:
def decision_tree_fn(x_train, y_train, max_depth=3):
    """Fit and return a ``DecisionTreeClassifier`` limited to ``max_depth``.

    The estimator is constructed with only ``max_depth`` set and fitted on
    ``(x_train, y_train)``.
    """
    # scikit-learn's fit() returns the estimator itself, so the fitted tree
    # can be returned straight from the chained call.
    return DecisionTreeClassifier(max_depth=max_depth).fit(x_train, y_train)

In [6]:
def main():
    """Command-line entry point: train the classifier named in ``sys.argv[1]``.

    Usage: ``<script> <model_type> [feature ...]``

    ``model_type`` must be ``logistic_regression`` or ``decision_tree``. Any
    further arguments replace the default feature list. The target column is
    always ``'Clicked'``.

    When the model type is missing or not recognised, a message is printed and
    the function returns without reading the dataset or training anything.
    """
    features = ['TimeSpent', 'Age',
                'AreaIncome', 'DailyInternetUsage',
                'Male']

    # Check the argument count explicitly. The previous `except error:` named
    # an undefined identifier (so a missing argument surfaced as NameError),
    # and even a working handler would have fallen through to code that reads
    # the never-assigned `model_type`.
    if len(sys.argv) < 2:
        print("Classifier model not specified!")
        return

    model_type = sys.argv[1]

    if len(sys.argv) > 2:
        features = sys.argv[2:]

    print("Running classifier: ", model_type)

    if model_type not in ("logistic_regression", "decision_tree"):
        print("Invalid classifier model")
        return

    # Read the CSV only once the request is known to be valid.
    data = pd.read_csv('./datasets/advertising_cleaned.csv')

    if model_type == "logistic_regression":
        classifier_fn = logistic_fn
    else:
        classifier_fn = decision_tree_fn

    build_model(classifier_fn,
                features,
                'Clicked',
                data)

In [7]:
# Run the command-line entry point only when this file is executed directly.
# NOTE(review): the output recorded for this cell shows sys.argv[1] was "-f"
# when run inside the notebook -- presumably an argument from the kernel
# launcher rather than a model name; confirm before relying on main() here.
if __name__ == "__main__":
    main()
    

Running classifier:  -f
Invalid classifier model


#### Remove the cells below this heading when you download the notebook as a .py file

In [8]:
# Load the advertising dataset once; the build_model calls in the following
# cells pass this object as their `dataset` argument.
data = pd.read_csv('./datasets/advertising_cleaned.csv')

In [9]:
# Logistic regression on the five default feature columns.
build_model(
    classifier_fn=logistic_fn,
    features=['TimeSpent', 'Age', 'AreaIncome', 'DailyInternetUsage', 'Male'],
    label='Clicked',
    dataset=data,
)

Features used:  ['TimeSpent', 'Age', 'AreaIncome', 'DailyInternetUsage', 'Male']
Classification report
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.85      0.94      0.89       103
           1       0.93      0.83      0.88        98

   micro avg       0.89      0.89      0.89       201
   macro avg       0.89      0.88      0.88       201
weighted avg       0.89      0.89      0.89       201



{'model': LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
           intercept_scaling=1, max_iter=100, multi_class='warn',
           n_jobs=None, penalty='l2', random_state=None, solver='lbfgs',
           tol=0.0001, verbose=0, warm_start=False),
 'x_train':      TimeSpent   Age  AreaIncome  DailyInternetUsage  Male
 240      53.68  47.0    56180.93              115.26     1
 575      37.05  39.0    49742.83              142.81     1
 156      41.73  28.0    61142.33              202.18     1
 250      61.22  45.0    63883.81              119.03     1
 152      65.40  33.0    66699.12              247.31     0
 ..         ...   ...         ...                 ...   ...
 494      81.59  35.0    65826.53              223.16     0
 497      73.94  27.0    68333.01              173.49     0
 603      50.18  35.0    63006.14              127.82     1
 236      65.65  30.0    72209.99              158.05     0
 583      56.16  25.0    66429.84              164

In [10]:
# Decision tree on the five default feature columns.
build_model(
    classifier_fn=decision_tree_fn,
    features=['TimeSpent', 'Age', 'AreaIncome', 'DailyInternetUsage', 'Male'],
    label='Clicked',
    dataset=data,
)

Features used:  ['TimeSpent', 'Age', 'AreaIncome', 'DailyInternetUsage', 'Male']
Classification report
------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.92      0.95      0.93        97
           1       0.95      0.92      0.94       104

   micro avg       0.94      0.94      0.94       201
   macro avg       0.94      0.94      0.94       201
weighted avg       0.94      0.94      0.94       201



{'model': DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
             max_features=None, max_leaf_nodes=None,
             min_impurity_decrease=0.0, min_impurity_split=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
             splitter='best'),
 'x_train':      TimeSpent   Age  AreaIncome  DailyInternetUsage  Male
 699      58.60  19.0    44490.09              197.93     1
 492      59.70  28.0    49158.50              120.25     0
 867      90.75  40.0    45945.88              216.50     0
 113      69.00  32.0    72683.35              221.21     1
 753      76.87  36.0    72948.76              212.59     0
 ..         ...   ...         ...                 ...   ...
 227      70.68  31.0    74430.08              199.08     0
 843      74.61  38.0    71055.22              231.28     1
 292      81.61  33.0    62667.51              228.76     0
 459      48.03  40.0    2