In [1]:
import warnings

# Silence every warning category for the rest of the session.
warnings.simplefilter("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        confusion_matrix, 
        cohen_kappa_score, 
        accuracy_score, 
        precision_score, 
        recall_score, 
        f1_score, 
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.pipeline import make_pipeline # chain preprocessing steps with an estimator
from sklearn.preprocessing import StandardScaler # zero-mean / unit-variance feature scaling
from sklearn.svm import SVC # support vector machines for classification
from sklearn.tree import DecisionTreeClassifier # decision trees for classification

In [3]:
def specificity_score(y_true, y_pred, labels=(0, 1)):
    """Return the specificity (true-negative rate) of binary predictions.

    Specificity is ``TN / (TN + FP)``: the share of actual negatives that
    were also predicted as negative.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.
    labels : sequence of two labels, default=(0, 1)
        ``(negative, positive)`` labels used to lay out the confusion matrix.
        Fixing them keeps the matrix 2x2 even when a fold contains only one
        class; without it the matrix shrinks and the 4-way unpacking below
        raises ``ValueError``.

    Returns
    -------
    float
        Specificity in ``[0, 1]``. ``0.0`` is returned when ``y_true`` holds
        no negative samples, instead of dividing by zero.
    """
    tn, fp, _fn, _tp = confusion_matrix(
        y_true, y_pred, labels=list(labels)
    ).ravel()
    actual_negatives = tn + fp
    if actual_negatives == 0:
        # No negatives to get right or wrong: the ratio is undefined.
        return 0.0
    return float(tn / actual_negatives)

In [4]:
# Scorers handed to ``cross_validate``; every key shows up as a ``test_<key>``
# column in the returned scores.
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        # Built-in scorer name instead of ``make_scorer(roc_auc_score,
        # needs_proba=True)``: the ``needs_proba`` keyword was deprecated in
        # scikit-learn 1.4 and later removed, whereas the "roc_auc" name is
        # accepted by old and new releases alike. It scores with
        # ``decision_function`` when the estimator has one and falls back to
        # ``predict_proba`` otherwise.
        "AUC": "roc_auc",
        "specificity": make_scorer(specificity_score),
        "kappa": make_scorer(cohen_kappa_score)
}

In [5]:
# Read the full dataset from disk, then display it.
data = pd.read_csv(filepath_or_buffer='Final_Datasets.csv')
data

Unnamed: 0,article_id,price,sales_channel_id,sale,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,...,garment_group_no,age,customer_id,product_group_name,index_code,index_name,detail_desc,club_member_status,fashion_news_frequency,postal_code
0,529008044,0.024288,2,1,529008,306,1010016,31,1,3,...,1017,34.0,523236.0,16.0,1.0,7.0,14284.0,0.0,1.0,47227.0
1,537688014,0.040661,2,1,537688,252,1010010,8,4,12,...,1003,29.0,192839.0,7.0,0.0,6.0,15572.0,0.0,2.0,239246.0
2,872298001,0.006085,1,1,872298,253,1010016,10,3,9,...,1005,40.0,107702.0,7.0,3.0,4.0,6493.0,0.0,2.0,97586.0
3,562455002,0.025407,2,1,562455,265,1010001,9,4,5,...,1002,27.0,220255.0,5.0,7.0,2.0,16094.0,0.0,1.0,169279.0
4,504154034,0.015237,2,1,504154,252,1010016,73,4,2,...,1003,61.0,35058.0,7.0,0.0,6.0,35830.0,0.0,2.0,199216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1222963,664405002,0.012576,1,1,664405,67,1010016,9,4,5,...,1019,54.0,403023.0,0.0,2.0,5.0,2644.0,0.0,1.0,202541.0
1222964,685816002,0.008458,2,1,685816,255,1010016,9,4,5,...,1002,22.0,376861.0,7.0,4.0,8.0,20990.0,0.0,1.0,20379.0
1222965,717874003,0.042356,2,1,717874,57,1010017,10,3,9,...,1018,21.0,38934.0,15.0,1.0,7.0,9605.0,0.0,2.0,203103.0
1222966,351484027,0.017610,2,1,351484,59,1010016,42,5,18,...,1018,22.0,69007.0,15.0,1.0,7.0,9363.0,0.0,1.0,192480.0


In [6]:
# Feature matrix: the two article identifiers, the customer id and the price.
X = data[["article_id", "product_code", "customer_id", "price"]]
# Target vector: the `sale` column.
y = data["sale"]

In [7]:
# Display the feature frame `X`.
X

Unnamed: 0,article_id,product_code,customer_id,price
0,529008044,529008,523236.0,0.024288
1,537688014,537688,192839.0,0.040661
2,872298001,872298,107702.0,0.006085
3,562455002,562455,220255.0,0.025407
4,504154034,504154,35058.0,0.015237
...,...,...,...,...
1222963,664405002,664405,403023.0,0.012576
1222964,685816002,685816,376861.0,0.008458
1222965,717874003,717874,38934.0,0.042356
1222966,351484027,351484,69007.0,0.017610


In [8]:
# Display the target series `y`.
y

0          1
1          1
2          1
3          1
4          1
          ..
1222963    1
1222964    1
1222965    1
1222966    1
1222967    1
Name: sale, Length: 1222968, dtype: int64

In [9]:
# List the dtype of every column in `data`.
data.dtypes

article_id                      int64
price                         float64
sales_channel_id                int64
sale                            int64
product_code                    int64
product_type_no                 int64
graphical_appearance_no         int64
colour_group_code               int64
perceived_colour_value_id       int64
perceived_colour_master_id      int64
department_no                   int64
index_group_no                  int64
section_no                      int64
garment_group_no                int64
age                           float64
customer_id                   float64
product_group_name            float64
index_code                    float64
index_name                    float64
detail_desc                   float64
club_member_status            float64
fashion_news_frequency        float64
postal_code                   float64
dtype: object

In [10]:
# Keep only the four feature columns plus the `sale` target.
data = data.loc[
    :, ["article_id", "product_code", "customer_id", "price", "sale"]
]

In [11]:
# Write the current `data` frame to disk without the row index.
data.to_csv(path_or_buf="Final_Final.csv", index=False)

In [12]:
# Seeded, shuffled 10-fold stratified splitter used for cross-validation.
splitter = StratifiedKFold(
    shuffle=True,
    random_state=1234,
    n_splits=10,
)

In [13]:
# Depth-limited decision tree scored with seeded 10-fold stratified CV.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
dt = DecisionTreeClassifier(random_state=1234, max_depth=15)
scores = cross_validate(estimator=dt, X=X, y=y, scoring=METRICS, cv=splitter)
dt_scores = pd.DataFrame(data=scores)
# Average every timing/metric column over the folds and show it as one row.
dt_scores.mean().to_frame().T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,6.337496,0.157853,0.976883,0.978169,0.998589,0.988273,0.902731,0.113733,0.188598


In [None]:
# MLPs are sensitive to feature scale, and the inputs here span very different
# ranges (ids in the hundreds of millions next to prices below 1). The network
# is therefore trained on standardized features. The scaler lives inside the
# pipeline so it is re-fitted on the training part of each fold only.
# NOTE(review): max_iter=20 is a small budget; a ConvergenceWarning, if any,
# would be hidden by the notebook-wide warnings filter.
nn = MLPClassifier(hidden_layer_sizes=(50, 50), max_iter=20, random_state=1234)
nn_pipeline = make_pipeline(StandardScaler(), nn)
scores_nn = cross_validate(nn_pipeline, X, y, cv=splitter, scoring=METRICS)
nn_scores = pd.DataFrame(scores_nn)
# Mean of every timing/metric column across the folds, shown as a single row.
pd.DataFrame(nn_scores.mean()).T

In [None]:
# Gaussian naive Bayes, evaluated on the same folds and metrics.
nb = GaussianNB()
scores_nb = cross_validate(estimator=nb, X=X, y=y, scoring=METRICS, cv=splitter)
nb_scores = pd.DataFrame(data=scores_nb)
# Fold-averaged timings and metrics as a one-row frame.
nb_scores.mean().to_frame().T

In [None]:
# SVMs are not scale invariant, and the features here mix ids in the hundreds
# of millions with prices below 1. The classifier is therefore fitted on
# standardized inputs. The scaler lives inside the pipeline so it is
# re-fitted on the training part of each fold only.
# NOTE(review): kernel SVC training cost grows at least quadratically with the
# number of samples, and probability=True adds an internal cross-validation.
# With roughly 1.2 million rows this cell may not finish in reasonable time;
# consider a subsample or a linear model if that turns out to be the case.
svm = SVC(random_state=1234, probability=True)
svm_pipeline = make_pipeline(StandardScaler(), svm)
scores_svm = cross_validate(svm_pipeline, X, y, cv=splitter, scoring=METRICS)
svm_scores = pd.DataFrame(scores_svm)
# Mean of every timing/metric column across the folds, shown as a single row.
pd.DataFrame(svm_scores.mean()).T