In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
from sklearn.metrics import (
        make_scorer,
        get_scorer,
        confusion_matrix,
        cohen_kappa_score,
        accuracy_score,
        precision_score,
        recall_score,
        f1_score,
        roc_auc_score
)
from sklearn.model_selection import cross_validate, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier # decision trees for classification
from sklearn.neural_network import  MLPClassifier # neural networks for classification
from sklearn.naive_bayes import GaussianNB # naive bayes for classification
from sklearn.svm import SVC # support vector machines for classification

In [3]:
def specificity_score(y_true, y_pred):
    """Return the true-negative rate, TN / (TN + FP), of a binary prediction.

    The confusion matrix of ``y_true`` vs ``y_pred`` is flattened into
    exactly four cells, so the labels seen across both inputs must form a
    two-class problem; any other matrix shape fails at the unpacking step.

    Parameters
    ----------
    y_true : array-like
        Ground-truth labels.
    y_pred : array-like
        Predicted labels.

    Returns
    -------
    Ratio of true negatives to all samples counted in the first
    (negative-class) row of the confusion matrix.
    """
    matrix = confusion_matrix(y_true, y_pred)
    # Flattened order for a 2x2 matrix: TN, FP, FN, TP.
    true_neg, false_pos, _false_neg, _true_pos = matrix.ravel()
    actual_negatives = true_neg + false_pos
    return true_neg / actual_negatives

In [4]:
# Scorers passed to `cross_validate(..., scoring=METRICS)`; every key shows up
# in the results as a `test_<key>` column.
METRICS = {
        "accuracy": make_scorer(accuracy_score),
        "precision": make_scorer(precision_score),
        "recall": make_scorer(recall_score),
        "f1": make_scorer(f1_score),
        # `make_scorer(roc_auc_score, needs_proba=True)` relies on an argument
        # that was deprecated in scikit-learn 1.4 and removed in 1.6. The
        # built-in "roc_auc" scorer is available on old and new releases alike.
        # NOTE(review): for estimators exposing `decision_function` (e.g. SVC)
        # this scorer may rank by decision values instead of `predict_proba`.
        "AUC": get_scorer("roc_auc"),
        "specificity": make_scorer(specificity_score),
        "kappa": make_scorer(cohen_kappa_score)
}

In [5]:
# Location of the input CSV inside the Kaggle input directory.
DATASET_PATH = '/kaggle/input/dataset2/Final_Datasets_2.csv'

# Read the whole file into memory and preview it.
data = pd.read_csv(DATASET_PATH)
data

Unnamed: 0,article_id,price,sales_channel_id,sale,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,...,department_name,index_code,index_name,index_group_name,section_name,garment_group_name,detail_desc,club_member_status,fashion_news_frequency,postal_code
0,529008044,0.024288,2,1,529008,306,1010016,31,1,3,...,66.0,1.0,7.0,2.0,46.0,18.0,14284.0,0.0,1.0,47227.0
1,537688014,0.040661,2,1,537688,252,1010010,8,4,12,...,124.0,0.0,6.0,2.0,44.0,7.0,15572.0,0.0,2.0,239246.0
2,872298001,0.006085,1,1,872298,253,1010016,10,3,9,...,201.0,3.0,4.0,1.0,12.0,6.0,6493.0,0.0,2.0,97586.0
3,562455002,0.025407,2,1,562455,265,1010001,9,4,5,...,239.0,7.0,2.0,0.0,17.0,5.0,16094.0,0.0,1.0,169279.0
4,504154034,0.015237,2,1,504154,252,1010016,73,4,2,...,124.0,0.0,6.0,2.0,44.0,7.0,35830.0,0.0,2.0,199216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1229995,664405002,0.012576,1,1,664405,67,1010016,9,4,5,...,29.0,2.0,5.0,2.0,41.0,0.0,2644.0,0.0,1.0,202541.0
1229996,685816002,0.008458,2,1,685816,255,1010016,9,4,5,...,132.0,4.0,8.0,3.0,38.0,5.0,20990.0,0.0,1.0,20379.0
1229997,717874003,0.042356,2,1,717874,57,1010017,10,3,9,...,194.0,1.0,7.0,2.0,51.0,15.0,9605.0,0.0,2.0,203103.0
1229998,351484027,0.017610,2,1,351484,59,1010016,42,5,18,...,194.0,1.0,7.0,2.0,51.0,15.0,9363.0,0.0,1.0,192480.0


In [6]:
# Target is the `sale` column; every remaining column is used as a feature.
y = data["sale"]
X = data.drop(columns=["sale"])

In [7]:
# Preview the feature matrix (all columns of `data` except `sale`).
X 

Unnamed: 0,article_id,price,sales_channel_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,...,department_name,index_code,index_name,index_group_name,section_name,garment_group_name,detail_desc,club_member_status,fashion_news_frequency,postal_code
0,529008044,0.024288,2,529008,306,1010016,31,1,3,1338,...,66.0,1.0,7.0,2.0,46.0,18.0,14284.0,0.0,1.0,47227.0
1,537688014,0.040661,2,537688,252,1010010,8,4,12,1626,...,124.0,0.0,6.0,2.0,44.0,7.0,15572.0,0.0,2.0,239246.0
2,872298001,0.006085,1,872298,253,1010016,10,3,9,1640,...,201.0,3.0,4.0,1.0,12.0,6.0,6493.0,0.0,2.0,97586.0
3,562455002,0.025407,2,562455,265,1010001,9,4,5,7930,...,239.0,7.0,2.0,0.0,17.0,5.0,16094.0,0.0,1.0,169279.0
4,504154034,0.015237,2,504154,252,1010016,73,4,2,1626,...,124.0,0.0,6.0,2.0,44.0,7.0,35830.0,0.0,2.0,199216.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1229995,664405002,0.012576,1,664405,67,1010016,9,4,5,3509,...,29.0,2.0,5.0,2.0,41.0,0.0,2644.0,0.0,1.0,202541.0
1229996,685816002,0.008458,2,685816,255,1010016,9,4,5,5832,...,132.0,4.0,8.0,3.0,38.0,5.0,20990.0,0.0,1.0,20379.0
1229997,717874003,0.042356,2,717874,57,1010017,10,3,9,4242,...,194.0,1.0,7.0,2.0,51.0,15.0,9605.0,0.0,2.0,203103.0
1229998,351484027,0.017610,2,351484,59,1010016,42,5,18,4242,...,194.0,1.0,7.0,2.0,51.0,15.0,9363.0,0.0,1.0,192480.0


In [8]:
# Preview the target series (`sale`).
y

0          1
1          1
2          1
3          1
4          1
          ..
1229995    1
1229996    1
1229997    1
1229998    1
1229999    1
Name: sale, Length: 1230000, dtype: int64

In [9]:
from sklearn.preprocessing import RobustScaler

# Rescale every feature column with RobustScaler and write the result back
# into X under the same column names.
# NOTE(review): the scaler is fitted on all rows of X before the
# cross-validation cells run, so validation folds contribute to the scaling
# statistics. Wrapping scaler + estimator in a Pipeline would avoid that.
colunas = X.columns
features = X[colunas]

scaler = RobustScaler()
scaler.fit(features)

X[colunas] = scaler.transform(features)
X

Unnamed: 0,article_id,price,sales_channel_id,product_code,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,...,department_name,index_code,index_name,index_group_name,section_name,garment_group_name,detail_desc,club_member_status,fashion_news_frequency,postal_code
0,-1.198131,-0.060440,0.0,-1.198131,2.10,0.000000,0.617647,-1.5,-0.333333,-0.162521,...,-0.423358,0.000000,0.333333,0.0,0.068966,1.1,-0.126728,0.0,0.0,-0.595913
1,-1.142030,0.824176,0.0,-1.142030,-0.60,-1.000000,-0.058824,0.0,1.166667,-0.039022,...,0.000000,-0.333333,0.000000,0.0,0.000000,0.0,-0.058232,0.0,1.0,0.882954
2,1.020637,-1.043956,-1.0,1.020637,-0.55,0.000000,0.000000,-0.5,0.666667,-0.033019,...,0.562044,0.666667,-0.666667,-1.0,-1.103448,-0.1,-0.541055,0.0,1.0,-0.208064
3,-0.981955,0.000000,0.0,-0.981955,0.05,-2.500000,-0.029412,0.0,0.000000,2.664237,...,0.839416,2.000000,-1.333333,-2.0,-0.931034,-0.2,-0.030472,0.0,0.0,0.344091
4,-1.358768,-0.549451,0.0,-1.358768,-0.60,0.000000,1.852941,0.0,-0.500000,-0.039022,...,0.000000,-0.333333,0.000000,0.0,0.000000,0.0,1.019092,0.0,1.0,0.574656
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1229995,-0.323027,-0.693223,-1.0,-0.323027,-9.85,0.000000,-0.029412,0.0,0.000000,0.768439,...,-0.693431,0.333333,-0.333333,0.0,-0.103448,-0.7,-0.745746,0.0,0.0,0.600264
1229996,-0.184642,-0.915751,0.0,-0.184642,-0.45,0.000000,-0.029412,0.0,0.000000,1.764580,...,0.058394,1.000000,0.666667,1.0,-0.206897,-0.2,0.229898,0.0,0.0,-0.802687
1229997,0.022557,0.915751,0.0,0.022557,-10.35,0.166667,0.000000,-0.5,0.666667,1.082762,...,0.510949,0.000000,0.333333,0.0,0.241379,0.8,-0.375558,0.0,1.0,0.604593
1229998,-2.345512,-0.421245,0.0,-2.345512,-10.25,0.000000,0.941176,0.5,2.166667,1.082762,...,0.510949,0.000000,0.333333,0.0,0.241379,0.8,-0.388428,0.0,0.0,0.522778


In [10]:
# Shuffled, seeded 10-fold stratified splitter used as `cv` by the model cells.
splitter = StratifiedKFold(
    n_splits=10,
    shuffle=True,
    random_state=1234,
)

In [11]:
# Decision tree (depth capped at 10) evaluated with seeded 10-fold stratified
# cross-validation on every scorer in METRICS.
splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=1234)
dt = DecisionTreeClassifier(random_state=1234, max_depth=10)
scores = cross_validate(estimator=dt, X=X, y=y, scoring=METRICS, cv=splitter)
dt_scores = pd.DataFrame(scores)

# Average each timing/metric column over the folds and show it as one row.
dt_scores.mean().to_frame().T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,48.251685,0.262638,0.977481,0.980348,0.996903,0.988556,0.862988,0.200633,0.294346


In [12]:
# Multi-layer perceptron with two hidden layers of 50 units, limited to 20
# training iterations, evaluated with the shared `splitter` and METRICS.
nn = MLPClassifier(
    hidden_layer_sizes=(50, 50),
    max_iter=20,
    random_state=1234,
)
scores_nn = cross_validate(estimator=nn, X=X, y=y, scoring=METRICS, cv=splitter)
nn_scores = pd.DataFrame(scores_nn)

# Average each timing/metric column over the folds and show it as one row.
nn_scores.mean().to_frame().T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,352.242686,1.40151,0.977437,0.979414,0.997846,0.988544,0.885455,0.161067,0.250667


In [13]:
# Gaussian naive Bayes with default settings, evaluated with the shared
# `splitter` and METRICS.
nb = GaussianNB()
scores_nb = cross_validate(estimator=nb, X=X, y=y, scoring=METRICS, cv=splitter)
nb_scores = pd.DataFrame(scores_nb)

# Average each timing/metric column over the folds and show it as one row.
nb_scores.mean().to_frame().T

Unnamed: 0,fit_time,score_time,test_accuracy,test_precision,test_recall,test_f1,test_AUC,test_specificity,test_kappa
0,1.12897,0.349768,0.922091,0.986292,0.933113,0.958965,0.816385,0.481233,0.201963


In [None]:
# Support vector classifier with probability estimates enabled, evaluated
# with the shared `splitter` and METRICS.
# NOTE(review): this cell has no execution count in the saved notebook. The
# scikit-learn docs state that SVC fit time scales at least quadratically with
# the number of samples, so on the ~1.23M-row X it may not finish in
# reasonable time -- consider LinearSVC / SGDClassifier or a subsample.
svm = SVC(probability=True, random_state=1234)
scores_svm = cross_validate(estimator=svm, X=X, y=y, scoring=METRICS, cv=splitter)
svm_scores = pd.DataFrame(scores_svm)

# Average each timing/metric column over the folds and show it as one row.
svm_scores.mean().to_frame().T