In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import HistGradientBoostingClassifier
from scipy.sparse import csr_matrix, find
from scipy.io import loadmat
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer

import numpy as np
import json

In [2]:
# Load the Flickr dataset from its MATLAB file (path is relative to the notebook's working directory).
flickr_mat_path = 'ComGA-master/data/Flickr/Flickr.mat'
mat = loadmat(flickr_mat_path)

In [3]:
# Display the loaded dictionary: besides the MATLAB metadata entries it holds the
# sparse 'Network' and 'Attributes' matrices and the 'Label' and 'Class' arrays.
mat

{'__header__': b'MATLAB 5.0 MAT-file Platform: posix, Created on: Wed Sep 12 18:20:30 2018',
 '__version__': '1.0',
 '__globals__': [],
 'Network': <7575x7575 sparse matrix of type '<class 'numpy.float64'>'
 	with 482555 stored elements in Compressed Sparse Column format>,
 'Label': array([[0],
        [0],
        [0],
        ...,
        [0],
        [0],
        [0]], dtype=uint8),
 'Attributes': <7575x12047 sparse matrix of type '<class 'numpy.float64'>'
 	with 225667 stored elements in Compressed Sparse Column format>,
 'Class': array([[8],
        [2],
        [7],
        ...,
        [3],
        [6],
        [4]], dtype=uint8)}

In [4]:
# Column sums of the 'Network' matrix, one value per node (this is the neighbour
# count if the matrix entries are 0/1 -- TODO confirm).
# The sparse matrix is summed directly instead of being densified first, which
# avoids allocating the full nodes x nodes array. The result is flattened to 1-D
# so the frame keeps a single column with the default label 0.
network = pd.DataFrame(np.asarray(mat['Network'].sum(axis=0)).ravel())

In [5]:
# Give the single column (default label 0) a descriptive name.
network_column_names = {0: 'NbNeighbors'}
network = network.rename(columns=network_column_names)

In [6]:
# Display the per-node 'NbNeighbors' frame.
network

Unnamed: 0,NbNeighbors
0,1881.0
1,1801.0
2,1822.0
3,1797.0
4,1593.0
...,...
7570,4.0
7571,2.0
7572,4.0
7573,4.0


In [7]:
# Assemble one row per node: the dense attribute matrix (integer column labels),
# followed by the 'Class', 'Label' and 'NbNeighbors' columns, all aligned on the
# default positional index.
class_frame = pd.DataFrame(mat['Class']).rename(columns={0: 'Class'})
label_frame = pd.DataFrame(mat['Label']).rename(columns={0: 'Label'})
df = (
    pd.DataFrame(mat['Attributes'].toarray())
    .join(class_frame)
    .join(label_frame)
    .join(network)
)

In [8]:
# Cast the class ids to strings; a later cell selects the categorical columns
# by testing each column for the object dtype.
df['Class'] = df['Class'].astype(str)

In [9]:
# Display the assembled frame: attribute columns followed by 'Class', 'Label' and 'NbNeighbors'.
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12040,12041,12042,12043,12044,12045,12046,Class,Label,NbNeighbors
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.103695,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8,0,1881.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1801.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7,0,1822.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1,0,1797.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2,0,1593.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0,4.0
7571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0,2.0
7572,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3,0,4.0
7573,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6,0,4.0


In [10]:
def __split_data_set(data: pd.DataFrame, y_col, validation_size=0.35, seed=7, stratify=False):
    """
    Split a DataFrame into a training set and a validation set.

    :param data: pandas DataFrame to split
    :param y_col: name of the column to isolate as the target variable
    :param validation_size: fraction of the rows assigned to the validation set
    :param seed: random state forwarded to ``train_test_split``; the default (7)
        reproduces the previously hard-coded value
    :param stratify: when True, keep the target class proportions identical in
        both subsets (useful for imbalanced targets); the default (False) keeps
        the plain random split
    :return: X_train, X_validation, Y_train, Y_validation. The Y arrays are
        integer column vectors of shape (n_rows, 1)
    """
    # Every column except the target is a feature.
    X = data.drop(columns=[y_col])
    # Double brackets keep the target two-dimensional, which is the shape
    # existing callers receive (they flatten it themselves where needed).
    Y = data[[y_col]].values.astype('int')
    strata = Y.ravel() if stratify else None
    return train_test_split(
        X, Y, test_size=validation_size, random_state=seed, stratify=strata)

In [11]:
def estimate_accuracy(model, X_validation, Y_validation):
    """
    Print validation metrics for a fitted model.

    Predicts on ``X_validation`` once, then prints, in this order, the accuracy
    score, the confusion matrix and the classification report computed against
    ``Y_validation``. Returns nothing.

    :param model: fitted estimator exposing ``predict``
    :param X_validation: validation features
    :param Y_validation: true validation targets
    """
    predicted = model.predict(X_validation)
    for metric in (accuracy_score, confusion_matrix, classification_report):
        print(metric(Y_validation, predicted))

In [12]:
# The attribute columns carry integer labels while the others are strings;
# convert every label to str so the frame has uniformly typed column names.
df.columns = df.columns.map(str)

In [13]:
# Columns stored with the object dtype are treated as categorical.
categorical_columns = [
    column for column in df.columns if df.dtypes[column] == object
]

In [14]:
# Display the detected categorical columns.
categorical_columns

['Class']

In [15]:
# Encode the categorical variables as integer codes for the model.
#
# pd.factorize numbers the values in order of first appearance, which is the
# coding the previous merge-based implementation produced. Unlike that
# implementation it does not depend on the exact set of suffixed/duplicated key
# columns that DataFrame.merge emits, so there is no hard-coded list of columns
# to drop. Missing values, if any, receive the code -1.
dictionaries = list()
for category in categorical_columns:
    codes, uniques = pd.factorize(df[category])
    # Lookup table kept for decoding: column `category` holds the integer code,
    # column 0 holds the original value.
    unique_values = pd.DataFrame({0: np.asarray(uniques)}).reset_index().rename(
        columns={'index': category})
    dictionaries.append((category, unique_values))
    # Drop and re-add so the encoded column ends up as the last column; the
    # model cell addresses categorical features by position from the end.
    df = df.drop(columns=[category])
    df[category] = codes

In [16]:
# Hold out a validation set, using 'Label' as the target column.
X_train, X_validation, Y_train, Y_validation = __split_data_set(df, y_col='Label')

In [17]:
# PCA is applied to the attribute columns only: everything except the
# categorical columns, the target and the neighbour statistic. The remaining
# columns are passed through unchanged.
excluded_from_pca = ('Label', 'NbNeighbors')
pca_columns = [
    column for column in df.columns
    if column not in categorical_columns and column not in excluded_from_pca
]
transformer = make_column_transformer(
    (PCA(), pca_columns),
    remainder='passthrough',
)

In [18]:
# Display the training features ('Label' removed, 'Class' now integer-coded and last).
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12039,12040,12041,12042,12043,12044,12045,12046,NbNeighbors,Class
229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,333.0,3
2575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,3
3935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,4
2175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,5
2969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,7
2550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,6
537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,3
1220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92.0,2


In [19]:
# Fit the transformer on its own so that get_feature_names_out() can be queried
# when the model is configured; the pipeline fits it again later.
transformer.fit(X_train, np.ravel(Y_train))

In [20]:
# Input feature names and a mask flagging the categorical ones. Neither is used
# by the cells visible here; kept in case other cells rely on them.
column_names_all = list(X_train.columns)
check = [i in categorical_columns for i in column_names_all]

# Categorical features are addressed by position in the transformer output,
# counted from the end. This assumes the passthrough columns come after the PCA
# components and that the categorical columns are the last ones among them.
n_transformed_features = transformer.get_feature_names_out().shape[0]
categorical_feature_indices = [
    n_transformed_features - (offset + 1)
    for offset in range(len(categorical_columns))
]

model = HistGradientBoostingClassifier(
    categorical_features=categorical_feature_indices,
    early_stopping=True,
    # Early stopping holds out a random part of the training data; seeding it
    # makes the fit reproducible. 7 matches the seed of the train/validation split.
    random_state=7,
    verbose=1,
)
pipeline = Pipeline([('PCA', transformer), ('HistGradientBoosting', model)])

In [21]:
# Display the training features again before fitting the pipeline.
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12039,12040,12041,12042,12043,12044,12045,12046,NbNeighbors,Class
229,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,333.0,3
2575,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,3
3935,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23.0,4
2175,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,48.0,5
2969,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,7
2550,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,6
537,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,189.0,3
1220,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,92.0,2


In [22]:
# Fit the transformer and the classifier together; the target is flattened from a
# column vector to 1-D.
pipeline.fit(X_train, np.ravel(Y_train))

Binning 0.175 GB of training data: 4.952 s
Binning 0.019 GB of validation data: 0.065 s
Fitting gradient boosted rounds:
[1/100] 1 tree, 31 leaves, max depth = 17, train loss: 0.17745, val loss: 0.18747, in 0.523s
[2/100] 1 tree, 31 leaves, max depth = 14, train loss: 0.15609, val loss: 0.18198, in 0.584s
[3/100] 1 tree, 31 leaves, max depth = 12, train loss: 0.14038, val loss: 0.17561, in 0.621s
[4/100] 1 tree, 31 leaves, max depth = 14, train loss: 0.12725, val loss: 0.16988, in 0.560s
[5/100] 1 tree, 31 leaves, max depth = 11, train loss: 0.11562, val loss: 0.16583, in 0.631s
[6/100] 1 tree, 31 leaves, max depth = 10, train loss: 0.10579, val loss: 0.16132, in 0.612s
[7/100] 1 tree, 31 leaves, max depth = 10, train loss: 0.09744, val loss: 0.15811, in 0.622s
[8/100] 1 tree, 31 leaves, max depth = 11, train loss: 0.08961, val loss: 0.15573, in 0.640s
[9/100] 1 tree, 31 leaves, max depth = 9, train loss: 0.08313, val loss: 0.15293, in 0.638s
[10/100] 1 tree, 31 leaves, max depth = 11,

In [23]:
# Print accuracy, confusion matrix and classification report on the validation set.
estimate_accuracy(
    model=pipeline,
    X_validation=X_validation,
    Y_validation=Y_validation,
)

0.9343891402714932
[[2433   60]
 [ 114   45]]
              precision    recall  f1-score   support

           0       0.96      0.98      0.97      2493
           1       0.43      0.28      0.34       159

    accuracy                           0.93      2652
   macro avg       0.69      0.63      0.65      2652
weighted avg       0.92      0.93      0.93      2652



In [24]:
# Recompute the validation predictions and save the classification report as JSON.
predictions = pipeline.predict(X_validation)
report = classification_report(
    Y_validation,
    predictions,
    output_dict=True,
)

with open('Flickr_classification_report.json', 'w') as report_file:
    report_file.write(json.dumps(report))