## Déploiement et données massives
#### RANIA BEN DHIA <rania.ben.dhia@etu.univ-poitiers.fr>	
#### MARWAN AL OMARI <marwan.al.omari@etu.univ-poitiers.fr>

In [6]:
import warnings
warnings.filterwarnings('ignore')

## Utilisation de Orange en langage script

In [7]:
conda install -c ales-erjavec orange3

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.


Note: you may need to restart the kernel to use updated packages.


In [8]:
#importer le package Orange
import Orange

# Ouvre le jeu de données wine.tab, et afficher l’ensemble de ses attributs:
# le nom de la classe cible
# le nombre d’échantillons

data = Orange.data.Table("wine")
print("\nAttributes: ")
print("Attributes:", ", ".join(x.name for x in data.domain.attributes))
print("\nle nom de la classe cible: ")
print("Class:", data.domain.class_var.name)
print("\nle nombre d’échantillons: ")
print("Data instances", len(data))


Attributes: 
Attributes: Alcohol, Malic Acid, Ash, Alcalinity of ash, Magnesium, Total phenols, Flavanoids, Nonflavanoid phenols, Proanthocyanins, Color intensity, Hue, OD280/OD315 of diluted wines, Proline

le nom de la classe cible: 
Class: Wine

le nombre d’échantillons: 
Data instances 178


In [9]:
# instance de la classe objet Orange.classification.TreeLearner pour
# apprendre un classifieur basé sur un arbre de décision

lr = Orange.classification.LogisticRegressionLearner()
rf = Orange.classification.RandomForestLearner(n_estimators=100)
res = Orange.evaluation.CrossValidation(data, [lr, rf], k=2)

print("Accuracy:", Orange.evaluation.scoring.CA(res))
print("AUC:", Orange.evaluation.scoring.AUC(res))

Accuracy: [0.94382022 0.97191011]
AUC: [0.98983603 0.99815419]


In [10]:
# l’instance de classe TreeLearner
learner = Orange.classification.tree.TreeLearner(max_depth=3)
print(learner.params)

{'binarize': False, 'min_samples_leaf': 1, 'min_samples_split': 2, 'sufficient_majority': 0.95, 'max_depth': 3}


In [11]:
#Pickle
import pickle

Orgpkl_filename = "rf_model.pkcls"
with open ( Orgpkl_filename, 'wb') as f:
    pickle.dump(rf, f)

In [12]:
# Load from file
with open(Orgpkl_filename, 'rb') as file:
    pickle_model = pickle.load(file)

In [13]:
import numpy as np

y_actual = data.Y
classifier = learner(data)
y_predictions = np.array([classifier(data)])


res = Orange.evaluation.testing.Results(data, actual = y_actual, 
                                        predicted = y_predictions)
print ("Accuracy :", Orange.evaluation.scoring.CA(res))
print ("AUC:", Orange.evaluation.scoring.AUC(res))

Accuracy : [0.98876404]
AUC: [0.99175376]


In [55]:
#évaluer correctement la qualité de prédicteur
data_test, data_train = Orange.evaluation.testing.sample(data, n=0.3)
data_train.save('wine_train.tab')
data_test.save('wine_test.tab')

In [66]:
# Diviser l'ensemble de données en ensembles de test et d'entraînement
d_train = Orange.data.Table("wine_train.tab")
d_test = Orange.data.Table("wine_test.tab")

lr = Orange.classification.LogisticRegressionLearner()
rf = Orange.classification.RandomForestLearner(n_estimators=100)
tree = Orange.classification.tree.TreeLearner(max_depth=3)

lr_classifier = lr(d_train)
#prédiction en test
lr_pred = lr_classifier(d_test)
#taux de reconnaissance - concordance obsv. vs. préd.
print("logistic regression:")
print("Accuracy:", np.mean(d_test.Y == lr_pred))

rf_classifier = rf(d_train)
#prédiction en test
rf_pred = rf_classifier(d_test)
#taux de reconnaissance - concordance obsv. vs. préd.
print("Random Forest:")
print("Accuracy:", np.mean(d_test.Y == rf_pred))

tree_classifier = tree(d_train)
#prédiction en test
tree_pred = tree_classifier(d_test)
#taux de reconnaissance - concordance obsv. vs. préd.
print("Tree:")
print("Accuracy:", np.mean(d_test.Y == tree_pred))

logistic regression:
Accuracy: 0.9433962264150944
Random Forest:
Accuracy: 0.9811320754716981
Tree:
Accuracy: 0.9622641509433962


## Utilisation de la bibliothèque scikit-learn

##### La bibliothèque Orange repose sur l’utilisation d’une autre bibliothèque réputée en apprentissage automatique : scikit-learn. Vous allez reproduire, en utilisant non plus la bibliothèque Orange, mais uniquement scikit-learn, le test de performance que vous aviez réalisé sur le jeu de données heart-disease. Vous aurez besoin de charger le fichier heart-disease.tab ; nous vous proposons de le faire à l’aide de la bibliothèque pandas

In [14]:
import pandas as pd
import sklearn

def orange_loadtab(path):
    """ usage: X, y = orange_loadtab(path)"""
    data = pd.read_table(path)
    dtypes = data.values [0,:]
    data = data.drop (index =0)
    data = data.dropna()
    
    for attribute, dtype in zip( data.columns, dtypes):
        if dtype == 'continuous':
            data = data.astype ({ attribute:np.float })
        else :
            encoder = sklearn.preprocessing.LabelEncoder()
            data[attribute] = encoder.fit_transform(data[attribute])
    return data.values [:,:-1], data.values [:,-1]

In [15]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from numpy import mean

X, y = orange_loadtab("heart-disease.tab")

#TEST
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

In [19]:
## SVM
from sklearn import svm
## les paramètres de classifieur SVM de sorte qu’il 
# utilise d’astuce du noyau

clf = svm.SVC(kernel='rbf', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
print("SVM Accuracy:",scores.mean())

SVM Accuracy: 0.6736723163841807


In [98]:
#AdaBoost
from sklearn.ensemble import AdaBoostClassifier
abc = AdaBoostClassifier(n_estimators=50, learning_rate=1)

model = AdaBoostClassifier()
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', 
                           cv=cv, n_jobs=-1, error_score='raise')
print('AdaBoost Accuracy: ',mean(n_scores))

AdaBoost Accuracy:  0.7936911487758948


In [100]:
#réseaux de neurones artificiels
from sklearn.neural_network import MLPClassifier

RNA = MLPClassifier(solver='lbfgs', alpha=1e-5,
                    hidden_layer_sizes=(5, 2), random_state=1)
scores = cross_val_score(RNA, X, y, cv=5)
print("RNA Accuracy:",scores.mean())

RNA Accuracy: 0.6257627118644067


In [16]:
# arbre de décision
from sklearn.tree import DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X, y)
scores = cross_val_score(dt, X, y, cv=5)
print("Arbre de décision Accuracy:",scores.mean())

Arbre de décision Accuracy: 0.7506779661016949


In [17]:
# forêts d’arbres décisionnels
from sklearn.ensemble import RandomForestClassifier

rf = DecisionTreeClassifier()
rf.fit(X, y)
scores = cross_val_score(rf, X, y, cv=5)
print("forêts d’arbres Accuracy:",scores.mean())

forêts d’arbres Accuracy: 0.7370056497175141


In [20]:
## les paramètres de classifieur SVM de sorte qu’il 
# n’utilise pas d’astuce du noyau
from sklearn import svm

clf = svm.SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(clf, X, y, cv=5)
print("SVM Accuracy:",scores.mean())

SVM Accuracy: 0.8382485875706216
