In [19]:
import os
import gc
import glob
os.environ["TF_GPU_ALLOCATOR"]="cuda_malloc_async"

import nvtabular as nvt
from nvtabular.ops import *
import numpy as np

from merlin.models.utils.example_utils import workflow_fit_transform
from merlin.schema.tags import Tags
from merlin.models.utils.dataset import unique_rows_by_features

import merlin.models.tf as mm
from merlin.io.dataset import Dataset
import tensorflow as tf

In [20]:
INPUT_DATA_DIR = os.environ.get("INPUT_DATA_DIR", "/root/Data/Row/")
DATA_FOLDER = os.environ.get("DATA_FOLDER", "/root/Data/")
MODELS_FOLDER = os.environ.get("MODELS", "/root/Models/")
PROCESSED_FOLDER = os.environ.get("PROCESSED_FOLDER", "/root/Data/Processed/")
feature_repo_path = os.environ.get("FEAST_PATH", "/root/Data/feast_repo/feature_repo")

BATCH_SIZE = int(os.environ.get("BATCH_SIZE", 512))
from merlin.core.dispatch import get_lib
df_lib = get_lib()
df_lib

<module 'cudf' from '/usr/local/lib/python3.10/dist-packages/cudf/__init__.py'>

import pandas as pd

train_processed = pd.read_csv('/root/Data/Plot/train_numeric')
test_processed = pd.read_csv('/root/Data/Plot/test_numeric')
ytest = pd.read_csv('/root/Data/Plot/ytest_numeric')
ytrain = pd.read_csv('/root/Data/Plot/ytrain_numeric')

In [21]:
train_processed_full = Dataset(os.path.join(PROCESSED_FOLDER, "train_processed", "*.parquet")).compute()
test_processed_full = Dataset(os.path.join(PROCESSED_FOLDER, "test_processed", "*.parquet")).compute()

ytest = test_processed_full[['Target']].to_pandas()
ytrain = train_processed_full[['Target']].to_pandas()
train_processed = train_processed_full.drop('Target', axis=1).to_pandas()
test_processed = test_processed_full.drop('Target', axis=1).to_pandas()


In [33]:
train_processed_full.to_csv('train_processed.csv')

In [22]:
train_processed.columns

Index(['item_id', 'product_code', 'prod_name', 'product_type_no',
       'graphical_appearance_no', 'colour_group_code', 'product_group_name',
       'perceived_colour_value_id', 'perceived_colour_master_id',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'detail_desc', 'Time_Weighted_Purchased',
       'count_7d_purchased', 'count_30d_purchased', 'user_id', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'postal_code',
       'popular_product_type', '2nd_popular_product_type',
       'popular_department_no', '2nd_popular_department_no',
       'popular_section_no', '2nd_popular_section_no', 'last_product_code',
       '2nd_last_product_code', 'last_product_type', '2nd_last_product_type',
       'age', 'frequency', 'amount', 'recency'],
      dtype='object')

In [23]:
ytest

Unnamed: 0,Target
0,1
1,0
2,0
3,1
4,0
...,...
420582,1
420583,0
420584,0
420585,0


In [24]:
to_drop = ['item_id', 'user_id']
train_set = train_processed.drop(columns=to_drop)
test_set = test_processed.drop(columns=to_drop)
ytest = ytest #.drop(columns=['Unnamed: 0'])
ytrain = ytrain #.drop(columns=['Unnamed: 0'])

In [25]:
ytest.columns

Index(['Target'], dtype='object')

In [26]:
ytrain.value_counts()

Target
0         511039
1         470328
dtype: int64

In [27]:
train_processed['item_id'].nunique()

23417

In [28]:
print(len(train_processed_full), len(train_set), len(ytrain))

981367 981367 981367


In [29]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_jobs=-1, n_estimators=10) #, class_weight='balanced_subsample'
model.fit(train_set, ytrain)
y_pred = model.predict(test_set)
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(ytest, y_pred))
print(roc_auc_score(ytest, y_pred))

  model.fit(train_set, ytrain)


              precision    recall  f1-score   support

           0       0.76      0.87      0.81    218849
           1       0.84      0.70      0.76    201738

    accuracy                           0.79    420587
   macro avg       0.80      0.79      0.79    420587
weighted avg       0.80      0.79      0.79    420587

0.7857757868186614


In [30]:
# Calcul des métriques
precision = classification_report(ytest, y_pred, output_dict=True)['weighted avg']['precision']
recall = classification_report(ytest, y_pred, output_dict=True)['weighted avg']['recall']
auc = roc_auc_score(ytest, y_pred)
loss = None  # Vous pouvez calculer une autre métrique de perte si nécessaire

# Formatage des résultats
results = f"""
RandomForest
val_loss:{loss}
val_auc:{auc}
val_precision:{precision}
val_recall:{recall}
"""

# Ajout des résultats au fichier
with open('results.txt', 'a') as file:
    file.write(results)

In [31]:
import pandas as pd

importances = model.feature_importances_

forest_importances = pd.Series(importances, index=train_set.columns).sort_values(ascending=False)

print(forest_importances)

postal_code                   0.129420
Time_Weighted_Purchased       0.101539
count_30d_purchased           0.091878
count_7d_purchased            0.054682
recency                       0.043049
frequency                     0.042978
2nd_last_product_code         0.036101
last_product_code             0.034749
age                           0.030552
amount                        0.028955
2nd_popular_department_no     0.027406
detail_desc                   0.025974
2nd_popular_product_type      0.025858
department_no                 0.025617
popular_department_no         0.024682
prod_name                     0.024490
product_code                  0.024396
2nd_last_product_type         0.024124
last_product_type             0.024094
2nd_popular_section_no        0.022244
popular_product_type          0.021849
popular_section_no            0.018345
product_type_no               0.015061
product_group_name            0.013159
colour_group_code             0.012790
section_no               

In [32]:
forest_importances.to_csv('feature_importances.csv')

In [14]:
from sklearn.ensemble import GradientBoostingClassifier

model = GradientBoostingClassifier(random_state=0, n_estimators=40, max_depth=10) 
model.fit(train_set, ytrain)
y_pred = model.predict(test_set)
from sklearn.metrics import classification_report, roc_auc_score
print(classification_report(ytest, y_pred))
print(roc_auc_score(ytest, y_pred))

  y = column_or_1d(y, warn=True)


              precision    recall  f1-score   support

           0       0.77      0.85      0.81    218849
           1       0.81      0.72      0.76    201738

    accuracy                           0.79    420587
   macro avg       0.79      0.78      0.78    420587
weighted avg       0.79      0.79      0.79    420587

0.7843517173560879


In [16]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, classification_report, roc_auc_score
import numpy as np

# Définir les hyperparamètres à explorer
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [10, 20],
    #'min_samples_split': [5],
    #'min_samples_leaf': [1],
    #'class_weight': ['balanced', 'balanced_subsample'],
    'random_state': [0],
    #'bootstrap': [True, False]
}

# Créer un modèle RandomForestClassifier
model = GradientBoostingClassifier()

# Configurer GridSearchCV pour optimiser le rappel
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=make_scorer(recall_score),
    cv=2,  # Utiliser 3-fold cross-validation
    n_jobs=-1,
    verbose=2
)

# Entraîner le modèle avec GridSearchCV
grid_search.fit(train_set, ytrain)

# Extraire le meilleur modèle
best_model = grid_search.best_estimator_

# Prédire avec le meilleur modèle
y_pred = best_model.predict(test_set)

# Calculer les métriques
precision = classification_report(ytest, y_pred, output_dict=True)['weighted avg']['precision']
recall = classification_report(ytest, y_pred, output_dict=True)['weighted avg']['recall']
auc = roc_auc_score(ytest, y_pred)


print(classification_report(ytest, y_pred))
print("Meilleurs paramètres trouvés :", grid_search.best_params_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits


  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


[CV] END ......max_depth=10, n_estimators=50, random_state=0; total time= 5.0min
[CV] END ......max_depth=10, n_estimators=50, random_state=0; total time= 5.0min
[CV] END .....max_depth=10, n_estimators=100, random_state=0; total time=10.2min
[CV] END .....max_depth=10, n_estimators=100, random_state=0; total time=10.2min
[CV] END ......max_depth=20, n_estimators=50, random_state=0; total time=20.4min
[CV] END ......max_depth=20, n_estimators=50, random_state=0; total time=20.5min


  y = column_or_1d(y, warn=True)


[CV] END .....max_depth=20, n_estimators=100, random_state=0; total time=40.5min
[CV] END .....max_depth=20, n_estimators=100, random_state=0; total time=41.1min
              precision    recall  f1-score   support

           0       0.78      0.85      0.82    218849
           1       0.82      0.74      0.78    201738

    accuracy                           0.80    420587
   macro avg       0.80      0.80      0.80    420587
weighted avg       0.80      0.80      0.80    420587

Meilleurs paramètres trouvés : {'max_depth': 10, 'n_estimators': 100, 'random_state': 0}


In [18]:
print(classification_report(ytest, y_pred, output_dict=True))
print(roc_auc_score(ytest, y_pred))

{'0': {'precision': 0.7831328842826668, 'recall': 0.8521354906807891, 'f1-score': 0.8161783531481617, 'support': 218849}, '1': {'precision': 0.8226411991998027, 'recall': 0.7440095569501036, 'f1-score': 0.7813520808551953, 'support': 201738}, 'accuracy': 0.8002720007988835, 'macro avg': {'precision': 0.8028870417412348, 'recall': 0.7980725238154464, 'f1-score': 0.7987652170016785, 'support': 420587}, 'weighted avg': {'precision': 0.8020833711849086, 'recall': 0.8002720007988835, 'f1-score': 0.7994736463459104, 'support': 420587}}
0.7980725238154462


In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score



# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_set, test_set, np.ravel(ytrain), np.ravel(ytest)


# Liste des classifieurs à tester
classifiers = [
    #("Random Forest", RandomForestClassifier(n_jobs=-1, n_estimators=10)),
    ("SVM", SVC(probability=True)),
    #("KNN", KNeighborsClassifier()),
    #("Logistic Regression", LogisticRegression())
]

# Fonction pour évaluer un modèle
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    # Vérifier si le modèle a une méthode predict_proba
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    else:
        print("ROC AUC: Not available for this model")
    print("\n")

# Évaluation de chaque classifieur
for name, clf in classifiers:
    print(f"Évaluation de {name}:")
    evaluate_model(clf, X_train, X_test, y_train, y_test)

Évaluation de SVM:


In [9]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score



# Séparation des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_set, test_set, np.ravel(ytrain), np.ravel(ytest)


# Liste des classifieurs à tester
classifiers = [
    ("Random Forest", RandomForestClassifier(n_jobs=-1, n_estimators=10)),
    #("SVM", SVC(probability=True)),
    ("KNN", KNeighborsClassifier()),
    ("Logistic Regression", LogisticRegression())
]

# Fonction pour évaluer un modèle
def evaluate_model(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print(classification_report(y_test, y_pred))
    
    # Vérifier si le modèle a une méthode predict_proba
    if hasattr(model, "predict_proba"):
        y_pred_proba = model.predict_proba(X_test)[:, 1]
        print(f"ROC AUC: {roc_auc_score(y_test, y_pred_proba):.4f}")
    else:
        print("ROC AUC: Not available for this model")
    print("\n")

# Évaluation de chaque classifieur
for name, clf in classifiers:
    print(f"Évaluation de {name}:")
    evaluate_model(clf, X_train, X_test, y_train, y_test)

Évaluation de Random Forest:
              precision    recall  f1-score   support

           0       0.76      0.87      0.81    218849
           1       0.84      0.70      0.76    201738

    accuracy                           0.79    420587
   macro avg       0.80      0.79      0.79    420587
weighted avg       0.80      0.79      0.79    420587

ROC AUC: 0.8677


Évaluation de KNN:




              precision    recall  f1-score   support

           0       0.67      0.69      0.68    218849
           1       0.65      0.63      0.64    201738

    accuracy                           0.66    420587
   macro avg       0.66      0.66      0.66    420587
weighted avg       0.66      0.66      0.66    420587

ROC AUC: 0.7147


Évaluation de Logistic Regression:


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


              precision    recall  f1-score   support

           0       0.60      0.60      0.60    218849
           1       0.56      0.56      0.56    201738

    accuracy                           0.58    420587
   macro avg       0.58      0.58      0.58    420587
weighted avg       0.58      0.58      0.58    420587

ROC AUC: 0.6145




# Grid search pour le recall

In [23]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, recall_score, classification_report, roc_auc_score
import numpy as np

# Définir les hyperparamètres à explorer
param_grid = {
    'n_estimators': [100],
    'max_depth': [30],
    'min_samples_split': [5],
    'min_samples_leaf': [1],
    'class_weight': ['balanced', 'balanced_subsample'],
    #'bootstrap': [True, False]
}

#{'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 40}

# Créer un modèle RandomForestClassifier
model = RandomForestClassifier(n_jobs=-1)

# Configurer GridSearchCV pour optimiser le rappel
grid_search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring=make_scorer(recall_score),
    cv=2,  # Utiliser 3-fold cross-validation
    n_jobs=-1,
    verbose=2
)

# Entraîner le modèle avec GridSearchCV
grid_search.fit(train_set, ytrain)

# Extraire le meilleur modèle
best_model = grid_search.best_estimator_

# Prédire avec le meilleur modèle
y_pred = best_model.predict(test_set)

# Calculer les métriques
precision = classification_report(ytest, y_pred, output_dict=True)['weighted avg']['precision']
recall = classification_report(ytest, y_pred, output_dict=True)['weighted avg']['recall']
auc = roc_auc_score(ytest, y_pred)


print(classification_report(ytest, y_pred))
print("Meilleurs paramètres trouvés :", grid_search.best_params_)

Fitting 2 folds for each of 2 candidates, totalling 4 fits


  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  self.best_estimator_.fit(X, y, **fit_params)


              precision    recall  f1-score   support

           0       0.79      0.86      0.82    218849
           1       0.83      0.75      0.79    201738

    accuracy                           0.81    420587
   macro avg       0.81      0.81      0.81    420587
weighted avg       0.81      0.81      0.81    420587

Meilleurs paramètres trouvés : {'class_weight': 'balanced_subsample', 'max_depth': 30, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}


In [24]:
print(classification_report(ytest, y_pred, output_dict=True))
print(roc_auc_score(ytest, y_pred))

{'0': {'precision': 0.788121312476218, 'recall': 0.8612330876540446, 'f1-score': 0.823056768558952, 'support': 218849}, '1': {'precision': 0.8326186644326374, 'recall': 0.7488276873965242, 'f1-score': 0.7885033953243175, 'support': 201738}, 'accuracy': 0.8073169165951396, 'macro avg': {'precision': 0.8103699884544278, 'recall': 0.8050303875252844, 'f1-score': 0.8057800819416348, 'support': 420587}, 'weighted avg': {'precision': 0.8094648318645591, 'recall': 0.8073169165951396, 'f1-score': 0.8064829600256196, 'support': 420587}}
0.8050303875252843
[CV] END class_weight=balanced, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=   8.0s
[CV] END class_weight=balanced, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  10.9s
[CV] END class_weight=balanced_subsample, max_depth=30, min_samples_leaf=1, min_samples_split=5, n_estimators=100; total time=  16.5s
[CV] END class_weight=balanced_subsample, max_depth=30, min_sampl

# Cross-validation

In [105]:
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFE
from sklearn.metrics import accuracy_score

In [12]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.inspection import permutation_importance
import numpy as np
import pandas as pd

# Supposons que X soit votre ensemble de features et y vos labels
X = pd.concat([train_set, test_set])
y = pd.concat([ytrain, ytest])

# Définir le modèle
model = RandomForestClassifier(n_jobs=-1, n_estimators=100)

# Définir la validation croisée
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Effectuer la validation croisée pour le score
cv_scores = cross_val_score(model, X, y, cv=cv, scoring='roc_auc')

print(f"Scores de validation croisée: {cv_scores}")
print(f"Score moyen: {cv_scores.mean():.3f} (+/- {cv_scores.std() * 2:.3f})")

# Calculer l'importance des features avec permutation importance
feature_importance = np.zeros(X.shape[1])

for train_idx, val_idx in cv.split(X, y):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]
    
    model.fit(X_train, y_train)
    
    # Calculer l'importance des features pour cette itération
    perm_importance = permutation_importance(model, X_val, y_val, n_repeats=10, random_state=42)
    feature_importance += perm_importance.importances_mean

# Moyenner l'importance des features sur toutes les itérations
feature_importance /= cv.n_splits

# Créer un DataFrame avec les importances des features
feature_importance_df = pd.DataFrame({
    'feature': X.columns,
    'importance': feature_importance
})

# Trier les features par importance décroissante
feature_importance_df = feature_importance_df.sort_values('importance', ascending=False)

print("\nImportance des features:")
print(feature_importance_df)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


Scores de validation croisée: [0.90471229 0.90399891 0.90439405 0.90392881 0.90516731]
Score moyen: 0.904 (+/- 0.001)


  model.fit(X_train, y_train)
  model.fit(X_train, y_train)
  model.fit(X_train, y_train)
  model.fit(X_train, y_train)
  model.fit(X_train, y_train)



Importance des features:
                       feature  importance
19         count_30d_purchased    0.110228
8                  postal_code    0.078685
20          count_7d_purchased    0.063564
23                     recency    0.028575
24                   frequency    0.026329
21     Time_Weighted_Purchased    0.024660
1           product_group_name    0.007296
22                         age    0.006581
31      2nd_popular_section_no    0.005939
30          popular_section_no    0.005439
0                    prod_name    0.005427
25                      amount    0.005190
3                  detail_desc    0.005089
9                 product_code    0.004580
18            garment_group_no    0.004125
17                  section_no    0.003772
27    2nd_popular_product_type    0.003727
28       popular_department_no    0.003523
29   2nd_popular_department_no    0.003340
10             product_type_no    0.003311
15               department_no    0.003309
26        popular_product_ty

In [122]:
dataset = Dataset(os.path.join(PROCESSED_FOLDER, "dataset_numeric", "*.parquet")).compute().to_pandas()
X = dataset.drop(['Target', 'user_id', 'item_id', 'postal_code', 'recency', 'frequency', 'amount',
       '2nd_popular_product_type', 'popular_department_no',
       '2nd_popular_department_no', 'popular_section_no',
       '2nd_popular_section_no', 'last_product_code', '2nd_last_product_code',
       'last_product_type', '2nd_last_product_type', 'product_code',
       'prod_name', 'department_no', 'detail_desc', 'count_30d_purchased',
       'count_7d_purchased', 'Time_Weighted_Purchased'], axis=1)
y = dataset['Target']

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [124]:
len(X_train.columns)

16

In [127]:
# Utiliser un modèle de RandomForest pour l'exemple, mais vous pouvez utiliser un autre modèle
model = RandomForestClassifier(n_jobs=-1, n_estimators=10, random_state=42)

# Initialiser RFE avec le modèle et le nombre de caractéristiques souhaitées
# Vous pouvez ajuster le nombre de caractéristiques à sélectionner
rfe = RFE(estimator=model, n_features_to_select=4, verbose=1, step=1)

# Ajuster RFE sur les données d'entraînement
rfe.fit(X_train, y_train)

# Obtenir les caractéristiques sélectionnées
selected_features = X_train.columns[rfe.support_]
print("Selected features:", selected_features)

Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.
Fitting estimator with 5 features.
Selected features: Index(['age', 'popular_product_type', 'product_type_no', 'colour_group_code'], dtype='object')


In [128]:
# Créer un nouveau modèle avec les caractéristiques sélectionnées
model.fit(X_train[selected_features], y_train)

# Prédire sur l'ensemble de test
y_pred = model.predict(X_test[selected_features])

# Calculer la précision
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy with selected features:", accuracy)
print(classification_report(y_test, y_pred))
print(roc_auc_score(y_test, y_pred))

Accuracy with selected features: 0.7357600120688841
              precision    recall  f1-score   support

           0       0.79      0.83      0.81    437957
           1       0.61      0.56      0.58    218276

    accuracy                           0.74    656233
   macro avg       0.70      0.69      0.69    656233
weighted avg       0.73      0.74      0.73    656233

0.6906661219827275


In [144]:
correlation_matrix = dataset[['age', 'postal_code', 'Target']].corr()
print(correlation_matrix['Target'].sort_values(ascending=False))

Target         1.000000
postal_code    0.027657
age           -0.096906
Name: Target, dtype: float64


In [145]:
correlation_matrix = dataset[['age', 'Target']].corr()
print(correlation_matrix['Target'].sort_values(ascending=False))

Target    1.000000
age      -0.096906
Name: Target, dtype: float64
