In [1]:
import tpot



In [2]:
import pandas as pd
import numpy as np
from tpot import TPOTClassifier


df_test=pd.read_csv("TestSet_klassification.csv")
df_train=pd.read_csv("TrainSet_klassification.csv")

X_train = df_train.drop(["income"], axis=1).values
y_train = df_train["income"].values

X_test = df_test.drop(["income"], axis=1).values
y_test = df_test["income"].values

In [48]:
#initializing tpot with parameters
#max_time_mins=None (per default) - here max time 4h
tpot = TPOTClassifier(generations=5, population_size=50,
                     cv=5, verbosity=2, n_jobs=16, max_time_mins=240,
                     periodic_checkpoint_folder='/content/results/classPreprocessed')

In [49]:
#starting the training
tpot.fit(X_train, y_train)

Version 0.12.1 of tpot is outdated. Version 0.12.2 was released Friday February 23, 2024.


Optimization Progress:   0%|          | 0/50 [00:00<?, ?pipeline/s]




Generation 1 - Current best internal CV score: 0.8737423566519708

Generation 2 - Current best internal CV score: 0.8737423566519708

Generation 3 - Current best internal CV score: 0.8739582088339819

Generation 4 - Current best internal CV score: 0.8739582088339819

Generation 5 - Current best internal CV score: 0.8739582088339819

Best pipeline: MLPClassifier(PolynomialFeatures(GaussianNB(input_matrix), degree=2, include_bias=False, interaction_only=False), alpha=0.0001, learning_rate_init=0.01)


In [51]:
import sklearn.metrics as skm
from sklearn.metrics import roc_auc_score

#prediction
y_pred = tpot.predict(X_test)

# prediction for the AUC-ROC-Score by only using the positive classes
y_pred_proba = tpot.predict_proba(X_test)[:, 1]


print("RESULTS OF BEST MODEL:\n")

print(f"F1-Score:                   {skm.f1_score(y_test, y_pred)}")
print(f"AUC-ROC Score:              {roc_auc_score(y_test, y_pred_proba)}")
print(f"Accuracy:                   {skm.accuracy_score(y_test, y_pred)}")
print(f"Precision:                  {skm.precision_score(y_test, y_pred)}")
print(f"Recall:                     {skm.recall_score(y_test, y_pred)}")

RESULTS OF BEST MODEL:

F1-Score:                   0.5875
AUC-ROC Score:              0.8579537473504381
Accuracy:                   0.8053456221198156
Precision:                  0.5133105802047782
Recall:                     0.6867579908675799


In [35]:
#from sklearn.svm import SVC

# SVC with a linear kernel and probability=True
#clf = SVC(kernel='linear', probability=True)

#clf.fit(X_train, y_train)

#res = clf.predict_proba(X_test)

In [53]:
# output of values to dictionary
metrics_dict = {
    'Metric': [
        'F1-Score',
        'AUC-ROC Score',
        'Accuracy',
        'Precision',
        'Recall',
    ],
    'Value': [
        skm.f1_score(y_test, y_pred),
        roc_auc_score(y_test, y_pred_proba),
        skm.accuracy_score(y_test, y_pred),
        skm.precision_score(y_test, y_pred),
        skm.recall_score(y_test, y_pred)
    ]
}

#to df
metrics_df = pd.DataFrame(metrics_dict)


print(metrics_df)

metrics_df.to_csv('tpot_classification_preprosessing_metrics.csv', index=False)


          Metric     Value
0       F1-Score  0.587500
1  AUC-ROC Score  0.857954
2       Accuracy  0.805346
3      Precision  0.513311
4         Recall  0.686758


In [54]:
from IPython.display import FileLink
FileLink('tpot_classification_preprosessing_metrics.csv')

In [55]:
#showing best models as there is no leaderboard
import pandas as pd

# Converting all evaluated models to a list of tuples
my_dict = list(tpot.evaluated_individuals_.items())

# empty list for dictionaries
model_list = []

for model in my_dict:
    model_name = model[0]
    model_info = model[1]
    cv_score = model_info.get('internal_cv_score')  # Pull out cv_score as a column (i.e., sortable)
    
    # Append dictionary to the list
    model_list.append({
        'model': model_name,
        'cv_score': cv_score,
        'model_info': model_info,
    })

# Convert list to a DataFrame
model_scores = pd.DataFrame(model_list)

# Sort the DataFrame by cv_score
model_scores = model_scores.sort_values('cv_score', ascending=False)

# output
model_scores

Unnamed: 0,model,cv_score,model_info
174,MLPClassifier(PolynomialFeatures(GaussianNB(in...,0.873958,"{'generation': 3, 'mutation_count': 1, 'crosso..."
47,"LinearSVC(ExtraTreesClassifier(input_matrix, E...",0.873742,"{'generation': 1, 'mutation_count': 1, 'crosso..."
247,"XGBClassifier(input_matrix, XGBClassifier__lea...",0.873699,"{'generation': 5, 'mutation_count': 3, 'crosso..."
240,"XGBClassifier(PolynomialFeatures(input_matrix,...",0.873613,"{'generation': 5, 'mutation_count': 4, 'crosso..."
224,"XGBClassifier(input_matrix, XGBClassifier__lea...",0.873613,"{'generation': 4, 'mutation_count': 2, 'crosso..."
...,...,...,...
81,"GradientBoostingClassifier(input_matrix, Gradi...",0.565459,"{'generation': 1, 'mutation_count': 1, 'crosso..."
271,"XGBClassifier(input_matrix, XGBClassifier__lea...",0.565459,"{'generation': 5, 'mutation_count': 3, 'crosso..."
200,"XGBClassifier(input_matrix, XGBClassifier__lea...",0.565459,"{'generation': 4, 'mutation_count': 2, 'crosso..."
173,"XGBClassifier(ZeroCount(input_matrix), XGBClas...",0.565459,"{'generation': 3, 'mutation_count': 2, 'crosso..."


In [56]:
model_scores.to_csv('TPOT_preprocessed_classification_model_scores.csv', index=False)


In [57]:
from IPython.display import FileLink
FileLink('TPOT_preprocessed_classification_model_scores.csv')

In [58]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

In [47]:
from sklearn.metrics import roc_curve, auc, roc_auc_score

y_pred = tpot.predict(X_test)

# Predicting the probabilities for the positive class
y_pred_proba = tpot.predict_proba(X_test)[:, 1]

# Calculating ROC curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

# Plotting the ROC curve using Matplotlib with Seaborn styling
plt.figure(figsize=(10, 6))
plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})', lw=2)
plt.plot([0, 1], [0, 1], 'k--', label='Chance', lw=2)  # Diagonal line for random guessing
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver Operating Characteristic (ROC) TPOT Classification Preprocessed')
plt.legend(loc="lower right")

# Save the plot as a PNG file
plt.savefig('tpot_roc_curve_preprocessed.png', dpi=300)
plt.show()

AttributeError: The fitted pipeline does not have the predict_proba() function.