## Prepare packages

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Disable annoying warnings in some packages
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from tpot import TPOTClassifier

## Read data

In [5]:
data = pd.read_csv('dataset_aggr.csv')

In [6]:
data['label'] = 1

In [7]:
dataset = data.copy()

for idx, row in data.iterrows():
    # new_row = {'UserID': row['UserID'], }
    candidate = data[(data.ItemID != row['ItemID']) & (data.UserID != row['UserID'])].sample(n=1)
    candidate = candidate.iloc[0].to_dict()
    
    left_k = ['UserID', 'number_of_unique_songs',
       'number_of_unique_genres', 'genre_ratio', 'main_genre_dominance',
       'no_stimulus_points', 'stimulus_points',
       'driving_style_relaxed_driving', 'driving_style_sport_driving',
       'landscape_coast_line', 'landscape_country_side', 'landscape_mountains',
       'landscape_urban', 'mood_active', 'mood_happy', 'mood_lazy', 'mood_sad',
       'natural_phenomena_afternoon', 'natural_phenomena_day_time',
       'natural_phenomena_morning', 'natural_phenomena_night',
       'road_type_city', 'road_type_highway', 'road_type_serpentine',
       'sleepiness_awake', 'sleepiness_sleepy', 'traffic_conditions_free_road',
       'traffic_conditions_lots_of_cars', 'traffic_conditions_traffic_jam',
       'weather_cloudy', 'weather_rainy', 'weather_snowing', 'weather_sunny', 
       'dominant_genre_blues',
       'dominant_genre_pop', 'dominant_genre_rock', 'second_dominant_blues',
       'second_dominant_blues_classical_disco',
       'second_dominant_blues_classicalsecond_dominant_hh',
       'second_dominant_blues_disco_rock', 'second_dominant_blues_hh',
       'second_dominant_blues_metal_reggae', 'second_dominant_classical',
       'second_dominant_classical_country',
       'second_dominant_classical_country_disco_hh',
       'second_dominant_classical_country_disco_hh_jazz_metal_rock',
       'second_dominant_classical_disco',
       'second_dominant_classical_disco_reggae',
       'second_dominant_classical_hh_rock', 'second_dominant_country',
       'second_dominant_country_disco_rock',
       'second_dominant_country_jazz_rock', 'second_dominant_disco',
       'second_dominant_disco_hh', 'second_dominant_jazz',
       'second_dominant_metal']
    
    right_k = ['ItemID', 'category_name_blues', 'category_name_classical',
       'category_name_country', 'category_name_disco', 'category_name_hip_hop',
       'category_name_jazz', 'category_name_metal', 'category_name_pop',
       'category_name_reggae', 'category_name_rock']
    
    left = { k: row[k] for k in left_k }
    right = { k: candidate[k] for k in right_k }
    
    new_row = {**left, **right}
    new_row['avg_rating'] = None
    new_row['label'] = 0
    
    dataset = dataset.append(new_row, ignore_index=True)

In [8]:
dataset.describe()

Unnamed: 0,UserID,ItemID,avg_rating,number_of_unique_songs,number_of_unique_genres,genre_ratio,main_genre_dominance,no_stimulus_points,stimulus_points,driving_style_relaxed_driving,...,second_dominant_classical_disco_reggae,second_dominant_classical_hh_rock,second_dominant_country,second_dominant_country_disco_rock,second_dominant_country_jazz_rock,second_dominant_disco,second_dominant_disco_hh,second_dominant_jazz,second_dominant_metal,label
count,1860.0,1860.0,930.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,...,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0,1860.0
mean,1019.14086,563.677957,2.373441,64.984946,8.132258,0.394187,0.589208,0.986022,1.088172,0.194624,...,0.016129,0.005376,0.016129,0.010753,0.010753,0.682796,0.012903,0.016129,0.005376,0.5
std,11.319058,214.632237,1.2759,48.312411,2.640507,0.271442,0.102053,0.868316,0.862069,0.404085,...,0.126006,0.073146,0.126006,0.103164,0.103164,0.465513,0.112887,0.126006,0.073146,0.500134
min,1001.0,248.0,0.0,1.0,1.0,0.010101,0.2,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,1009.0,281.0,1.0,20.0,6.0,0.08,0.542857,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,1019.0,695.0,2.25,70.0,10.0,0.542857,0.567308,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.5
75%,1032.0,731.0,3.25,116.0,10.0,0.6,0.637931,1.0,2.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0
max,1042.0,762.0,5.0,139.0,10.0,1.0,1.0,5.0,5.0,2.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## Prepare data

In [9]:
X = dataset.drop(['UserID', 'ItemID', 'label', 'avg_rating'], axis=1).values
Y = dataset.label.values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=42, stratify=Y)

## Build AutoML solution

In [12]:
tpot = TPOTClassifier(generations=8, population_size=30, verbosity=2, 
                      n_jobs=4,
                      scoring="f1")
tpot.fit(X_train, y_train)

HBox(children=(IntProgress(value=0, description='Optimization Progress', max=270, style=ProgressStyle(descript…

Generation 1 - Current best internal CV score: 0.7027891349626987
Generation 2 - Current best internal CV score: 0.7202282836487466
Generation 3 - Current best internal CV score: 0.7436357177717942
Generation 4 - Current best internal CV score: 0.7436357177717942
Generation 5 - Current best internal CV score: 0.7436357177717942
Generation 6 - Current best internal CV score: 0.7776706092383343
Generation 7 - Current best internal CV score: 0.7778464760735492
Generation 8 - Current best internal CV score: 0.7778464760735492

Best pipeline: GradientBoostingClassifier(PolynomialFeatures(input_matrix, degree=2, include_bias=False, interaction_only=False), learning_rate=0.01, max_depth=3, max_features=1.0, min_samples_leaf=20, min_samples_split=8, n_estimators=100, subsample=0.8)


TPOTClassifier(config_dict=None, crossover_rate=0.1, cv=5,
               disable_update_check=False, early_stop=None, generations=8,
               max_eval_time_mins=5, max_time_mins=None, memory=None,
               mutation_rate=0.9, n_jobs=4, offspring_size=None,
               periodic_checkpoint_folder=None, population_size=30,
               random_state=None, scoring='f1', subsample=1.0, template=None,
               use_dask=False, verbosity=2, warm_start=False)

In [15]:
f'Best F1-score found: {tpot.score(X_test, y_test)}'

'Best F1-score found: 0.7956043956043957'

In [19]:
tpot.export('tpot_car_music.py')