In [30]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix
import matplotlib.pyplot as plt


# Read in the data
artists = pd.read_csv("artists.csv", sep=';')
user_behavior = pd.read_csv("user_behavior.csv", delimiter=';')
users = pd.read_csv('users.csv', delimiter=';')

#Clean Up Column Names
users.rename(columns={
    'uid': 'UserId',
    'p': 'Premium',
    'm1': 'Minutes1',
    'm2': 'Minutes2',
    'm3': 'Minutes3' 
}, inplace=True)

user_behavior.rename(columns={
    'user_id': 'UserId',
    'song_id': 'SongId',
    'num_clicks': 'NumClicks',
    'ml': 'MinutesListened',
    'g': 'Genre',
    'f': 'Favorite',
    'mod': 'ModifiedAt', 
    'artists': 'Artists'
}, inplace=True)

artists.rename(columns={
    'artist_id': "ArtistId",
    'genre': "Genre",
    'featured': "Featured",
    'monthly_listeners': "MonthListeners"
}, inplace=True)


#unify labels
users['Premium'] = users['Premium'].map({'0': False, 
                                         '1': True,
                                         'Yes': True,
                                         'No': False},)

user_behavior['Genre'] = user_behavior['Genre'].astype('category')
user_behavior['Favorite'] = user_behavior['Favorite'].astype('bool')

artists["Genre"] = artists["Genre"].astype("category")
artists["Featured"] = artists["Featured"].astype("category")

#filla NA values with average
users['Minutes2'] = users['Minutes2'].fillna((users['Minutes1'] + users['Minutes3'])/2)

user_behavior['Genre'] = user_behavior['Genre'].map({
    'Electronic': 'Electronic',
    'Rock': 'Rock',
    'Hip-Hop': 'Hip-Hop',
    'Pop': 'Pop'
}).fillna('Other').astype('category')

user_behavior['ModifiedAt'] = user_behavior['ModifiedAt'].astype('datetime64[ns]')
user_behavior['Weekday'] = user_behavior['ModifiedAt'].dt.day_name()
user_behavior['Year'] = user_behavior['ModifiedAt'].dt.year
user_behavior['Month'] = user_behavior['ModifiedAt'].dt.month   
user_behavior['Day'] = user_behavior['ModifiedAt'].dt.day


#merge Dataframes 
users_with_behavior = users.merge(user_behavior)
artists_with_behavior = artists.merge(user_behavior, left_on="ArtistId", right_on="Artists")

#renamen duplicate columns
artists_with_behavior = artists_with_behavior.rename(columns={
    "Genre_x": "GenreSong",
    "Genre_y": "GenreArtist"
})

# Identifiziere Spalten mit Typ 'category'
non_numeric_cols = artists_with_behavior.select_dtypes(include=['category', 'object']).columns

# Apply one-hot encoding to non-numeric columns
artists_with_behavior = pd.get_dummies(artists_with_behavior, columns=non_numeric_cols, drop_first=True)

# Wandle bool-Spalten in numerische Werte (0 und 1)
bool_cols = artists_with_behavior.select_dtypes(include=['bool']).columns
artists_with_behavior[bool_cols] = artists_with_behavior[bool_cols].astype(int)

#artists_with_behavior.drop('ModifiedAt', axis=1, inplace=True)

artists_with_behavior.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1451 entries, 0 to 1450
Data columns (total 16 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   ArtistId         1451 non-null   int64         
 1   GenreSong        1451 non-null   category      
 2   Featured         1451 non-null   category      
 3   MonthListeners   1451 non-null   int64         
 4   UserId           1451 non-null   int64         
 5   SongId           1451 non-null   int64         
 6   NumClicks        1451 non-null   int64         
 7   MinutesListened  1451 non-null   float64       
 8   GenreArtist      1451 non-null   category      
 9   Favorite         1451 non-null   int64         
 10  ModifiedAt       1451 non-null   datetime64[ns]
 11  Artists          1451 non-null   int64         
 12  Weekday          1451 non-null   object        
 13  Year             1451 non-null   int32         
 14  Month            1451 non-null   int32  

In [29]:
# Define features and target
X = artists_with_behavior.drop(columns=['Featured'])
y = artists_with_behavior['Featured']

# Ensure all features and target are numeric
X = X.apply(pd.to_numeric, errors='coerce')
y = y.astype('int')

# Train-test split
train_df, test_df = train_test_split(artists_with_behavior, test_size=0.20, stratify=artists_with_behavior['Featured'], random_state=2023+2024)

# View proportions of Featured
print(f"Proportions Train:\n {train_df['Featured'].value_counts(normalize=True)}")
print(f"Proportions Test:\n {test_df['Featured'].value_counts(normalize=True)}")

# Define Model
train_model = RandomForestClassifier(n_estimators=1000, max_features=3,class_weight='balanced', random_state=0)

# Cross-validation
cv_fits_accuracy = cross_val_score(train_model, X, y, cv=4, scoring='accuracy')
cv_fits_precision = cross_val_score(train_model, X, y, cv=4, scoring='precision')
cv_fits_recall = cross_val_score(train_model, X, y, cv=4, scoring='recall')

print("\nCV-Accuracy:", np.mean(cv_fits_accuracy))
print("CV-Precision:", np.mean(cv_fits_precision))
print("CV-Recall:", np.mean(cv_fits_recall))

# Train the final model
train_model.fit(train_df.drop(columns=['Featured']), train_df['Featured'])

# Apply on test set
test_predictions = train_model.predict(test_df.drop(columns=['Featured']))
test_probabilities = train_model.predict_proba(test_df.drop(columns=['Featured']))

# Confusion Matrix
conf_matrix = confusion_matrix(test_df['Featured'], test_predictions)
print("\nConfusion Matrix:")
print(conf_matrix)

# Precision, accuracy, recall
print("\nTest-Precision:", precision_score(test_df['Featured'], test_predictions))
print("Test-Accuracy:", accuracy_score(test_df['Featured'], test_predictions))
print("Test-Recall:", recall_score(test_df['Featured'], test_predictions))

# Calculate balanced accuracy
balanced_acc = metrics.balanced_accuracy_score(test_df['Featured'], test_predictions)
print("\nBalanced Accuracy:", balanced_acc)


Proportions Train:
 Featured
0    0.902586
1    0.097414
Name: proportion, dtype: float64
Proportions Test:
 Featured
0    0.90378
1    0.09622
Name: proportion, dtype: float64


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))



CV-Accuracy: 0.9007598587583521
CV-Precision: 0.0
CV-Recall: 0.0


DTypePromotionError: The DType <class 'numpy.dtypes.DateTime64DType'> could not be promoted by <class 'numpy.dtypes.Float64DType'>. This means that no common DType exists for the given inputs. For example they cannot be stored in a single array unless the dtype is `object`. The full list of DTypes is: (<class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Float64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.DateTime64DType'>, <class 'numpy.dtypes.Int64DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.Int32DType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>, <class 'numpy.dtypes.BoolDType'>)