In [1]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
data = pd.read_csv("train.csv")
test_data = pd.read_csv("test.csv")
song = pd.read_csv("song_data.csv")

In [3]:
artist_enc = LabelEncoder()
album_enc = LabelEncoder()

In [4]:
song['artist_enc'] = artist_enc.fit_transform(song['artist_name'])
song['album_enc'] = album_enc.fit_transform(song['release'])

In [5]:
data_mrg = pd.merge(data, song, on='song_id')

In [6]:
print(data_mrg)

         user_id  song_id  rating                   title  \
0        1257279  1133436    4.25            Live In Fear   
1        1521617  1041044    2.75        Strange Behavior   
2        1757741  1018376    4.25  Disconnect_ Disconnect   
3        1311545  1035650    5.25  Busting Up a Starbucks   
4        1633733  1080634    8.25       Te Voy A Extrañar   
...          ...      ...     ...                     ...   
5460384  1259514  1052552    5.50                Umbrella   
5460385  1124783  1003187    5.00                Velouria   
5460386  1109277  1060261    7.00  House Of The Ancestors   
5460387  1296363  1049236    5.00        This Is The Line   
5460388  1358156  1055568    8.25                   Piano   

                                       release             artist_name  year  \
0                      The Anger And The Truth              The Unseen  2001   
1                                          Big               Macy Gray  2007   
2                          

In [7]:
features = ['user_id', 'song_id', 'artist_enc', 'album_enc']
X = data_mrg[features]
y = data_mrg['rating']

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=50, learning_rate=0.01)
xgb_model.fit(X_train, y_train)

In [10]:
import numpy as np
y_pred = xgb_model.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred, squared=False))
print(f"XGBoost Hybrid Recommender RMSE: {rmse:.2f}")

XGBoost Hybrid Recommender RMSE: 1.31




In [11]:
test_merged = test_data.merge(song, on='song_id', how='left')
print(test_merged)

         user_id  song_id                            title  \
0        1717534  1005189           The Vaguest Of Feeling   
1        1302257  1042789        Breaking Up Is Hard To Do   
2        1700269  1042495                 Sail To The Moon   
3        1265736  1040200                Edge Of The Ocean   
4        1060963  1008334                          Beggin'   
...          ...      ...                              ...   
1091880  1145643  1000853                         Infinity   
1091881  1582452  1085950                        Look Away   
1091882  1512013  1004857                    One Last Time   
1091883  1304600  1000567  Wherever You Go (Album Version)   
1091884  1660182  1190323              The Hole Is The Law   

                             release      artist_name  year  artist_enc  \
0                              Blood  Franz Ferdinand  2009        7890   
1         Platinum & Gold Collection      Neil Sedaka  1962       15327   
2                  Hail To The

In [12]:
test_merged['artist_enc'] = artist_enc.transform(test_merged['artist_name'])
test_merged['album_enc'] = album_enc.transform(test_merged['release'])
print(test_merged)

         user_id  song_id                            title  \
0        1717534  1005189           The Vaguest Of Feeling   
1        1302257  1042789        Breaking Up Is Hard To Do   
2        1700269  1042495                 Sail To The Moon   
3        1265736  1040200                Edge Of The Ocean   
4        1060963  1008334                          Beggin'   
...          ...      ...                              ...   
1091880  1145643  1000853                         Infinity   
1091881  1582452  1085950                        Look Away   
1091882  1512013  1004857                    One Last Time   
1091883  1304600  1000567  Wherever You Go (Album Version)   
1091884  1660182  1190323              The Hole Is The Law   

                             release      artist_name  year  artist_enc  \
0                              Blood  Franz Ferdinand  2009        7890   
1         Platinum & Gold Collection      Neil Sedaka  1962       15327   
2                  Hail To The

In [13]:
X_new = test_merged[['user_id', 'song_id', 'artist_enc', 'album_enc']]

In [39]:
preds = xgb_model.predict(X_new)
test_merged['predicted_rating'] = preds
print(test_merged)
uid_sid = []
for row in test_merged.itertuples():
    uid_sid.append(str(row[1])+'-'+str(row[2]))
test_merged['user_id-song_id'] = uid_sid

         user_id  song_id                            title  \
0        1717534  1005189           The Vaguest Of Feeling   
1        1302257  1042789        Breaking Up Is Hard To Do   
2        1700269  1042495                 Sail To The Moon   
3        1265736  1040200                Edge Of The Ocean   
4        1060963  1008334                          Beggin'   
...          ...      ...                              ...   
1091880  1145643  1000853                         Infinity   
1091881  1582452  1085950                        Look Away   
1091882  1512013  1004857                    One Last Time   
1091883  1304600  1000567  Wherever You Go (Album Version)   
1091884  1660182  1190323              The Hole Is The Law   

                             release      artist_name  year  artist_enc  \
0                              Blood  Franz Ferdinand  2009        7890   
1         Platinum & Gold Collection      Neil Sedaka  1962       15327   
2                  Hail To The

In [45]:
outputxgb = test_merged[['user_id-song_id', 'predicted_rating']]
outputxgb.rename(columns={'predicted_rating': 'rating'}, inplace=True)
print(outputxgb)

         user_id-song_id    rating
0        1717534-1005189  5.371850
1        1302257-1042789  5.346443
2        1700269-1042495  5.342855
3        1265736-1040200  5.346250
4        1060963-1008334  5.371850
...                  ...       ...
1091880  1145643-1000853  5.442841
1091881  1582452-1085950  5.331007
1091882  1512013-1004857  5.372332
1091883  1304600-1000567  5.370944
1091884  1660182-1190323  5.295125

[1091885 rows x 2 columns]


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  outputxgb.rename(columns={'predicted_rating': 'rating'}, inplace=True)


In [47]:
outputxgb.to_csv("submission.csv", index=False)