In [1]:
import pandas as pd
import numpy as np

In [2]:
# Mount Google Drive at /content/drive; the CSV files read below live under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# Dacon Baseline

In [None]:
# Use the %pip magic rather than a !pip shell call: %pip installs into the
# environment of the kernel that is running this notebook.
%pip install surprise
from surprise import SVD, Dataset, Reader, accuracy

In [4]:
# Training data (includes the 'Book-Rating' column), read from the mounted Drive folder.
train = pd.read_csv('/content/drive/MyDrive/weit/Book/train.csv')


In [5]:
  test = pd.read_csv('/content/drive/MyDrive/weit/Book/test.csv')

In [None]:
# Display the column names of the training frame.
train.columns

Index(['ID', 'User-ID', 'Book-ID', 'Book-Rating', 'Age', 'Location',
       'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher'],
      dtype='object')

In [None]:
# Surprise expects exactly three columns: user, item, rating (in that order).
rating_columns = ['User-ID', 'Book-ID', 'Book-Rating']

# Ratings are declared to lie on a 0-10 scale.
reader = Reader(rating_scale=(0, 10))
ratings_dataset = Dataset.load_from_df(train[rating_columns], reader)

# Use every rating for fitting (no hold-out split for the baseline).
trainSVD = ratings_dataset.build_full_trainset()

In [None]:
# SVD initialises its factor matrices randomly; fixing random_state makes the
# baseline reproducible across kernel restarts.
model = SVD(random_state=42)
model.fit(trainSVD)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7835f6543df0>

In [None]:
# Submission template whose 'Book-Rating' column is filled in below.
# NOTE(review): this reads from /content, whereas the CatBoost section loads
# sample_submission.csv from the Drive folder — confirm the file exists here
# on a fresh runtime.
submit = pd.read_csv('/content/sample_submission.csv')

In [None]:
# Predict one rating per (user, book) pair of the test set and keep the
# estimate (.est) of each prediction.
# Iterating the two ID columns directly avoids DataFrame.apply(axis=1), which
# builds a full row Series for every record. The list is assigned
# positionally, so a length mismatch with `submit` raises instead of
# silently producing NaN.
submit['Book-Rating'] = [
    model.predict(user_id, book_id).est
    for user_id, book_id in zip(test['User-ID'], test['Book-ID'])
]
submit.to_csv('./baseline_submit.csv', index=False)

### Using CatBoost Regression

In [None]:
# Display (rows, columns) of the training frame.
train.shape

(871393, 10)

In [6]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Shuffled 5-fold splitter with a fixed seed.
kf = KFold(
    n_splits=5,
    shuffle=True,
    random_state=2023,
)

In [7]:
from sklearn.preprocessing import LabelEncoder

# One encoder per ID column; both are kept because their classes_ are needed
# later to encode the test set.
user_id_encoder, book_id_encoder = LabelEncoder(), LabelEncoder()

# Replace the raw IDs in train with the integer codes learned from train.
for col, enc in (('User-ID', user_id_encoder), ('Book-ID', book_id_encoder)):
    train[col] = enc.fit_transform(train[col])

In [8]:
# Learn the Age -> integer code table on train, then apply it to train.
age_le = LabelEncoder().fit(train['Age'])
train['Age'] = age_le.transform(train['Age'])

In [9]:
# Lookup tables from each original label to the integer code learned on train.
user_id_mapping = dict(zip(user_id_encoder.classes_, user_id_encoder.transform(user_id_encoder.classes_)))
book_id_mapping = dict(zip(book_id_encoder.classes_, book_id_encoder.transform(book_id_encoder.classes_)))
age_mapping = dict(zip(age_le.classes_, age_le.transform(age_le.classes_)))

# Encode test with the train codes; labels never seen in train get the
# sentinel code -1.
test['User-ID'] = test['User-ID'].map(lambda x: user_id_mapping.get(x, -1))
test['Book-ID'] = test['Book-ID'].map(lambda x: book_id_mapping.get(x, -1))
# Age uses the same dict lookup as the ID columns. The previous version called
# age_le.transform once per row after a linear scan of classes_, which yields
# the same codes but is far slower on a large test set.
test['Age'] = test['Age'].map(lambda x: age_mapping.get(x, -1))

In [10]:
# Learn the publication-year codes on train.
year_le = LabelEncoder()
train['Year-Of-Publication'] = year_le.fit_transform(train['Year-Of-Publication'])

# Encode test through a dict built once from the fitted encoder instead of
# calling year_le.transform for every row; years never seen in train get -1.
year_mapping = dict(zip(year_le.classes_, year_le.transform(year_le.classes_)))
test['Year-Of-Publication'] = test['Year-Of-Publication'].map(lambda x: year_mapping.get(x, -1))

In [11]:
# Separate encoders for the two text columns.
author_encoder = LabelEncoder()
publisher_encoder = LabelEncoder()

# Fit on train and overwrite the train columns with their integer codes.
train['Book-Author'] = author_encoder.fit_transform(train['Book-Author'])
train['Publisher'] = publisher_encoder.fit_transform(train['Publisher'])

# label -> code tables taken from the fitted encoders.
author_codes = author_encoder.transform(author_encoder.classes_)
publisher_codes = publisher_encoder.transform(publisher_encoder.classes_)
author_mapping = {label: code for label, code in zip(author_encoder.classes_, author_codes)}
publisher_mapping = {label: code for label, code in zip(publisher_encoder.classes_, publisher_codes)}

# Apply the train codes to test; unseen labels fall back to -1.
test['Book-Author'] = test['Book-Author'].map(lambda label: author_mapping.get(label, -1))
test['Publisher'] = test['Publisher'].map(lambda label: publisher_mapping.get(label, -1))

In [13]:
# Learn the Location codes on train and build the label -> code table.
location_encoder = LabelEncoder()
train['Location'] = location_encoder.fit_transform(train['Location'])
location_mapping = dict(zip(location_encoder.classes_, location_encoder.transform(location_encoder.classes_)))

# Encode test with location_mapping; locations never seen in train get -1.
# (This previously looked values up in publisher_mapping, so test locations
# were matched against publisher names rather than the train locations.)
test['Location'] = test['Location'].map(lambda x: location_mapping.get(x, -1))

In [15]:
# Use the %pip magic rather than a !pip shell call: %pip installs into the
# environment of the kernel that is running this notebook.
# (The recorded run below installed catboost 1.2.2.)
%pip install catboost
from catboost import CatBoostRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

Collecting catboost
  Downloading catboost-1.2.2-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2.2


In [16]:
# Columns that are not model inputs: the row ID, the target, and the raw title.
non_feature_columns = ['ID', 'Book-Rating', 'Book-Title']

# Hold out 20% of the training rows, with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    train.drop(columns=non_feature_columns),
    train['Book-Rating'],
    test_size=0.2,
    random_state=42,
)

In [17]:
# Target is the rating; inputs are every column except the row ID, the target
# itself, and the raw title.
target = train['Book-Rating']
features = train.drop(columns=['ID', 'Book-Rating', 'Book-Title'])

In [18]:
# 80/20 split of the training data with a fixed seed; X_test/y_test here are a
# hold-out slice of train, not the competition test set.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [19]:
# Columns passed to CatBoost as categorical features.
cat_features = [
    'Age',
    'User-ID',
    'Year-Of-Publication',
    'Location',
    'Book-Author',
    'Book-ID',
    'Publisher',
]

In [20]:
# Hyper-parameters for the regressor; training minimises RMSE.
catboost_params = dict(
    iterations=10000,
    learning_rate=0.1,
    loss_function='RMSE',
    cat_features=cat_features,
)
model = CatBoostRegressor(**catboost_params)

# Track the hold-out split during training, stop once it has not improved for
# 100 rounds, and log every 100 iterations.
model.fit(
    X_train,
    y_train,
    eval_set=(X_test, y_test),
    early_stopping_rounds=100,
    verbose=100,
)


0:	learn: 3.7728005	test: 3.7689051	best: 3.7689051 (0)	total: 2.82s	remaining: 7h 49m 37s
100:	learn: 3.3460765	test: 3.2875016	best: 3.2875016 (100)	total: 1m 14s	remaining: 2h 1m 46s
200:	learn: 3.3369985	test: 3.2797585	best: 3.2797585 (200)	total: 2m 49s	remaining: 2h 17m 32s
300:	learn: 3.3323621	test: 3.2767701	best: 3.2767701 (300)	total: 4m 42s	remaining: 2h 31m 50s
400:	learn: 3.3289977	test: 3.2752457	best: 3.2752457 (400)	total: 6m 22s	remaining: 2h 32m 44s
500:	learn: 3.3266124	test: 3.2744129	best: 3.2744129 (500)	total: 8m	remaining: 2h 31m 51s
600:	learn: 3.3241929	test: 3.2737621	best: 3.2737584 (596)	total: 9m 38s	remaining: 2h 30m 39s
700:	learn: 3.3222844	test: 3.2733056	best: 3.2733056 (700)	total: 11m 12s	remaining: 2h 28m 41s
800:	learn: 3.3203516	test: 3.2729191	best: 3.2729191 (800)	total: 12m 51s	remaining: 2h 27m 42s
900:	learn: 3.3184748	test: 3.2725849	best: 3.2725849 (900)	total: 14m 35s	remaining: 2h 27m 18s
1000:	learn: 3.3166068	test: 3.2722775	best: 3.

<catboost.core.CatBoostRegressor at 0x7f8ef32dabc0>

In [21]:
# Score the hold-out split.
pred = model.predict(X_test)
mse = mean_squared_error(y_test, pred)
rmse = np.sqrt(mse)
print(f'RMSE: {rmse:.4f}')

# List the features from most to least important.
feature_names = X_train.columns
feature_importances = model.get_feature_importance()
ranked = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked:
    print(f"{name}: {score:.2f}")

RMSE: 3.2710
User-ID: 50.42
Book-Author: 14.58
Publisher: 11.47
Location: 6.95
Year-Of-Publication: 6.71
Age: 6.36
Book-ID: 3.52


In [22]:
# Keep only the model inputs: drop the row ID and the raw title.
X_test_final = test.drop(columns=['ID', 'Book-Title'])

# Predict ratings for the test data.
y_test_pred_final = model.predict(X_test_final)

# Check the number of predictions.
print("Length of y_test_pred_final: ", len(y_test_pred_final))

Length of y_test_pred_final:  159621


In [23]:
# Build the submission file.
submit = pd.read_csv("/content/drive/MyDrive/weit/Book/sample_submission.csv")
# A regressor's raw output is not bounded, so clip it to the 0-10 rating scale
# (the scale declared for the Surprise Reader in the baseline, whose
# predictions are clipped to it by default). Clipping to the valid range can
# only move a prediction closer to the true rating.
submit["Book-Rating"] = np.clip(y_test_pred_final, 0, 10)

# Save the submission file.
submit.to_csv("catboost5_submission.csv", index=False)