In [22]:
import os
import zipfile
import shutil
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [2]:
df_list = []

# 데이터프레임 생성
for dataset in os.listdir('../dataset'):
    if dataset.endswith('.csv'):
        globals()[dataset[:-4]] = pd.read_csv(f'../dataset/{dataset}')
        print(dataset[:-4])
        df_list.append(dataset[:-4])

products
orders
order_products__train
departments
aisles
order_products__prior
sample_submission


In [3]:
orders_products = pd.concat([order_products__prior, order_products__train]).reset_index(drop=True)
orders_products

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0
...,...,...,...,...
33819101,3421063,14233,3,1
33819102,3421063,35548,4,1
33819103,3421070,35951,1,1
33819104,3421070,16953,2,1


In [4]:
len(orders[orders['eval_set']=='train'])/len(orders[orders['eval_set']=='prior'])

0.040813108072042636

In [5]:
# 재구매에 대한 예측을 하기 위해 user의 order_number가 1인 row는 드랍
orders = orders.dropna()
len(orders[orders['eval_set']=='train'])/len(orders[orders['eval_set']=='prior'])

0.04361037204208511

In [6]:
o_train = pd.merge(orders[['order_id', 'user_id', 'eval_set']], orders_products[['order_id', 'product_id', 'reordered']], on='order_id')
o_train

Unnamed: 0,order_id,user_id,eval_set,product_id,reordered
0,2398795,1,prior,196,1
1,2398795,1,prior,10258,0
2,2398795,1,prior,12427,1
3,2398795,1,prior,13176,0
4,2398795,1,prior,26088,1
...,...,...,...,...,...
31741033,272231,206209,train,40603,0
31741034,272231,206209,train,15655,0
31741035,272231,206209,train,42606,0
31741036,272231,206209,train,37966,0


In [7]:
train_df = o_train[o_train['eval_set']=='prior'].drop(columns=['order_id', 'eval_set'])
test_df = o_train[o_train['eval_set']=='train'].drop(columns=['order_id', 'eval_set'])

In [8]:
from scipy import sparse

# 유저가 아이템을 구매한 횟수를 confidence (c_ui)값로 사용하여 CSR 희소 행렬 생성
sparse_user_item = sparse.csr_matrix((train_df['reordered'].astype(float),
                                      (train_df['user_id'], train_df['product_id'])))
sparse_user_item

<206210x49689 sparse matrix of type '<class 'numpy.float64'>'
	with 12439884 stored elements in Compressed Sparse Row format>

In [11]:
import implicit

# ALS 모델 인스턴스 생성
model = implicit.als.AlternatingLeastSquares(
    factors=128,                 # 잠재 요인의 수
    regularization=1e-4,         # 정규화 파라미터
    iterations=20,             # 학습 반복 횟수
    calculate_training_loss=True,  # 학습 손실 계산 여부
    use_gpu=False               # GPU 사용 여부
)

In [12]:
model.fit(sparse_user_item)

100%|██████████| 20/20 [06:16<00:00, 18.82s/it, loss=0.000935]


In [13]:
# User Matrix와 Item Matrix의 크기 확인
user_vecs = model.user_factors  # User Matrix
item_vecs = model.item_factors  # Item Matrix

# 각 행렬의 차원(크기) 출력
print(user_vecs.shape)
print(item_vecs.shape)

(206210, 128)
(49689, 128)


In [17]:
# 정밀도와 재현율 계산 함수
def get_precision(relevant, recommend):
    _intersection = set(recommend).intersection(set(relevant))
    return len(_intersection) / len(recommend)

def get_recall(relevant, recommend):
    _intersection = set(recommend).intersection(set(relevant))
    return len(_intersection) / len(relevant)

In [14]:
%%time
# 특정 사용자(여기서는 사용자 0)에게 추천할 아이템 10개 선정 (filter_already_liked_items=False)
recommended_items, scores = model.recommend(1, sparse_user_item[1], N=10, filter_already_liked_items=True)

# 추천 결과와 점수 출력
recommended_results = [(item, score) for item, score in zip(recommended_items, scores)]
print(recommended_results)

[(37710, 0.53154504), (31651, 0.5014758), (6184, 0.4962879), (43154, 0.4077609), (41400, 0.4053325), (22802, 0.39193437), (18023, 0.37370732), (3298, 0.36634302), (45051, 0.3658828), (11759, 0.3430489)]
CPU times: user 128 ms, sys: 59.9 ms, total: 188 ms
Wall time: 68.1 ms


In [20]:
get_precision(o_train[(o_train['user_id']==1) & (o_train['eval_set']=='train')]['product_id'], recommended_items)

0.0

In [23]:
from tqdm import tqdm

# 추천할 아이템의 수를 설정합니다.
k = 11

# 정밀도, 재현율, NDCG를 저장할 리스트를 초기화합니다.
precisions, recalls, ndcgs = [], [], []

# Test 데이터셋의 각 사용자에 대해 반복합니다.
for user_id, user_df in tqdm(test_df.groupby('user_id')):
    # 해당 사용자가 구매한 아이템 목록을 추출합니다.
    buy_items = list(user_df['product_id'])

    # 추천할 아이템을 결정합니다.
    recommended_items, scores = model.recommend(user_id, sparse_user_item[user_id], N=k, filter_already_liked_items=False)
    top_k_item =recommended_items

    # 정밀도, 재현율, NDCG를 계산하여 리스트에 추가합니다.
    precisions.append(get_precision(buy_items, top_k_item))
    recalls.append(get_recall(buy_items, top_k_item))

# 각 지표의 평균 값을 출력합니다.
print(f'\nprecision@k: {np.mean(precisions)}')
print(f'recall@k: {np.mean(recalls)}')


100%|██████████| 131209/131209 [04:35<00:00, 475.95it/s]



precision@k: 0.15668756092812372
recall@k: 0.1941663110636375


In [25]:
F1 = 2 * (np.mean(precisions) * np.mean(recalls)) / (np.mean(precisions) + np.mean(recalls))
F1

0.17342516713446518

In [27]:
for user_id, user_df in tqdm(test_df.groupby('user_id')):
    

array([13176, 24852, 43961, 19348, 36011, 10957,  5479, 25837, 28156,
       46676, 23909], dtype=int32)

In [31]:
user_df

Unnamed: 0,user_id,product_id,reordered
31741030,206209,6846,1
31741031,206209,9405,1
31741032,206209,24852,1
31741033,206209,40603,0
31741034,206209,15655,0
31741035,206209,42606,0
31741036,206209,37966,0
31741037,206209,39216,1


In [None]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd

# Load your dataset
# For this example, let's assume you have a CSV file
data = pd.read_csv('your_dataset.csv')

# Specify the feature columns and the target column
features = data.drop(columns=['target'])  # Replace 'target' with your actual target column name
target = data['target']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create a LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train)
test_data = lgb.Dataset(X_test, label=y_test, reference=train_data)

# Set the parameters for LightGBM
params = {
    'boosting_type': 'gbdt',
    'objective': 'regression',  # Change to 'binary' or 'multiclass' for classification
    'metric': 'rmse',           # Change to 'binary_logloss' or 'multi_logloss' for classification
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

# Train the model
num_boost_round = 100
bst = lgb.train(params, train_data, num_boost_round, valid_sets=[test_data], early_stopping_rounds=10)

# Make predictions
y_pred = bst.predict(X_test, num_iteration=bst.best_iteration)

# Evaluate the model
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse}')

Collecting lightgbm
  Using cached lightgbm-4.3.0.tar.gz (1.7 MB)
  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Installing backend dependencies ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: lightgbm
  Building wheel for lightgbm (pyproject.toml) ... [?25l-^C
[?25canceled
