In [9]:

import argparse
import copy
import os
from enum import Enum
from typing import Dict, List, Union

import numpy as np
import pandas as pd
import torch
from config.configuration import Config
from data.dataloader import GeneralTrainerDataLoader
from data.dataset import GeneralDataset
from data.interaction import Interaction
from data.utils import data_reparation
from models import NeuMF
from root import DATASET_DIR, ROOT_DIR, absolute
from torch.nn.utils import rnn as rnn_utils
from torch.utils.data import Dataset as TorchDataset
from trainer import Trainer
from utils.logger import init_logger
import hashlib
from FlagEmbedding import FlagModel


### 抓取日志中的信息

In [10]:
import os
import re

# Define the path to the directory (this is an example and needs to be adjusted to the actual path)
log_directory = 'log/XXX'

# Initialize the list to hold all the data
data = []

# Define the regex patterns to extract lr and bs from the first line and rmse and mae from the last line
lr_bs_pattern = re.compile(r"lr:(\d+\.?\d*), bs:(\d+)")
rmse_mae_pattern = re.compile(r"rmse', (\d+\.\d+).*'mae', (\d+\.\d+)")


# Function to extract lr and bs from the first line
def extract_lr_bs(first_line):
    match = lr_bs_pattern.search(first_line)
    if match:
        return match.groups()
    else:
        return None

# Function to extract rmse and mae from the last line
def extract_rmse_mae(last_line):
    match = rmse_mae_pattern.search(last_line)
    if match:
        return match.groups()
    else:
        return None

# Loop over each file in the directory
for file_name in os.listdir(log_directory):
    # Check if the file is a log file
    if file_name.endswith('.log'):  # Assuming the log files have a .log extension
        # Construct the full file path
        file_path = os.path.join(log_directory, file_name)
        
        # Read the first and last line of the file
        with open(file_path, 'r') as file:
            first_line = file.readline().strip()
            file.seek(0, os.SEEK_END)  # Seek to the end of the file
            file.seek(file.tell() - 1024, os.SEEK_SET)  # Go back 1024 bytes from the end of the file
            last_lines = file.readlines()  # Read to the end
            last_line = last_lines[-1].strip() if last_lines else None
        
        # Extract lr and bs from the first line
        lr_bs = extract_lr_bs(first_line)
        # Extract rmse and mae from the last line
        rmse_mae = extract_rmse_mae(last_line)

        # If both values were found, add them to the data list
        if lr_bs and rmse_mae:
            data.append([file_name] + list(lr_bs) + list(rmse_mae))

# Convert the data to a pandas DataFrame for better display and analysis
import pandas as pd

# Define the column names
columns = ['File Name', 'Learning Rate (lr)', 'Batch Size (bs)', 'RMSE', 'MAE']

# Create the DataFrame
df = pd.DataFrame(data, columns=columns)

# Save the DataFrame to a CSV file
csv_file_path = './jextracted_log_data.csv'
df.to_csv(csv_file_path, index=False)

# df.head()  # Display the first few rows of the DataFrame

df = df[df.columns[1:]]
df

Unnamed: 0,Learning Rate (lr),Batch Size (bs),RMSE,MAE
0,0.01,128,1.2813,0.3405
1,0.005,1024,1.2988,0.3488
2,0.005,2048,1.2794,0.3424
3,0.001,128,1.2791,0.3403
4,0.01,256,1.2856,0.3391
5,0.001,256,1.2894,0.3433
6,0.001,512,1.2782,0.3404
7,0.01,1024,1.2885,0.339
8,0.1,128,2.0598,0.752
9,0.0005,128,1.2797,0.3401


In [11]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import pandas as pd

fig = plt.figure(figsize=(8,5),dpi=180)
ax = Axes3D(fig)

x_name = "lr"
y_name = "bs"

xx = df["Learning Rate (lr)"]
yy = df["Batch Size (bs)"]

x = list(map(float, np.unique(xx)))
y = list(map(float,np.unique(yy)))

ax.set(xticks=x)
ax.set(yticks=y)

X,Y = np.meshgrid(x, y)
Z = np.zeros_like(X)

for row in range(len(X)):
    for col in range(len(X[0])):
        Z[row][col] = df[df["Learning Rate (lr)"] == X[row][col] and df["Batch Size (bs)"] == Y[row][col]]
print(Z)

# ax.set_title("MAE")
# data = np.array(df["MAE"])
# print(len(x), len(y), len(data))
# surf = ax.plot_surface(
#                 X,Y,data,
#                 rstride=1, # rstride（row）指定行的跨度
#                 cstride=1, # cstride(column)指定列的跨度
#                 cmap=plt.get_cmap('rainbow') # 设置颜色映射
#             ) # 这个的含义


# fig.colorbar(surf, shrink=0.8, aspect=13, pad=0)


# ax.w_xaxis.set_pane_color([1.0,1.0,1.0,1.0]) # 围墙
# ax.w_yaxis.set_pane_color([1.0,1.0,1.0,1.0]) # 围墙
# ax.w_zaxis.set_pane_color([1.0,1.0,1.0,1.0]) # 围墙

# ax.set_xlabel(x_name,size=15)
# ax.set_ylabel(y_name,size=15)
# ax.set_zlabel(zlabel_name,{"rotation":"vertical"},size=15)


# ax.grid(False)
# ax.view_init(elev=25,azim=55) # 这个是怎么用的

    

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

<Figure size 1440x900 with 0 Axes>

In [None]:
from sentence_transformers import SentenceTransformer
sentences_1 = ["样例数据-1", "样例数据-2"]
sentences_2 = ["样例数据-3", "样例数据-4"]
model = SentenceTransformer('BAAI/bge-large-zh-v1.5')
embeddings_1 = model.encode(sentences_1, normalize_embeddings=True)
embeddings_2 = model.encode(sentences_2, normalize_embeddings=True)
similarity = embeddings_1 @ embeddings_2.T
print(similarity)

### TODO
- 添加验证集
- NGCF
- LightGCN
- 所有代码整体过一遍
- tensorboard
- 训练参数整理
- checkpoint逻辑
- 日志 ✅

In [None]:

parser = argparse.ArgumentParser()
parser.add_argument(
    "--dataset", "-d", type=str, default="wsdream-rt", help="name of datasets"
)

args, _ = parser.parse_known_args()

config = Config(model="NeuMF", dataset=args.dataset)


init_logger(config)

dataset = GeneralDataset(config)
train_data, test_data = data_reparation(config, dataset)
model = NeuMF(config, dataset)
trainer = Trainer(config, model)

trainer.fit(train_data, test_data, saved=False, show_progress=True)
