In [1]:
import pandas as pd
import dgl
import torch
from itertools import combinations
from sklearn.metrics import jaccard_score
import os
from tqdm import tqdm
import re
import pickle
import numpy as np
from scipy.spatial import distance
import dgl
from tqdm import tqdm
from itertools import combinations
from stepmix import StepMix

讀取資料

In [2]:

df_ratings = pd.read_csv('D:\\CODE\\multi-model knowledge graph multi-graph recommendation system\\data\\cleanuser_rating.csv')
crew_info_df = pd.read_csv('D:\\CODE\\multi-model knowledge graph multi-graph recommendation system\\data\\final_data_cleaned.csv')

In [3]:
def extract_roles_and_initialize_graph(crew_info_df):
    """从crew_info数据中提取所有唯一的角色类型，并为每个角色创建唯一的ID映射。"""
    roles = {}  # 使用字典来存储每个角色的ID映射
    edge_types = []
    for _, row in crew_info_df.iterrows():
        crew_info = parse_crew_info(row['crew_info'])
        for role, ids in crew_info.items():
            if role not in roles:
                roles[role] = {}
            for person_id in ids:
                if person_id not in roles[role]:
                    roles[role][person_id] = len(roles[role])
            edge_types.append(('movie', f'has_{role}', role))
    return roles, edge_types

def parse_crew_info(crew_info_str):
    """解析制作团队信息字符串，返回字典格式：{角色: [id, ...]}"""
    # 使用正则表达式匹配所有的角色和ID组合
    pattern = re.compile(r"(\w+): ([\w, ]+)")
    matches = pattern.findall(crew_info_str)
    crew_info = {}
    for role, ids in matches:
        # 将ID字符串分割并去除空格，然后转换为列表
        crew_info[role] = ids.replace(' ', '').split(',')
    return crew_info


# 以下為目前知識圖的建構完整版
# 第一版 下面的是第二版加入了movieId
再來只需要把提取出的特徵放入就完成了

In [12]:
from stepmix import StepMix

import numpy as np
user_movie_matrix = pd.pivot_table(df_ratings, index='userId', columns='movieId', aggfunc='size', fill_value=0)

# 將 pivot table 轉換為 0 或 1，表示是否觀看
user_movie_matrix = (user_movie_matrix > 0).astype(int)

# 轉換成適用於 StepMix 的數據格式
data = user_movie_matrix.values
def hellinger_distance(p, q):
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)

def compute_user_similarity(membership_probabilities, user_ids, threshold):
    num_users = len(user_ids)
    user_similarity_edges = []
    for i in range(num_users):
        for j in range(i + 1, num_users):
            dist = hellinger_distance(membership_probabilities[i], membership_probabilities[j])
            if dist <= threshold:
                user_similarity_edges.append((i, j))
    return user_similarity_edges


def create_heterogeneous_graph(df_ratings, crew_info_df, Uthreshold=0.5, threshold=0.7):
    user_ids = df_ratings['userId'].unique()
    
    movie_ids = df_ratings['movieId'].unique()
    user_id_map = {uid: i for i, uid in enumerate(user_ids)}
    movie_id_map = {mid: i for i, mid in enumerate(movie_ids)}
    
    
    roles, edge_types = extract_roles_and_initialize_graph(crew_info_df)
    
    edge_types.extend([
        ('movie', 'has_image', 'movieimage'),
        ('movie', 'has_text', 'movietext'),
        ('movie', 'has_audio', 'movieaudio'),
        ('user', 'rates', 'movie'),  
        ('user', 'similar', 'user'), 
        ('movie', 'similar', 'movie') 
    ])
    g = dgl.heterograph({etype: [] for etype in edge_types},
                        num_nodes_dict={
                            'user': len(user_ids), 
                            'movie': len(movie_ids),
                            **{role: len(ids) for role, ids in roles.items()},
                            'movieimage': len(movie_ids),  
                            'movietext': len(movie_ids),
                            'movieaudio': len(movie_ids)
                        })
    for idx, row in tqdm(crew_info_df.iterrows(), total=crew_info_df.shape[0], desc="Processing crew info"):
        movie_id = row['movieId']
        if movie_id in movie_id_map:
            crew_info = parse_crew_info(row['crew_info'])
            for role, ids in crew_info.items():
                src = []
                dst = []
                for person_id in ids:
                    src.append(movie_id_map[movie_id])
                    dst.append(roles[role][person_id])
                g.add_edges(src, dst, etype=('movie', f'has_{role}', role))


    user_movie_matrix = df_ratings.pivot_table(index='userId', columns='movieId', aggfunc='size', fill_value=0)
    
    model = StepMix(n_components=3, measurement='categorical')
    model.fit(data)
    membership_probabilities = model.predict_proba(data)
    user_similarity = compute_user_similarity(membership_probabilities, user_ids, Uthreshold)
    movie_similarity = [(movie_id_map[m1], movie_id_map[m2]) for m1, m2 in combinations(movie_ids, 2)
                        if jaccard_score(user_movie_matrix[m1].fillna(0), user_movie_matrix[m2].fillna(0)) >= threshold]
    
    if user_similarity:
        src, dst = zip(*user_similarity)
        g.add_edges(src, dst, etype=('user', 'similar', 'user'))
    if movie_similarity:
        src, dst = zip(*movie_similarity)
        g.add_edges(src, dst, etype=('movie', 'similar', 'movie'))

    user_movie_interactions = [(user_id_map[row['userId']], movie_id_map[row['movieId']])
                               for index, row in df_ratings.iterrows()]
    if user_movie_interactions:
        src, dst = zip(*user_movie_interactions)
        g.add_edges(src, dst, etype=('user', 'rates', 'movie'))
    g.nodes['movie'].data['movie_id'] = torch.tensor([int(mid) for mid in movie_ids], dtype=torch.int64)

    return g


目前特徵提取的創建圖

In [14]:
hetero_graph = create_heterogeneous_graph(df_ratings, crew_info_df)
with open('new_hetero_graph05.pkl', 'wb') as f:
    pickle.dump(hetero_graph, f)
    
print(hetero_graph)
print(hetero_graph.ntypes, hetero_graph.etypes)

Processing crew info: 100%|██████████| 6011/6011 [00:19<00:00, 314.26it/s]


Fitting StepMix...


Initializations (n_init) : 100%|██████████| 1/1 [24:52<00:00, 1492.22s/it, max_LL=-1.81e+6, max_avg_LL=-149]


節點和邊的數量:
actor節點數量: 4566
actress節點數量: 3701
composer節點數量: 2011
director節點數量: 4010
editor節點數量: 2489
movie節點數量: 5996
movieaudio節點數量: 5996
movieimage節點數量: 5996
movietext節點數量: 5996
producer節點數量: 3952
self節點數量: 1049
user節點數量: 12171
writer節點數量: 3177
('movie', 'has_actor', 'actor')邊數量: 9809
('movie', 'has_actress', 'actress')邊數量: 8327
('movie', 'has_audio', 'movieaudio')邊數量: 0
('movie', 'has_composer', 'composer')邊數量: 5923
('movie', 'has_director', 'director')邊數量: 10163
('movie', 'has_editor', 'editor')邊數量: 4089
('movie', 'has_image', 'movieimage')邊數量: 0
('movie', 'has_producer', 'producer')邊數量: 9891
('movie', 'has_self', 'self')邊數量: 2138
('movie', 'has_text', 'movietext')邊數量: 0
('movie', 'has_writer', 'writer')邊數量: 7036
('movie', 'similar', 'movie')邊數量: 4852
('user', 'rates', 'movie')邊數量: 549919
('user', 'similar', 'user')邊數量: 28889524

('movie', 'has_actor', 'actor')
('movie', 'has_actress', 'actress')
('movie', 'has_audio', 'movieaudio')
('movie', 'has_composer', 'composer')
('movie', 'has_director', 'director')
('movie', 'has_editor', 'editor')
('movie', 'has_image', 'movieimage')
('movie', 'has_producer', 'producer')
('movie', 'has_self', 'self')
('movie', 'has_text', 'movietext')
('movie', 'has_writer', 'writer')
('movie', 'similar', 'movie')
('user', 'rates', 'movie')
('user', 'similar', 'user')

檢查預期的邊類型:
邊 ('user', 'rates', 'movie') 存在.
邊 ('movie', 'similar', 'movie') 存在.
邊 ('user', 'similar', 'user') 存在.
邊 ('movie', 'has_image', 'movieimage') 存在.
邊 ('movie', 'has_text', 'movietext') 存在.
邊 ('movie', 'has_audio', 'movieaudio') 存在.

In [16]:
import dgl
import pickle

with open('hetero_graph02.pkl', 'rb') as f:
    hetero_graph = pickle.load(f)

# 列印元圖，顯示所有的節點類型和邊類型
print("元圖結構:")
print(hetero_graph.metagraph().edges())

print("\n節點和邊的數量:")
for ntype in hetero_graph.ntypes:
    print(f"{ntype}節點數量: {hetero_graph.number_of_nodes(ntype)}")
for etype in hetero_graph.canonical_etypes:
    print(f"{etype}邊數量: {hetero_graph.number_of_edges(etype)}")

# 检檢查特定的邊類型是否存在
expected_edges = [
    ('user', 'rates', 'movie'),
    ('movie', 'similar', 'movie'),
    ('user', 'similar', 'user'),
    ('movie', 'has_image', 'movieimage'),
    ('movie', 'has_text', 'movietext'),
    ('movie', 'has_audio', 'movieaudio')
]

print("\n檢查預期的邊類型:")
for edge in expected_edges:
    if edge in hetero_graph.canonical_etypes:
        print(f"邊 {edge} 存在.")
    else:
        print(f"邊 {edge} 不存在.")


元圖結構:
[('movie', 'actor'), ('movie', 'actress'), ('movie', 'movieaudio'), ('movie', 'composer'), ('movie', 'director'), ('movie', 'editor'), ('movie', 'movieimage'), ('movie', 'producer'), ('movie', 'self'), ('movie', 'movietext'), ('movie', 'writer'), ('movie', 'movie'), ('user', 'movie'), ('user', 'user')]

節點和邊的數量:
actor節點數量: 4566
actress節點數量: 3701
composer節點數量: 2011
director節點數量: 4010
editor節點數量: 2489
movie節點數量: 5996
movieaudio節點數量: 5996
movieimage節點數量: 5996
movietext節點數量: 5996
producer節點數量: 3952
self節點數量: 1049
user節點數量: 12171
writer節點數量: 3177
('movie', 'has_actor', 'actor')邊數量: 9809
('movie', 'has_actress', 'actress')邊數量: 8327
('movie', 'has_audio', 'movieaudio')邊數量: 0
('movie', 'has_composer', 'composer')邊數量: 5923
('movie', 'has_director', 'director')邊數量: 10163
('movie', 'has_editor', 'editor')邊數量: 4089
('movie', 'has_image', 'movieimage')邊數量: 0
('movie', 'has_producer', 'producer')邊數量: 9891
('movie', 'has_self', 'self')邊數量: 2138
('movie', 'has_text', 'movietext')邊數量: 0
('movie',

# 之後要跑特徵提取使用特徵提取資料夾裡面的PY檔案

In [18]:
import torchvision.models as models
import torchvision.transforms as transforms
import pandas as pd
from moviepy.editor import VideoFileClip
import torchvision.transforms as transforms
import torchvision.models as models
import torch
from PIL import Image
import numpy as np

# resnet特徵提取

In [12]:
# 加载图
with open('hetero_graph.pkl', 'rb') as f:
    hetero_graph = pickle.load(f)

# 检查 'movie' 节点是否有 'movie_id' 属性
if 'movie_id' in hetero_graph.nodes['movie'].data:
    print("Movie IDs are available in the graph.")
    print("Sample Movie IDs:", hetero_graph.nodes['movie'].data['movie_id'][:5])
else:
    print("Movie IDs are not set in the graph.")


Movie IDs are available in the graph.
Sample Movie IDs: tensor([54995, 57368, 60161, 62956, 64614])


In [22]:
import pandas as pd
import torchvision.transforms as transforms
import torchvision.models as models
import torch
from PIL import Image
from moviepy.editor import VideoFileClip
import numpy as np
from tqdm import tqdm
import os
import pickle
import dgl

def load_youtube_to_movie_mapping(filepath):
    df = pd.read_csv(filepath)
    # 從 movieId 到 youtubeId 建立字典
    movie_to_youtube = pd.Series(df.youtubeId.values, index=df.movieId.astype(str)).to_dict()
    return movie_to_youtube

# def load_pretrained_resnet50():
#     resnet50 = models.resnet50(pretrained=True)
#     resnet50.eval()
#     resnet50 = resnet50.to('cuda')
#     return resnet50
def load_pretrained_resnet50():
    resnet50 = models.resnet50(pretrained=True)
    # 修改模型以仅使用到池化层的输出
    resnet50 = torch.nn.Sequential(*list(resnet50.children())[:-1])
    resnet50.eval()
    resnet50 = resnet50.to('cuda')
    return resnet50

def preprocess_frames(frames, size=(224, 224)):
    preprocess = transforms.Compose([
        transforms.Resize(size),
        transforms.CenterCrop(size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    batch_tensor = torch.stack([preprocess(Image.fromarray(frame)) for frame in frames])
    return batch_tensor.to('cuda')

# def extract_features_from_video(video_path, model, preprocess, start_sec=10, end_sec=30, fps=1, batch_size=32):
#     try:
#         clip = VideoFileClip(video_path).subclip(start_sec, end_sec)
#         total_frames = int((end_sec - start_sec) * fps)
#         frame_features = []
#         batch = []
#         # 將 tqdm 添加到迭代影片幀的循環中
#         for frame in tqdm(clip.iter_frames(fps=fps, dtype='uint8'), total=total_frames, desc=f"Processing {os.path.basename(video_path)}"):
#             batch.append(frame)
#             if len(batch) == batch_size:
#                 img_tensor = preprocess(batch)
#                 with torch.no_grad():
#                     features = model(img_tensor)
#                 frame_features.extend(features.cpu().numpy())
#                 batch = []  # 清空批量列表以進行下一批次的處理

#         # 處理剩餘的幀（如果有）
#         if batch:
#             img_tensor = preprocess(batch)
#             with torch.no_grad():
#                 features = model(img_tensor)
#             frame_features.extend(features.cpu().numpy())

#         clip.close()
#         return np.mean(frame_features, axis=0)
#     except Exception as e:
#         print(f"Error processing video {video_path}: {str(e)}")
#         return None

def extract_features_from_video(video_path, model, preprocess, start_sec=10, end_sec=30, fps=1, batch_size=32):# start_sec=10, end_sec=30是預告片的秒數之後要改回3-180
    try:
        clip = VideoFileClip(video_path).subclip(start_sec, end_sec)
        total_frames = int((end_sec - start_sec) * fps)
        frame_features = []
        batch = []
        for frame in tqdm(clip.iter_frames(fps=fps, dtype='uint8'), total=total_frames, desc=f"Processing {os.path.basename(video_path)}"):
            batch.append(frame)
            if len(batch) == batch_size:
                img_tensor = preprocess(batch)
                with torch.no_grad():
                    features = model(img_tensor)
                    # 确保特征被展平到正确的维度
                    features = features.view(features.size(0), -1)
                frame_features.extend(features.cpu().numpy())
                batch = []  # Reset batch

        # 处理剩余的帧（如果有）
        if batch:
            img_tensor = preprocess(batch)
            with torch.no_grad():
                features = model(img_tensor)
                # 确保特征被展平到正确的维度
                features = features.view(features.size(0), -1)
            frame_features.extend(features.cpu().numpy())

        clip.close()
        return np.mean(frame_features, axis=0)
    except Exception as e:
        print(f"Error processing video {video_path}: {str(e)}")
        return None

def add_features_to_graph(g, features_dict):
    # 获取电影图像节点的数量
    num_movies = g.number_of_nodes('movieimage')
    
    # 预设特征矩阵，假设我们事先知道特征的维度，例如使用ResNet50提取的特征维度是2048
    num_features = 2048  # 或者根据第一个有效特征动态获取
    features_tensor = torch.zeros((num_movies, num_features))

    # 添加特征到图中
    movie_indices = []
    for movie_id, features in features_dict.items():
        # 尝试获取对应movie_id的索引，假设movie和movieimage索引已对齐
        try:
            idx = g.nodes['movie'].data['movie_id'].tolist().index(int(movie_id))
            movie_indices.append(idx)
            if features is not None:
                # 转换特征为torch张量
                feature_tensor = torch.tensor(features, dtype=torch.float32)
                # 确保特征维度正确
                if feature_tensor.numel() == num_features:
                    features_tensor[idx] = feature_tensor
                else:
                    print(f"特征维度不一致: {feature_tensor.numel()} 期望: {num_features}")
            else:
                # 如果没有特征也更新索引，但已初始化为0
                print(f"无特征数据: movie_id {movie_id}")
        except ValueError:
            print(f"movie_id {movie_id} 在图中找不到对应的索引")

    # 更新图中的特征数据
    g.nodes['movieimage'].data['features'] = features_tensor
    print("特征已成功添加到图中。")

# 加载现有图
with open('hetero_graph.pkl', 'rb') as f:
    hetero_graph = pickle.load(f)

# 执行特征提取和映射
movie_to_youtube = load_youtube_to_movie_mapping('D:\\CODE\\multi-model knowledge graph multi-graph recommendation system\\data\\ml-youtube_cleaned.csv')
resnet50 = load_pretrained_resnet50()
video_folder = 'D:\\CODE\\multi-model knowledge graph multi-graph recommendation system\\data\\videos'
features_dict = {}


# Extract and store features for the first 10 movies per user
user_movie_edges = hetero_graph.edges(etype=('user', 'rates', 'movie'))
features_dict = {}

# Process only the first 10 movies for each user 這邊是取每個使用者的前10部電影的預告片之後要改回所有的預告片
for user_id in torch.unique(user_movie_edges[0]):
    movie_ids = user_movie_edges[1][user_movie_edges[0] == user_id][:10]  # Only take first 10 movies
    for movie_id in movie_ids:
        movie_id = hetero_graph.nodes['movie'].data['movie_id'][movie_id].item()  # Get actual movie ID
        youtube_id = movie_to_youtube.get(str(movie_id))
        if youtube_id:
            video_path = os.path.join(video_folder, f"{youtube_id}.mp4")
            if os.path.exists(video_path):
                features = extract_features_from_video(video_path, resnet50, preprocess_frames)
                if features is not None:
                    features_dict[str(movie_id)] = features

# # 加上 tqdm 進度條追蹤提取特徵的進度
# for movie_id, youtube_id in tqdm(movie_to_youtube.items(), desc="Extracting features from videos"):
#     video_path = os.path.join(video_folder, f"{youtube_id}.mp4")
#     if os.path.exists(video_path):
#         features = extract_features_from_video(video_path, resnet50)
#         features_dict[movie_id] = features

add_features_to_graph(hetero_graph, features_dict)

# 保存更新后的图像特征图
with open('hetero_graph_with_images.pkl', 'wb') as f:
    pickle.dump(hetero_graph, f)


Processing IvNkGm8mxiM.mp4: 100%|██████████| 20/20 [00:00<00:00, 104.99it/s]
Processing xGwC7U6Sf3o.mp4: 100%|██████████| 20/20 [00:00<00:00, 203.67it/s]
Processing 0aHJmM2uvBY.mp4: 100%|██████████| 20/20 [00:00<00:00, 269.47it/s]
Processing vLkBUix_D3U.mp4: 100%|██████████| 20/20 [00:01<00:00, 18.90it/s]
Processing unSbtED22Fw.mp4: 100%|██████████| 20/20 [00:00<00:00, 168.80it/s]
Processing YkHmYJmfuWg.mp4: 100%|██████████| 20/20 [00:00<00:00, 105.78it/s]
Processing -NeQ6aGWX74.mp4: 100%|██████████| 20/20 [00:00<00:00, 25.81it/s]
Processing 4aamMJ_8qZ4.mp4: 100%|██████████| 20/20 [00:00<00:00, 47.53it/s]
Processing tDe3kbmTlCE.mp4: 100%|██████████| 20/20 [00:00<00:00, 60.97it/s]
Processing AIzbwV7on6Q.mp4: 100%|██████████| 20/20 [00:01<00:00, 15.30it/s]
Processing t2koYVqwzT4.mp4: 100%|██████████| 20/20 [00:00<00:00, 37.93it/s]
Processing YOOIK0baLvM.mp4: 100%|██████████| 20/20 [00:03<00:00,  5.02it/s]
Processing jj6wcUes1no.mp4: 100%|██████████| 20/20 [00:01<00:00, 18.09it/s]
Process

特征已成功添加到图中。


多圖+一秒取一幀

In [None]:
import pandas as pd
import torchvision.transforms as transforms
import torchvision.models as models
import torch
from PIL import Image
from moviepy.editor import VideoFileClip
import numpy as np
from tqdm import tqdm
import os
import pickle
import dgl

def load_youtube_to_movie_mapping(filepath):
    df = pd.read_csv(filepath)
    # 从 movieId 到 youtubeId 创建字典
    movie_to_youtube = pd.Series(df.youtubeId.values, index=df.movieId.astype(str)).to_dict()
    return movie_to_youtube

def load_pretrained_resnet50():
    resnet50 = models.resnet50(pretrained=True)
    # 将模型修改为仅到池化层的输出
    resnet50 = torch.nn.Sequential(*list(resnet50.children())[:-1])
    resnet50.eval()
    resnet50 = resnet50.to('cuda')
    return resnet50

def preprocess_frames(frames, size=(224, 224)):
    preprocess = transforms.Compose([
        transforms.Resize(size),
        transforms.CenterCrop(size),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    batch_tensor = torch.stack([preprocess(Image.fromarray(frame)) for frame in frames])
    return batch_tensor.to('cuda')

def extract_features_from_video(video_path, model, preprocess, start_sec=10, end_sec=30, fps=1, batch_size=32):
    try:
        clip = VideoFileClip(video_path).subclip(start_sec, end_sec)
        total_frames = int((end_sec - start_sec) * fps)
        frame_features = []
        batch = []
        for frame in tqdm(clip.iter_frames(fps=fps, dtype='uint8'), total=total_frames, desc=f"Processing {os.path.basename(video_path)}"):
            batch.append(frame)
            if len(batch) == batch_size:
                img_tensor = preprocess(batch)
                with torch.no_grad():
                    features = model(img_tensor)
                    features = features.view(features.size(0), -1)  # 确保特征被展平
                frame_features.extend(features.cpu().numpy())
                batch = []  # Reset batch

        if batch:  # 处理剩余的帧
            img_tensor = preprocess(batch)
            with torch.no_grad():
                features = model(img_tensor)
                features = features.view(features.size(0), -1)
            frame_features.extend(features.cpu().numpy())

        clip.close()
        return np.mean(frame_features, axis=0)
    except Exception as e:
        print(f"Error processing video {video_path}: {str(e)}")
        return None

def add_features_to_graph(g, features_dict):
    num_movies = g.number_of_nodes('movieimage')
    num_features = 2048
    features_tensor = torch.zeros((num_movies, num_features))

    for movie_id, features in features_dict.items():
        try:
            idx = g.nodes['movie'].data['movie_id'].tolist().index(int(movie_id))
            if features is not None:
                feature_tensor = torch.tensor(features, dtype=torch.float32)
                if feature_tensor.numel() == num_features:
                    features_tensor[idx] = feature_tensor
                else:
                    print(f"特征维度不一致: {feature_tensor.numel()} 期望: {num_features}")
            else:
                print(f"无特征数据: movie_id {movie_id}")
        except ValueError:
            print(f"movie_id {movie_id} 在图中找不到对应的索引")

    g.nodes['movieimage'].data['features'] = features_tensor
    print("特征已成功添加到图中。")

movie_to_youtube = load_youtube_to_movie_mapping('D:\\CODE\\multi-model knowledge graph multi-graph recommendation system\\data\\ml-youtube_cleaned.csv')
resnet50 = load_pretrained_resnet50()
video_folder = 'D:\\CODE\\multi-model knowledge graph multi-graph recommendation system\\data\\videos'
graphs_to_process = [r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph03.pkl', r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph05.pkl', r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph02.pkl']

for graph_path in graphs_to_process:
    with open(graph_path, 'rb') as f:
        hetero_graph = pickle.load(f)

    features_dict = {}
    user_movie_edges = hetero_graph.edges(etype=('user', 'rates', 'movie'))
    for user_id in torch.unique(user_movie_edges[0]):
        movie_ids = user_movie_edges[1][user_movie_edges[0] == user_id]
        for movie_id in movie_ids:
            actual_movie_id = hetero_graph.nodes['movie'].data['movie_id'][movie_id].item()
            youtube_id = movie_to_youtube.get(str(actual_movie_id))
            if youtube_id:
                video_path = os.path.join(video_folder, f"{youtube_id}.mp4")
                if os.path.exists(video_path):
                    features = extract_features_from_video(video_path, resnet50, preprocess_frames)
                    if features is not None:
                        features_dict[str(actual_movie_id)] = features

    add_features_to_graph(hetero_graph, features_dict)
    updated_graph_path = graph_path.replace('.pkl', '_with_features.pkl')
    with open(updated_graph_path, 'wb') as f:
        pickle.dump(hetero_graph, f)


In [22]:
import torch

# 检查CUDA设备是否可用
print("CUDA available:", torch.cuda.is_available())
print("Current device:", torch.cuda.current_device())
print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))

# 检查模型是否在CUDA上
print("Is model on CUDA:", next(resnet50.parameters()).is_cuda)


CUDA available: True
Current device: 0
Device name: NVIDIA GeForce RTX 4060 Ti
Is model on CUDA: True


In [23]:
def check_missing_features(graph):
    missing_features_movies = []
    # 假设特征存储在名为 'features' 的节点數據中
    for movie_id in graph.nodes['movie'].data['movie_id']:
        if 'features' not in graph.nodes['movie'].data or graph.nodes['movie'].data['features'][movie_id].sum() == 0:
            missing_features_movies.append(movie_id.item())

    if missing_features_movies:
        print("Movies without features:", missing_features_movies)
    else:
        print("All movies have features.")

# 假設 hetero_graph 是您已經加載的圖
with open('hetero_graph_with_images.pkl', 'rb') as f:
    hetero_graph = pickle.load(f)

check_missing_features(hetero_graph)


Movies without features: [54995, 57368, 60161, 62956, 64614, 64999, 65514, 66297, 67295, 67799, 67867, 68263, 69526, 70305, 71282, 82095, 61160, 63082, 63992, 68205, 68954, 69122, 69406, 70286, 71535, 72378, 73321, 74789, 76251, 77561, 78209, 79091, 79695, 81562, 81564, 82459, 82461, 84944, 86880, 86911, 87222, 87232, 87430, 88125, 88744, 91500, 91529, 95167, 102125, 102445, 106487, 106489, 106696, 112852, 118696, 125916, 79132, 89085, 92259, 104337, 104841, 104879, 104944, 106766, 106916, 106920, 107069, 107141, 109374, 109673, 110730, 111360, 111622, 111921, 112138, 112290, 112556, 116797, 117511, 117590, 55069, 61236, 61240, 62849, 65596, 66509, 67997, 68194, 69640, 71156, 71379, 71579, 72011, 72733, 73101, 73344, 74787, 77800, 79224, 79463, 80463, 81164, 81817, 81845, 83132, 84116, 84880, 85774, 85881, 86781, 86833, 87192, 87306, 88235, 89118, 89305, 89580, 89753, 89759, 89864, 64839, 66934, 67087, 67255, 68237, 73023, 73323, 74458, 74510, 74545, 78266, 78574, 80549, 80839, 80860, 

In [24]:
# 检查是否存在 'movieimage' 类型的节点以及它们是否有 'features' 属性
if 'movieimage' in hetero_graph.ntypes and 'features' in hetero_graph.nodes['movieimage'].data:
    features_tensor = hetero_graph.nodes['movieimage'].data['features']
    print("Features are stored in the graph under 'movieimage' nodes.")
    print(f"Shape of features: {features_tensor.shape}")
    print("Sample features:", features_tensor[:5])  # 显示前5个特征向量的样例
else:
    print("Features have not been added to the graph.")


Features are stored in the graph under 'movieimage' nodes.
Shape of features: torch.Size([243, 2048])
Sample features: tensor([[0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0000],
        [0.1471, 0.5938, 0.7490,  ..., 0.1158, 0.3637, 0.1672],
        [0.2117, 0.5229, 0.4258,  ..., 0.2200, 0.3178, 0.2677],
        [0.4625, 0.3683, 0.3002,  ..., 0.1502, 0.2949, 0.2470],
        [0.1799, 0.4742, 0.3313,  ..., 0.1847, 0.3477, 0.1577]])


# 文字的特徵提取

In [25]:
import pandas as pd
import os
import pickle
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from sentence_transformers import SentenceTransformer
import torch

In [None]:
from pydub import AudioSegment

def enhance_audio(audio_path):
    sound = AudioSegment.from_file(audio_path)
    # 增加音量
    louder = sound + 10  # 增加10dB
    enhanced_audio_path = "enhanced_" + audio_path
    louder.export(enhanced_audio_path, format="wav")
    return enhanced_audio_path


In [None]:


# 加載電影ID到YouTube ID的映射
def load_youtube_to_movie_mapping(filepath):
    df = pd.read_csv(filepath)
    movie_to_youtube = pd.Series(df.youtubeId.values, index=df.movieId.astype(str)).to_dict()
    return movie_to_youtube

# 提取音頻並轉為文本
def extract_audio_to_text(video_path, lang='en-US'):
    try:
        clip = VideoFileClip(video_path)
        audio_path = "temp_audio.wav"
        clip.audio.write_audiofile(audio_path)
        r = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            audio = r.record(source)
        text = r.recognize_google(audio, language=lang)
        os.remove(audio_path)  # 清理暫存檔案
        clip.close()
        return text
    except Exception as e:
        print(f"Error processing {video_path}: {str(e)}")
        return None

# 轉換文本為向量
def text_to_vector(text):
    model = SentenceTransformer('all-MiniLM-L6-v2')
    embedding = model.encode(text)
    return embedding



def add_text_features_to_graph(g, movie_id, text_features):
    # 确保 movietext 节点存在并有正确的特征维度初始化
    if 'features' not in g.nodes['movietext'].data:
        num_movies = g.number_of_nodes('movietext')
        num_features = len(text_features)  # 假设所有特征向量维度相同
        g.nodes['movietext'].data['features'] = torch.zeros((num_movies, num_features), dtype=torch.float32)
    
    # 确定索引并添加特征
    idx = g.nodes['movie'].data['movie_id'].tolist().index(int(movie_id))
    g.nodes['movietext'].data['features'][idx] = torch.tensor(text_features, dtype=torch.float32)


# 載入含影像特徵的圖
def load_graph_with_images(graph_path):
    with open(graph_path, 'rb') as f:
        graph = pickle.load(f)
    return graph

# def process_videos(graph, mapping_file, video_folder):
#     movie_to_youtube = load_youtube_to_movie_mapping(mapping_file)
#     movie_ids = graph.nodes['movie'].data['movie_id'].tolist()
#     print(f"Total movies in graph: {len(movie_ids)}")  # 打印圖中電影總數

#     processed_count = 0  # 計數器，追蹤處理了多少視頻

#     for movie_id in movie_ids:
#         youtube_id = movie_to_youtube.get(str(movie_id))
#         if youtube_id:
#             video_path = os.path.join(video_folder, f"{youtube_id}.mp4")
#             if os.path.exists(video_path):
#                 print(f"Processing video for movie ID {movie_id}: {video_path}")  # 打印正在處理的視頻路徑
#                 text = extract_audio_to_text(video_path)
#                 if text:
#                     print(f"Extracted text for movie ID {movie_id}: {text[:30]}...")  # 打印提取的文本概要
#                     text_vector = text_to_vector(text)
#                     add_text_features_to_graph(graph, movie_id, text_vector)
#                     processed_count += 1
#                 else:
#                     print(f"No text extracted for movie ID {movie_id}.")
#             else:
#                 print(f"Video file not found: {video_path}")
#         else:
#             print(f"No YouTube ID found for movie ID {movie_id}.")

#     print(f"Total processed videos: {processed_count}")  # 打印處理的視頻數量
def process_videos(graph, mapping_file, video_folder):
    movie_to_youtube = load_youtube_to_movie_mapping(mapping_file)
    
    # 收集每个用户的前10个互动电影
    user_movie_edges = graph.edges(etype=('user', 'rates', 'movie'))
    top_movies_per_user = {}
    for user_id, movie_id in zip(user_movie_edges[0].numpy(), user_movie_edges[1].numpy()):
        if user_id not in top_movies_per_user:
            top_movies_per_user[user_id] = []
        if len(top_movies_per_user[user_id]) < 10:
            top_movies_per_user[user_id].append(movie_id)

    # 将列表扁平化，获取所有需要处理的电影ID
    top_movie_ids = set([movie_id for movie_list in top_movies_per_user.values() for movie_id in movie_list])

    processed_count = 0
    for movie_id in top_movie_ids:
        actual_movie_id = graph.nodes['movie'].data['movie_id'][movie_id].item()  # 获取实际的movie_id
        youtube_id = movie_to_youtube.get(str(actual_movie_id))
        if youtube_id:
            video_path = os.path.join(video_folder, f"{youtube_id}.mp4")
            if os.path.exists(video_path):
                print(f"Processing video for movie ID {actual_movie_id}: {video_path}")
                text = extract_audio_to_text(video_path)
                if text:
                    text_vector = text_to_vector(text)
                    add_text_features_to_graph(graph, actual_movie_id, text_vector)
                    processed_count += 1
                else:
                    print(f"No text extracted for movie ID {actual_movie_id}.")
            else:
                print(f"Video file not found: {video_path}")
        else:
            print(f"No YouTube ID found for movie ID {actual_movie_id}.")

    print(f"Total processed videos: {processed_count}")

# 繼續使用前面的設置和調用程式碼


# 讀取圖並處理文本
graph_path = 'hetero_graph_with_images.pkl'
video_folder = r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\data\videos'
mapping_file = r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\data\ml-youtube_cleaned.csv'
hetero_graph = load_graph_with_images(graph_path)

process_videos(hetero_graph, mapping_file, video_folder)

# 保存更新后的圖像和文本特徵圖
with open('hetero_graph_with_images_text.pkl', 'wb') as f:
    pickle.dump(hetero_graph, f)


In [2]:
import os

def check_non_mp4_files(directory):
    non_mp4_files = [file for file in os.listdir(directory) if not file.endswith('.mp4')]
    return non_mp4_files

# 指定要檢查的文件夾路徑
video_folder = r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\data\videos'
non_mp4_files = check_non_mp4_files(video_folder)

if non_mp4_files:
    print("Non-MP4 files found:")
    for file in non_mp4_files:
        print(file)
else:
    print("No non-MP4 files found in the directory.")


Non-MP4 files found:
EAPy76vxF5s.f251.webm.part
N07qVsND8p4.f244.webm.part
pTbIu8Zeqp0.f251.webm.part


In [31]:
import torch

# 假设 hetero_graph 是你的图变量
if 'movietext' in hetero_graph.ntypes:
    if 'features' in hetero_graph.nodes['movietext'].data:
        features = hetero_graph.nodes['movietext'].data['features']
        print("Features are available in 'movietext' nodes.")
        print(f"Shape of the features: {features.shape}")
    else:
        print("No features data found in 'movietext' nodes.")
else:
    print("No 'movietext' node type found in the graph.")
if 'features' in hetero_graph.nodes['movietext'].data:
    features = hetero_graph.nodes['movietext'].data['features']
    if features.shape[1] == 384:  # 假设每个特征向量长度为384
        print("Features have the correct dimension.")
    else:
        print(f"Incorrect dimension of features: {features.shape[1]}")
if 'features' in hetero_graph.nodes['movietext'].data:
    features = hetero_graph.nodes['movietext'].data['features']
    print(f"Average of features: {torch.mean(features)}")
    print(f"Non-zero elements in features: {torch.count_nonzero(features)}")
# 保存图
with open('hetero_graph_with_images_text.pkl', 'wb') as f:
    pickle.dump(hetero_graph, f)

# 加载图
with open('hetero_graph_with_images_text.pkl', 'rb') as f:
    loaded_graph = pickle.load(f)


Features are available in 'movietext' nodes.
Shape of the features: torch.Size([243, 384])
Features have the correct dimension.
Average of features: 2.932727693405468e-05
Non-zero elements in features: 18816


# 聲音的特徵提取

In [1]:
import torch
import pickle

# 检查音频特征是否正确加入图中
def check_audio_features(graph):
    if 'movieaudio' in graph.ntypes:
        if 'features' in graph.nodes['movieaudio'].data:
            features = graph.nodes['movieaudio'].data['features']
            print("Audio features are available in 'movieaudio' nodes.")
            print(f"Shape of the features: {features.shape}")
            # 假设你知道音频特征向量的预期长度
            expected_feature_length = 128  # 假设音频特征长度为128
            if features.shape[1] == expected_feature_length:
                print("Features have the correct dimension.")
            else:
                print(f"Incorrect dimension of features: {features.shape[1]}")
            print(f"Average of features: {torch.mean(features)}")
            print(f"Non-zero elements in features: {torch.count_nonzero(features)}")
        else:
            print("No features data found in 'movieaudio' nodes.")
    else:
        print("No 'movieaudio' node type found in the graph.")

# 保存图
def save_graph(graph, filename):
    with open(filename, 'wb') as f:
        pickle.dump(graph, f)
    print(f"Graph saved to {filename}")

# 加载图
def load_graph(filename):
    with open(filename, 'rb') as f:
        return pickle.load(f)

# 使用以上定义的函数
filename = 'hetero_graph_with_images_text_audio.pkl'
save_graph(hetero_graph, filename)
loaded_graph = load_graph(filename)
check_audio_features(loaded_graph)


KeyboardInterrupt: 

In [50]:
if hetero_graph is not None:
    save_graph(hetero_graph, filename)
    print("图已被确认非空并尝试保存。")
else:
    print("图是空的，保存操作被跳过。")
# 保存图
save_graph(hetero_graph, filename)

# 立即尝试加载图
loaded_graph = load_graph(filename)

if loaded_graph is not None:
    print("图成功加载。")
    check_audio_features(loaded_graph)
else:
    print("加载图失败，对象为空。")
import pickle

def save_graph(graph, filename):
    try:
        with open(filename, 'wb') as f:
            pickle.dump(graph, f)
        print(f"图已保存到 {filename}")
    except Exception as e:
        print(f"保存图时发生错误: {e}")

def load_graph(filename):
    try:
        with open(filename, 'rb') as f:
            return pickle.load(f)
    except Exception as e:
        print(f"加载图时发生错误: {e}")
        return None


None
<class 'NoneType'>


In [3]:
import pandas as pd
import os
import pickle
import torch
from moviepy.editor import VideoFileClip
import speech_recognition as sr
from sentence_transformers import SentenceTransformer

def load_youtube_to_movie_mapping(filepath):
    df = pd.read_csv(filepath)
    movie_to_youtube = pd.Series(df.youtubeId.values, index=df.movieId.astype(str)).to_dict()
    return movie_to_youtube

def extract_audio_to_text(video_path, lang='en-US', start_sec=10, end_sec=60):
    try:
        clip = VideoFileClip(video_path).subclip(start_sec, end_sec)
        audio_path = "temp_audio.wav"
        clip.audio.write_audiofile(audio_path)
        r = sr.Recognizer()
        with sr.AudioFile(audio_path) as source:
            audio = r.record(source)
        text = r.recognize_google(audio, language=lang)
        os.remove(audio_path)
        clip.close()
        return text
    except Exception as e:
        print(f"Error processing {video_path}: {str(e)}")
        return None

def text_to_vector_or_zero(text, vector_length=384):
    if text:
        model = SentenceTransformer('all-MiniLM-L6-v2')
        return model.encode(text)
    else:
        return torch.zeros(vector_length)

def add_text_features_to_graph(g, movie_id, text_features):
    if 'features' not in g.nodes['movietext'].data:
        num_movies = g.number_of_nodes('movietext')
        num_features = text_features.shape[0] if text_features is not None else 384
        g.nodes['movietext'].data['features'] = torch.zeros((num_movies, num_features), dtype=torch.float32)
    idx = g.nodes['movie'].data['movie_id'].tolist().index(int(movie_id))
    g.nodes['movietext'].data['features'][idx] = torch.tensor(text_features, dtype=torch.float32)

def process_videos(graph, mapping_file, video_folder, output_file):
    movie_to_youtube = load_youtube_to_movie_mapping(mapping_file)
    all_movie_ids = set([mid.item() for mid in graph.nodes['movie'].data['movie_id']])
    processed_count = 0
    problematic_movies = []

    for movie_id in all_movie_ids:
        youtube_id = movie_to_youtube.get(str(movie_id))
        if youtube_id:
            video_path = os.path.join(video_folder, f"{youtube_id}.mp4")
            if os.path.exists(video_path):
                text = extract_audio_to_text(video_path)
                text_vector = text_to_vector_or_zero(text)
                add_text_features_to_graph(graph, movie_id, text_vector)
                processed_count += 1
                if text is None:
                    problematic_movies.append(movie_id)
                    print(f"No text extracted for movie ID {movie_id}.")
            else:
                problematic_movies.append(movie_id)
                print(f"Video file not found: {video_path}")
        else:
            problematic_movies.append(movie_id)
            print(f"No YouTube ID found for movie ID {movie_id}.")

    print(f"Total processed videos: {processed_count}")
    with open(output_file, 'w') as f:
        for movie_id in problematic_movies:
            f.write(f"{movie_id}\n")

# File paths
video_folder = r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\data\videos'
mapping_file = r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\data\ml-youtube_cleaned.csv'

# Process each graph
graphs_to_process = [
    r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph03.pkl',
    r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph05.pkl',
    r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph08.pkl',
    r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph03_with_features.pkl',
    r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph05_with_features.pkl',
    r'D:\CODE\multi-model knowledge graph multi-graph recommendation system\code\mainmodel\hetero_graph08_with_features.pkl'
]

for graph_path in graphs_to_process:
    graph = pickle.load(open(graph_path, 'rb'))
    output_file = graph_path.replace('.pkl', '_problematic_movies.txt')
    process_videos(graph, mapping_file, video_folder, output_file)
    updated_graph_path = graph_path.replace('.pkl', '_with_text_features.pkl')
    with open(updated_graph_path, 'wb') as f:
        pickle.dump(graph, f)


NameError: name '_C' is not defined