In [13]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification
import pandas as pd

In [14]:
# 加载模型和tokenizer
model_path = "./bert_sentiment_model.pth"  # 替换为您的模型文件路径
tokenizer = BertTokenizer.from_pretrained("./models/bert-base-uncased")  # 替换为训练时使用的预训练模型名称
model = BertForSequenceClassification.from_pretrained("./models/bert-base-uncased", num_labels=5)  # 修改num_labels为训练时的类别数量

# 加载权重到模型中
model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))
model.eval()  # 确保模型处于推理模式

# 定义情感分析函数
def predict_sentiment(text):
    inputs = tokenizer(
        text, 
        return_tensors="pt", 
        truncation=True, 
        max_length=128, 
        padding="max_length"
    )
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=-1).item()
    return predicted_class

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./models/bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  model.load_state_dict(torch.load(model_path, map_location=torch.device('cpu')))


In [15]:
df = pd.read_csv('./Data/enriched_movies.csv')
df.head

<bound method NDFrame.head of        movie_id                                   title  year  \
0             8  Edison Kinetoscopic Record of a Sneeze  1894   
1            10            La sortie des usines Lumière  1895   
2            12                  The Arrival of a Train  1896   
3            91                     Le manoir du diable  1896   
4           131                       Une nuit terrible  1896   
...         ...                                     ...   ...   
35698  15417330                                  Grudge  2021   
35699  15469820                       Britney vs Spears  2021   
35700  15655276                                    Yara  2021   
35701  15831978                                    Cash  2021   
35702  15839820                                  Sompoy  2021   

                              genres  tmdb_id  \
0           ['Documentary', 'Short']   105158   
1           ['Documentary', 'Short']      774   
2           ['Documentary', 'Short']      

In [20]:
review = df[['movie_id', 'reviews']]
review['reviews'] = review['reviews'].fillna('')
review.head

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  review['reviews'] = review['reviews'].fillna('')


<bound method NDFrame.head of        movie_id                                            reviews
0             8  There is just something magical being able to ...
1            10                                                   
2            12  This famous film was ACTUALLY MADE IN 1897, Lu...
3            91  According to Wikipedia in August of 2011, Le M...
4           131  Georges Méliès does it again in the same fashi...
...         ...                                                ...
35698  15417330                                                   
35699  15469820  Britney vs Spears has 'first world problem' wr...
35700  15655276                                                   
35701  15831978                                                   
35702  15839820                                                   

[35703 rows x 2 columns]>

In [23]:
def calculate_average_sentiment(dataframe):
    results = []
    for movie_id, reviews in dataframe.itertuples(index=False):
        # 分隔评论
        review_list = reviews.split('||')
        # 对每条评论预测情感分数
        sentiments = [predict_sentiment(review.strip()) for review in review_list]
        # 计算平均情感分
        average_sentiment = sum(sentiments) / len(sentiments) if sentiments else 4
        # 记录结果
        results.append({'movie_id': movie_id, 'average_sentiment': average_sentiment})

    # 返回结果数据框
    return pd.DataFrame(results)

# 示例调用
result_df = calculate_average_sentiment(review)
result_df.head

<bound method NDFrame.head of        movie_id  average_sentiment
0             8                3.0
1            10                2.0
2            12                3.0
3            91                2.5
4           131                2.0
...         ...                ...
35698  15417330                2.0
35699  15469820                0.0
35700  15655276                2.0
35701  15831978                2.0
35702  15839820                2.0

[35703 rows x 2 columns]>

In [24]:
result_df.to_csv('./sentiment-ave-result.csv')