In [1]:
import pandas as pd

# Load the exported CSV (point file_path at the actual export file).
file_path = 'wandb_export_2024-09-16T15_51_59.893+08_00.csv'
df = pd.read_csv(file_path)

# Report how many rows the export contains.
total_rows = df.shape[0]
print(f"文件总行数: {total_rows}")

# Spot-check a single user: keep only the rows whose user_id is 22514.
filtered_df = df.loc[df['user_id'] == 22514]

# Show the matching rows.
print(filtered_df)


文件总行数: 3001
   user_id   status                                     assistant_json
0    22514   failed                                                NaN
1    22514  success  {"reason": "The individual's trajectory shows ...


In [2]:
# Keep only the rows whose status is something other than 'failed'.
df = df.loc[df['status'].ne('failed')]

In [3]:
# Load the export holding the corrected rows (adjust the path as needed).
fix_df = pd.read_csv('wandb_export_2024-09-17T00_51_34.044+08_00.csv')

# Users whose rows get swapped out. The list is spelled out explicitly
# rather than derived from fix_df['user_id'].unique().
replace_user_ids = [22072, 22396, 22494, 23067]

# Remove those users from the main table and report the remaining row count.
df = df.loc[~df['user_id'].isin(replace_user_ids)]
print(len(df))

# From the correction export keep only successful rows for those same users.
fix_df = fix_df[
    (fix_df['status'] == 'success') & fix_df['user_id'].isin(replace_user_ids)
]

# Append the replacement rows to the main table.
df = pd.concat([df, fix_df], ignore_index=True)

# Order by user_id, then renumber the index from zero.
df_sorted = df.sort_values(by='user_id')
df_sorted = df_sorted.reset_index(drop=True)

2996


In [4]:
# Report the row count of the sorted table.
total_rows = df_sorted.shape[0]
print(f"压缩后的文件总行数: {total_rows}")

# Preview the first rows, preceded by a heading line.
print("前几行内容:", df_sorted.head(), sep="\n")

压缩后的文件总行数: 3000
前几行内容:
   user_id   status                                     assistant_json
0    22000  success  {"reason": "The individual's trajectory shows ...
1    22001  success  {"reason": "The individual's trajectory shows ...
2    22002  success  {"reason": "The individual's trajectory shows ...
3    22003  success  {"reason": "The individual's trajectory shows ...
4    22004  success  {"reason": "The individual's trajectory shows ...


In [5]:
import json

def parse_json_column(json_str):
    """Decode one cell of the ``assistant_json`` column.

    Args:
        json_str: JSON text, or a missing value (``None`` or float NaN, the
            value pandas shows for an empty ``assistant_json`` cell).

    Returns:
        The decoded Python object, or ``None`` when the cell is missing.

    Raises:
        json.JSONDecodeError: If the text is present but is not valid JSON.
        TypeError: If the value is neither JSON text nor a missing value.
    """
    # NaN is the only float that does not compare equal to itself; testing it
    # this way detects missing cells without needing an extra import.
    is_missing = json_str is None or (
        isinstance(json_str, float) and json_str != json_str
    )
    if is_missing:
        return None
    return json.loads(json_str)

# Decode every assistant_json cell and store the result in a new column.
df_sorted['parsed_json'] = df_sorted['assistant_json'].map(parse_json_column)

In [6]:
# Pull the 'prediction' entry out of each parsed object; a falsy parsed value
# (e.g. None or an empty dict) yields None instead.
df_sorted['prediction'] = df_sorted['parsed_json'].map(
    lambda parsed: parsed['prediction'] if parsed else None
)

# Preview the extracted column next to the identifying fields.
print(
    "提取后的前几行内容:",
    df_sorted[['user_id', 'status', 'prediction']].head(),
    sep="\n",
)

# Report the row count after extraction (nothing is written to disk here).
total_rows = df_sorted.shape[0]
print(f"压缩后的文件总行数: {total_rows}")

提取后的前几行内容:
   user_id   status                                         prediction
0    22000  success  [[60, 2, 70, 107], [60, 16, 71, 107], [60, 17,...
1    22001  success  [[60, 9, 71, 60], [60, 14, 71, 60], [60, 41, 7...
2    22002  success  [[60, 13, 46, 139], [60, 14, 57, 153], [60, 15...
3    22003  success  [[60, 2, 174, 8], [60, 6, 174, 9], [60, 8, 174...
4    22004  success  [[60, 0, 51, 114], [60, 9, 51, 115], [60, 10, ...
压缩后的文件总行数: 3000


In [7]:
# Flatten the per-user prediction lists into one output row per prediction.
# Each prediction is already a Python list (decoded from JSON earlier), so no
# further parsing happens here.
expanded_data = []
skipped_user_ids = []

# Walk only the two columns that are needed; zip avoids the per-row Series
# that DataFrame.iterrows() would build for every record.
for user_id, predictions in zip(df_sorted['user_id'], df_sorted['prediction']):
    # The 'prediction' column holds None when the parsed JSON was empty;
    # record such users instead of failing on a non-iterable value.
    if not predictions:
        skipped_user_ids.append(user_id)
        continue
    # Prefix every prediction with its user_id to form one flat record.
    expanded_data.extend([user_id] + pred for pred in predictions)

# Make missing users visible, since they will be absent from the export.
if skipped_user_ids:
    print(f"警告: 以下 user_id 没有预测结果, 已跳过: {skipped_user_ids}")

# Build the long-format table from the flat records.
expanded_df = pd.DataFrame(expanded_data, columns=['user_id', 'd', 't', 'x', 'y'])

# Preview the result and report its size.
print(expanded_df.head())
total_rows = len(expanded_df)
print(f"文件总行数: {total_rows}")

# Count how many distinct users ended up in the expanded table.
unique_users = expanded_df['user_id'].nunique()
print(f"展开后的数据包含 {unique_users} 个不同的用户。")

# Write the table as a gzip-compressed CSV without index or header row.
expanded_df.to_csv('cityB.csv.gz', compression='gzip', index=False, header=False)

   user_id   d   t   x    y
0    22000  60   2  70  107
1    22000  60  16  71  107
2    22000  60  17  76  101
3    22000  60  24  77  101
4    22000  60  26  77  101
文件总行数: 462816
展开后的数据包含 3000 个不同的用户。
