# Sentiment Analysis

## 提取Utterance和Sentiment列

In [7]:
import pandas as pd

# 读取CSV文件
file_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/train_split.csv'
df = pd.read_csv(file_path)

# 提取Utterance和Sentiment列
extracted_df = df[['Utterance', 'Sentiment']]

# 保存提取的列到新的CSV文件
output_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Sentiment_Analysis.csv'
extracted_df.to_csv(output_path, index=False)

print(f"提取的列已保存到 {output_path}")

提取的列已保存到 E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Sentiment_Analysis.csv


## 将csv文件转化为jsonw文件，其中instruction为utterance，out为sentiment。

In [8]:
import json

# 转换为JSONW格式
jsonw_data = []
for index, row in extracted_df.iterrows():
    jsonw_data.append({
        "instruction":"请判断以下话语的情感极性，是正面、负面还是中性？",
        "input": row['Utterance'],
        "output": row['Sentiment']
    })

# 保存为JSONW文件
output_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Sentiment Analysis.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(jsonw_data, f, ensure_ascii=False, indent=4)

print(f"转换后的JSONW文件已保存到 {output_path}")

转换后的JSONW文件已保存到 E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Sentiment Analysis.json


# Emotion Recognition

In [2]:
import pandas as pd

# 读取CSV文件
file_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/train_split.csv'
df = pd.read_csv(file_path)

# 提取Utterance和Sentiment列
extracted_df = df[['Utterance', 'Emotion']]

# 保存提取的列到新的CSV文件
output_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Emotion Recognition.csv'
extracted_df.to_csv(output_path, index=False)

print(f"提取的列已保存到 {output_path}")

提取的列已保存到 E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Emotion Recognition.csv


In [3]:
import pandas as pd

# 读取CSV文件
file_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Emotion Recognition.csv'
df = pd.read_csv(file_path, header=None, names=['Utterance', 'Emotion'])

# 提取Emotion列并去重
unique_emotions = df['Emotion'].unique()

# 输出Emotion的值列表
print("Emotion的唯一值列表:")
for emotion in unique_emotions:
    print(emotion)

Emotion的唯一值列表:
Emotion
neutral
relaxed
anger
depress
disgust
negative-other
worried
fear
happy
astonished
grateful
positive-other
sadness


In [5]:
import json

# 转换为JSONW格式
jsonw_data = []
for index, row in extracted_df.iterrows():
    jsonw_data.append({
        "instruction":"请判断以下话语的情绪，从以下情绪中选择一个进行回答：neutral,relaxed,anger,depres,disgust,worried,fear,happy,astonished,grateful,sadness。",
        "input": row['Utterance'],
        "output": row['Emotion']
    })

# 保存为JSONW文件
output_path = 'E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Emotion Recognition.json'
with open(output_path, 'w', encoding='utf-8') as f:
    json.dump(jsonw_data, f, ensure_ascii=False, indent=4)

print(f"转换后的JSONW文件已保存到 {output_path}")

转换后的JSONW文件已保存到 E:/Study/Course/Enter-Shakespeare-s-play-world/data/Emotion Recognition and Sentiment Analysis/CPED/Emotion Recognition.json


## Short version

In [11]:
# 按类别均匀抽样
sampled_df = df.groupby('Emotion').apply(lambda x: x.sample(n=min(1000 // len(df['Emotion'].unique()), len(x)), random_state=42)).reset_index(drop=True)
sampled_df.to_csv('ER_1000.csv', index=False)

In [12]:
# 按类别均匀抽样
sampled_df = df.groupby('Sentiment').apply(lambda x: x.sample(n=min(1000 // len(df['Sentiment'].unique()), len(x)), random_state=42)).reset_index(drop=True)
sampled_df.to_csv('SA_1000.csv', index=False)

# 构建验证集

In [8]:
import pandas as pd
import json

# 读取CSV文件
file_path = 'data/Emotion Recognition and Sentiment Analysis/CPED/test_split.csv'
df = pd.read_csv(file_path)

# 提取Utterance和Sentiment列
Sentiment_df = df[['Utterance', 'Sentiment']].head(2000)
# 提取Utterance和Emotion列
Emotion_df = df[['Utterance', 'Emotion']].head(2000)

# 转换为JSONL格式
output_path1 = 'data/Emotion Recognition and Sentiment Analysis/CPED/test_ER.jsonl'
with open(output_path1, 'w', encoding='utf-8') as f:
    for index, row in Emotion_df.iterrows():
        json_record = json.dumps({"input": row['Utterance'], "target": row['Emotion']}, ensure_ascii=False)
        f.write(json_record + '\n')
print(f"转换后的ER文件已保存到 {output_path1}")

output_path2 = 'data/Emotion Recognition and Sentiment Analysis/CPED/test_SA.jsonl'
with open(output_path2, 'w', encoding='utf-8') as f:
    for index, row in Sentiment_df.iterrows():
        json_record = json.dumps({"input": row['Utterance'], "target": row['Sentiment']}, ensure_ascii=False)
        f.write(json_record + '\n')
print(f"转换后的SA文件已保存到 {output_path2}")

转换后的ER文件已保存到 data/Emotion Recognition and Sentiment Analysis/CPED/test_ER.jsonl
转换后的SA文件已保存到 data/Emotion Recognition and Sentiment Analysis/CPED/test_SA.jsonl
