In [None]:
import pandas as pd
import json
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
import nltk
from sklearn.tree import DecisionTreeClassifier
#import keras
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import keras
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout

In [None]:
# Load the tweets: the file is read as line-delimited JSON (one object per line).
tweets_data = []
# Decode explicitly as UTF-8 so the result does not depend on the platform's
# default text encoding.
with open('tweets_DM.json', 'r', encoding='utf-8') as file:
    for line in file:
        # A blank line (e.g. a trailing newline at end of file) is not valid
        # JSON and would make json.loads raise, so skip it.
        if not line.strip():
            continue
        tweet = json.loads(line)
        # Pull out the nested `_source.tweet` object; fall back to an empty
        # dict when either level is missing.
        tweet_data = tweet.get('_source', {}).get('tweet', {})
        tweets_data.append(tweet_data)

# Build a DataFrame with one row per tweet object.
tweets_df = pd.DataFrame(tweets_data)

# Preview the first rows.
print(tweets_df.head())


In [None]:
# Read the two lookup tables that accompany the tweets.
emotion = pd.read_csv('emotion.csv')
data_identification = pd.read_csv('data_identification.csv')

# Attach the emotion label to each tweet (left join keeps every tweet).
tweets_with_emotion = tweets_df.merge(emotion, how='left', on='tweet_id')

# Attach the train/test identification flag the same way.
complete_data = tweets_with_emotion.merge(
    data_identification,
    how='left',
    on='tweet_id',
)

# Preview the merged frame.
print(complete_data.head())


In [None]:


# Clean the raw tweet text
def clean_text(text):
    """Remove URLs and punctuation from a tweet.

    Parameters
    ----------
    text : str
        Raw tweet text. Any non-string value (for example NaN or None coming
        from a missing DataFrame cell) is treated as an empty tweet.

    Returns
    -------
    str
        The text with ``http...`` URLs removed, every character other than
        word characters, whitespace, ``#`` and ``@`` removed, and leading and
        trailing whitespace stripped.
    """
    # Missing values reach this function as float NaN / None when it is applied
    # to a DataFrame column; re.sub would raise TypeError on them.
    if not isinstance(text, str):
        return ""
    text = re.sub(r"http\S+", "", text)  # drop URLs
    text = re.sub(r"[^\w\s#@]", "", text)  # keep word chars, whitespace, # and @
    return text.strip()

# Clean every tweet's text, then discard the hashtags column.
complete_data['text'] = complete_data['text'].apply(clean_text)
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone and a plain drop would raise KeyError.
complete_data = complete_data.drop(columns=['hashtags'], errors='ignore')

In [None]:
# Show the merged, cleaned frame as this cell's output for a visual check.
complete_data

In [None]:
# Partition the merged frame by its 'identification' flag.
train_df = complete_data.loc[complete_data['identification'].eq('train')]
test_df = complete_data.loc[complete_data['identification'].eq('test')]
# Print both partitions, one after the other.
print(train_df, test_df, sep='\n')

In [None]:
# Bar chart: percentage of training tweets carrying each emotion label
labels = train_df['emotion'].unique()
post_total = len(train_df)
# Count the tweets per emotion, then express each count as a percentage of
# all training rows, rounded to three decimals.
df1 = (
    train_df.groupby(['emotion'])['text']
    .count()
    .apply(lambda n_posts: round(n_posts * 100 / post_total, 3))
)
# Draw on the explicit Axes object
fig, ax = plt.subplots(figsize=(10, 3))
ax.bar(df1.index, df1.values)
# Labels, title and grid
ax.set_ylabel('% of instances')
ax.set_xlabel('Emotion')
ax.set_title('Emotion distribution')
ax.grid(True)
plt.show()


In [None]:
# Build the bag-of-words analyzer.
# Despite the "_500" suffix in the variable names, the vocabulary is capped at
# the 3000 most frequent tokens (max_features=3000).
# NOTE(review): nltk.word_tokenize relies on NLTK tokenizer data being
# installed in the environment -- confirm it is available before running.
BOW_500 = CountVectorizer(max_features=3000, tokenizer=nltk.word_tokenize)
# Learn the vocabulary and vectorize the training text in a single pass;
# a separate fit() followed by transform() would tokenize the corpus twice.
train_data_BOW_features_500 = BOW_500.fit_transform(train_df['text'])
## check dimension
train_data_BOW_features_500.shape

In [None]:
# A classifier needs features and labels for both the training and test sets.
# Vectorize the text of each partition with the fitted bag-of-words analyzer.
X_train, X_test = (
    BOW_500.transform(frame['text']) for frame in (train_df, test_df)
)
y_train, y_test = train_df['emotion'], test_df['emotion']
# Print every shape as a quick sanity check.
for _label, _matrix in (
    ('X_train.shape: ', X_train),
    ('y_train.shape: ', y_train),
    ('X_test.shape: ', X_test),
    ('y_test.shape: ', y_test),
):
    print(_label, _matrix.shape)

In [None]:
## Encode the labels (string -> one-hot)

from sklearn.preprocessing import LabelEncoder

# Fit on the source column instead of on y_train: y_train is overwritten with
# one-hot arrays further down this cell, so fitting on it would fail as soon
# as the cell is run a second time. On a first run both hold the same labels.
emotion_labels = train_df['emotion']

label_encoder = LabelEncoder()
label_encoder.fit(emotion_labels)
print('check label: ', label_encoder.classes_)
print('\n## Before convert')
print('y_train[0:4]:\n', emotion_labels[0:4])
print('\ny_train.shape: ', emotion_labels.shape)

def label_encode(le, labels):
    """Convert raw labels into a one-hot matrix.

    Parameters
    ----------
    le : fitted encoder exposing ``transform`` and ``classes_``
        (the notebook passes a fitted ``LabelEncoder``).
    labels : array-like of raw label values known to ``le``.

    Returns
    -------
    One-hot array with one row per label and one column per class in
    ``le.classes_``.
    """
    enc = le.transform(labels)
    # Pin the width to the encoder's full class list. Without num_classes,
    # to_categorical infers it from the largest index present in `labels`, so a
    # batch that happens to lack the last class would come out narrower.
    return keras.utils.to_categorical(enc, num_classes=len(le.classes_))

def label_decode(le, one_hot_label):
    """Map one-hot (or per-class score) rows back to their original labels.

    Parameters
    ----------
    le : fitted encoder exposing ``inverse_transform``.
    one_hot_label : 2-D array-like, one row per sample and one column per class.

    Returns
    -------
    Whatever ``le.inverse_transform`` returns for the per-row argmax indices.
    """
    # The position of the largest entry in each row is the encoded class index.
    class_indices = np.asarray(one_hot_label).argmax(axis=1)
    return le.inverse_transform(class_indices)

# Encode from the source label column rather than from y_train itself, so this
# statement does not depend on y_train still holding the raw string labels
# (it is replaced by the one-hot matrix right here).
y_train = label_encode(label_encoder, train_df['emotion'])


print('\n\n## After convert')
print('y_train[0:4]:\n', y_train[0:4])
print('\ny_train.shape: ', y_train.shape)



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the training data as a validation set.
# The split is taken from the unsplit sources (the full bag-of-words training
# matrix and the one-hot encoding of the full label column) rather than from
# X_train / y_train. Those two names are overwritten by this very statement,
# so splitting them would shrink the training set again on every re-run of
# the cell.
X_train, X_val, y_train, y_val = train_test_split(
    train_data_BOW_features_500,
    label_encode(label_encoder, train_df['emotion']),
    test_size=0.2,
    random_state=42,
)

# Report the size of each resulting piece.
print("训练集特征大小:", X_train.shape)
print("验证集特征大小:", X_val.shape)
print("训练集目标大小:", y_train.shape)
print("验证集目标大小:", y_val.shape)

In [None]:
# X_train is the feature matrix and y_train the one-hot encoded labels.
input_dim = X_train.shape[1]  # number of input features
output_dim = y_train.shape[1]  # number of output classes

# Feed-forward classifier: three ReLU hidden layers, each followed by dropout,
# and a softmax output layer with one unit per class.
model = Sequential([
    Dense(326, input_dim=input_dim, activation='relu'),  # first hidden layer
    Dropout(0.3),                                        # regularization against overfitting
    Dense(128, activation='relu'),                       # second hidden layer
    Dropout(0.3),
    # Third hidden layer. Only the first layer declares the input size; this
    # layer receives the 128 outputs of the previous one, so it takes no
    # input_dim argument.
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(output_dim, activation='softmax')              # output layer
])

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),  # evaluated after every epoch
    epochs=30,                       # number of passes over the training data
    batch_size=32,                   # samples per gradient update
    verbose=1                        # show training progress
)

In [None]:
# Run the trained network on the test features (one row of softmax scores per tweet).
predictions = model.predict(X_test)

# Index of the highest-scoring class in each row.
predicted_indices = np.argmax(predictions, axis=1)

# Translate class indices back into the original emotion names.
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Peek at the first few predictions.
print("Predicted Labels:", predicted_labels[:10])

In [None]:
# Assemble the submission table: tweet id alongside its predicted emotion.
test_ids = test_df['tweet_id']
fin = test_ids.to_frame(name='id').assign(emotion=predicted_labels)
print(fin.head())
# Write the table to disk without the index column.
fin.to_csv('NN.csv', index=False)