In [2]:
import os
import pandas as pd
import numpy as np
from PIL import Image

In [3]:
def process_text(text):
    """Strip hashtag markers, links and @-mentions from a piece of text.

    Every '#' character is removed (the word after it is kept), the text is
    split into whitespace-separated tokens, and tokens that contain 'http'
    or start with '@' are dropped. The surviving tokens are joined with
    single spaces.

    Args:
        text: Raw text as a ``str``.

    Returns:
        The cleaned text as a single-line ``str`` (empty if nothing survives).
    """
    # split() with no separator breaks on *any* whitespace run. Splitting on
    # a literal " " left newlines/tabs glued to tokens, so a trailing "\n"
    # leaked into the output and tokens such as "day\n@user" or
    # "word\thttp://..." were filtered incorrectly.
    tokens = text.replace("#", "").split()
    return " ".join(
        token for token in tokens
        if 'http' not in token and not token.startswith('@')
    )

In [4]:
# Directory holding the per-sample files, named "<guid>.txt" / "<guid>.jpg".
data_dir = "./data/data"
def process_train_row(row):
    """Build the text/label record for one row of the training index.

    Only the text file of the sample is read. The sample's image is not
    opened here: nothing derived from it is part of the returned record,
    so decoding and resizing it would be wasted work.

    Args:
        row: Mapping-like row exposing 'guid' and 'tag' (a DataFrame row
            when used with ``DataFrame.apply(..., axis=1)``).

    Returns:
        dict with keys 'guid' (int), 'text' (output of ``process_text``)
        and 'tag' (integer looked up in ``tag_to_num``).

    Raises:
        KeyError: if the row's tag is not a key of ``tag_to_num``.
        OSError: if the text file for this guid cannot be opened.
    """
    # int() keeps a float-typed guid from rendering as e.g. "12.0" in the file name.
    guid = int(row['guid'])
    tag = row['tag']
    txt_path = os.path.join(data_dir, f"{guid}.txt")

    # The text files are decoded as gb18030.
    with open(txt_path, encoding='gb18030') as f:
        text = f.read()

    return {'guid': guid, 'text': process_text(text), 'tag': tag_to_num[tag]}

In [5]:
def process_test_row(row):
    """Build the text record for one row of the unlabeled test index.

    Only the text file of the sample is read. The sample's image is not
    opened here: nothing derived from it is part of the returned record,
    so decoding and resizing it would be wasted work.

    Args:
        row: Mapping-like row exposing 'guid' (a DataFrame row when used
            with ``DataFrame.apply(..., axis=1)``).

    Returns:
        dict with keys 'guid' (int) and 'text' (output of ``process_text``).

    Raises:
        OSError: if the text file for this guid cannot be opened.
    """
    # Cast to int so that no decimal point ends up in the file name.
    guid = int(row['guid'])
    txt_path = os.path.join(data_dir, f"{guid}.txt")

    # The encoding must be given explicitly as gb18030; the original author
    # notes that reading fails with an error otherwise.
    with open(txt_path, encoding='gb18030') as f:
        text = f.read()

    return {'guid': guid, 'text': process_text(text)}

In [6]:
# Integer class id for each sentiment tag string (looked up by process_train_row).
tag_to_num = dict(positive=2, neutral=1, negative=0)

In [7]:
# Load the two index tables. The row processors read a 'guid' column from
# both, and additionally a 'tag' column from the training table.
train_dataframe, test_dataframe = (
    pd.read_csv(index_path)
    for index_path in ("./data/train.txt", "./data/test_without_label.txt")
)

In [8]:
# Run the row processors over each index table, one record (dict) per row.
# Every call reads that sample's text file from disk.
train_data_list = train_dataframe.apply(process_train_row, axis="columns").tolist()
test_data_list = test_dataframe.apply(process_test_row, axis="columns").tolist()

# The keys of the records become the columns of the output tables.
df1 = pd.DataFrame(data=train_data_list)
df2 = pd.DataFrame(data=test_data_list)

# Write both tables as CSV without the positional index column.
df1.to_csv(path_or_buf="./data/text_tag_train.csv", index=False)
df2.to_csv(path_or_buf="./data/text_test.csv", index=False)