In [1]:
import pandas as pd
import os
from PIL import Image
from torchvision import transforms
from torchvision.datasets import MNIST
from torch.utils.data import DataLoader, random_split, Dataset
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
def load_image(image_path):
    image = Image.open(image_path).convert('RGB')
    return image

In [3]:
def load_data(row):
    data_path = './data'
    guid = int(row['guid'])
    txt_file_path = os.path.join(data_path, f"{guid}.txt")
    img_file_path = os.path.join(data_path, f"{guid}.jpg")

    with open(txt_file_path, 'r', encoding='latin-1') as txt_file:
        txt_content = txt_file.read()

    img_tensor = load_image(img_file_path)

    return pd.Series({'guid': guid, 'txt': txt_content, 'fig': img_tensor, 'tag': row['tag']})

In [4]:
def document_vector(model, doc):
    vec = [model.wv[word] for word in doc if word in model.wv.index_to_key]
    return np.mean(vec, axis=0) if vec else np.zeros(model.vector_size)

In [5]:
train_path = './train.txt' 
df_train = pd.read_csv(train_path, sep=',')
df_train.head()

Unnamed: 0,guid,tag
0,4597,negative
1,26,neutral
2,4383,negative
3,212,positive
4,2626,positive


In [6]:
train_and_valid = df_train.apply(load_data, axis=1)
label_encoder = LabelEncoder()
train_and_valid['label'] = label_encoder.fit_transform(train_and_valid['tag'])
train_and_valid.head()

Unnamed: 0,guid,txt,fig,tag,label
0,4597,RT @AmitSwami77: The conspirators have an evil...,<PIL.Image.Image image mode=RGB size=598x303 a...,negative,0
1,26,"Waxwing trills, Chickadees calling ""here sweet...",<PIL.Image.Image image mode=RGB size=600x450 a...,neutral,1
2,4383,@NYSE is looking a little despondent today...?...,<PIL.Image.Image image mode=RGB size=599x337 a...,negative,0
3,212,"FERVENT | S,M,L | 140k free PLASTIC CLIP, keyc...",<PIL.Image.Image image mode=RGB size=600x923 a...,positive,2
4,2626,Nice day chilling in the park yesterday reliev...,<PIL.Image.Image image mode=RGB size=600x800 a...,positive,2


In [7]:
test_path = './test_without_label.txt' 
df_test = pd.read_csv(test_path, sep=',')
df_test.head()

Unnamed: 0,guid,tag
0,8,
1,1576,
2,2320,
3,4912,
4,3821,


In [8]:
test = df_test.apply(load_data, axis=1)
# label_encoder = LabelEncoder()
# train_and_valid['label'] = label_encoder.fit_transform(train_and_valid['tag'])
test.head()

Unnamed: 0,guid,txt,fig,tag
0,8,Energetic training today with our San Antonio ...,<PIL.Image.Image image mode=RGB size=599x447 a...,
1,1576,Let your voice be heard! 18+ #endsuicide #blit...,<PIL.Image.Image image mode=RGB size=600x424 a...,
2,2320,RT @Austin_Powers__: Shark Week would be so mu...,<PIL.Image.Image image mode=RGB size=444x360 a...,
3,4912,#TheTruthCaster http://t.co/S8jvqpKq5h\n,<PIL.Image.Image image mode=RGB size=600x600 a...,
4,3821,RT @jarpad: Hey #WBSDCC look what we're up to!...,<PIL.Image.Image image mode=RGB size=599x399 a...,


In [9]:
# 合并训练集和测试集的文本数据
combined_corpus = pd.concat([train_and_valid['txt'], test['txt']], axis=0)
# combined_corpus = combined_corpus.apply(lambda x: x.lower().split())

# 训练 Word2Vec 模型
# word2vec = Word2Vec(sentences=combined_corpus, vector_size=100, window=5, min_count=3, sg=1)
# word2vec.train(combined_corpus, total_examples=len(combined_corpus), epochs=10)

In [14]:
# train_and_valid['doc_vector'] = train_and_valid['txt'].apply(lambda x: document_vector(word2vec, x.split()))
# test['doc_vector'] = test['txt'].apply(lambda x: document_vector(word2vec, x.split()))
vocab_size = 1500
max_len = 50
tokenizer = Tokenizer()
tokenizer.fit_on_texts(combined_corpus)
train_seq = tokenizer.texts_to_sequences(train_and_valid['txt'])
test_seq = tokenizer.texts_to_sequences(test['txt'])
train_seq = pad_sequences(train_seq, maxlen=max_len, padding='post')
test_seq = pad_sequences(test_seq, maxlen=max_len, padding='post')
train_and_valid['docvec'] = [seq for seq in train_seq]
test['docvec'] = [seq for seq in test_seq]
train_and_valid.head()

Unnamed: 0,guid,txt,fig,tag,label,docvec
0,4597,RT @AmitSwami77: The conspirators have an evil...,<PIL.Image.Image image mode=RGB size=598x303 a...,negative,0,"[1, 5060, 5, 5061, 46, 48, 272, 960, 22, 66, 2..."
1,26,"Waxwing trills, Chickadees calling ""here sweet...",<PIL.Image.Image image mode=RGB size=600x450 a...,neutral,1,"[5065, 5066, 2860, 1611, 98, 2085, 179, 2861, ..."
2,4383,@NYSE is looking a little despondent today...?...,<PIL.Image.Image image mode=RGB size=599x337 a...,negative,0,"[5069, 12, 242, 6, 143, 560, 55, 2, 3, 4, 5070..."
3,212,"FERVENT | S,M,L | 140k free PLASTIC CLIP, keyc...",<PIL.Image.Image image mode=RGB size=600x923 a...,positive,2,"[725, 165, 273, 561, 5071, 192, 1612, 1103, 50..."
4,2626,Nice day chilling in the park yesterday reliev...,<PIL.Image.Image image mode=RGB size=600x800 a...,positive,2,"[395, 36, 2863, 9, 5, 619, 434, 5076, 16, 1298..."


In [15]:
test.head()

Unnamed: 0,guid,txt,fig,tag,docvec
0,8,Energetic training today with our San Antonio ...,<PIL.Image.Image image mode=RGB size=599x447 a...,,"[171, 707, 55, 21, 52, 1385, 2468, 28, 16282, ..."
1,1576,Let your voice be heard! 18+ #endsuicide #blit...,<PIL.Image.Image image mode=RGB size=600x424 a...,,"[237, 31, 1608, 24, 1277, 787, 16284, 4673, 31..."
2,2320,RT @Austin_Powers__: Shark Week would be so mu...,<PIL.Image.Image image mode=RGB size=444x360 a...,,"[1, 4543, 16287, 4589, 191, 247, 24, 23, 80, 3..."
3,4912,#TheTruthCaster http://t.co/S8jvqpKq5h\n,<PIL.Image.Image image mode=RGB size=600x600 a...,,"[16290, 2, 3, 4, 16291, 0, 0, 0, 0, 0, 0, 0, 0..."
4,3821,RT @jarpad: Hey #WBSDCC look what we're up to!...,<PIL.Image.Image image mode=RGB size=599x399 a...,,"[1, 16292, 593, 3753, 114, 59, 398, 44, 7, 152..."


In [13]:
max_token_len = 0
for i in range(len(train_and_valid)):
    max_token_len = max(max_token_len, len(train_and_valid['docvec'][i]))
for i in range(len(test)):
    max_token_len = max(max_token_len, len(test['docvec'][i]))
print(max_token_len)

33


In [20]:
class AlexNet(nn.Module):
    def __init__(self, outputdim=500, input_channels=3):
        super(AlexNet, self).__init__()
        self.features = nn.Sequential(
            nn.Conv2d(input_channels, 96, kernel_size=11, stride=4, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(96, 256, kernel_size=5, padding=2),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
            nn.Conv2d(256, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2),
        )
        self.classifier = nn.Sequential(
            nn.Dropout(),
            nn.Linear(256 * 6 * 6, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, outputdim),
        )
    
    def forward(self, x):
        x = self.features(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        x = self.classifier(x)
        return x
    
class TextRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, outputdim):
        super(TextRNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, outputdim)
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        embedded = self.embedding(x)
        lstm_out, _ = self.lstm(embedded)
        # 使用最后一个时间步的输出作为分类
        lstm_out = lstm_out[:, -1, :]
        output = self.fc(lstm_out)
        output = self.softmax(output)
        return output
    
class MultiModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, single_outputdim, hidden_size, num_classes):
        super(MultiModel, self).__init__()
        self.fig_model = AlexNet()
        self.txt_model = TextRNN(vocab_size, embedding_dim, hidden_size, single_outputdim)
        self.act = nn.ReLU(inplace=True)
        self.fc = nn.Linear(single_outputdim * 2, num_classes)
        
    def forward(self, fig_data, txt_data, mode):
        fig_out = self.fig_model(fig_data)
        txt_out = self.txt_model(txt_data)
        if mode == 1:
            fig_out *= 0
        elif mode == 2:
            txt_out *= 0
        concated_out = torch.concat((fig_out, txt_out), dim=1)
        out = self.fc(self.act(concated_out))
        return out


class MyDataset(Dataset):
    def __init__(self, dataframe, transform):
        self.data = dataframe
        self.transform = transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        img_data = self.data.iloc[index]['fig']
        txt_data = self.data.iloc[index]['docvec']
        label = self.data.iloc[index]['label']

        if self.transform:
            img_data = self.transform(img_data)

        return img_data, txt_data, label

# Preprocess your image data, if necessary
# Define your transformations (you might need to customize this based on your data)
transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
        ])

# Create datasets and loaders
# train_and_test = train_and_valid.drop(columns=['txt', 'guid'])

train_set, valid_set = train_test_split(train_and_valid, test_size=0.2, random_state=42)

train_dataset = MyDataset(train_set, transform=transform)
valid_dataset = MyDataset(valid_set, transform=transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
valid_loader = DataLoader(valid_dataset, batch_size=64, shuffle=False)

# Instantiate your LeNet model
model = MultiModel(len(tokenizer.word_index) + 1, 100, 500, 128, 3)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()

    for img, txt, labels in train_loader:
        optimizer.zero_grad()
        outputs = model(img, txt, 0)
        labels = labels.long()
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

    # Validation
    model.eval()
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for img, txt, labels in valid_loader:
            outputs = model(img, txt, 0)
            labels = labels.long()
            preds = torch.argmax(outputs, dim=1)
            all_preds.extend(preds.numpy())
            all_labels.extend(labels.numpy())
            val_loss = criterion(outputs, labels)

    accuracy = accuracy_score(all_labels, all_preds)
    print(f'Epoch {epoch + 1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}, Accuracy: {accuracy}')

# Save the trained model if needed
# torch.save(model.state_dict(), 'lenet_model.pth')

Epoch 1/10, Training Loss: 1.0551929473876953, Validation Loss: 1.0243786573410034, Accuracy: 0.60375
Epoch 2/10, Training Loss: 0.8408408164978027, Validation Loss: 1.0471746921539307, Accuracy: 0.60375
Epoch 3/10, Training Loss: 0.906380832195282, Validation Loss: 1.0424479246139526, Accuracy: 0.60375
Epoch 4/10, Training Loss: 0.8813992738723755, Validation Loss: 1.0387036800384521, Accuracy: 0.60375
Epoch 5/10, Training Loss: 1.0592097043991089, Validation Loss: 1.0187287330627441, Accuracy: 0.60375
Epoch 6/10, Training Loss: 0.8108882904052734, Validation Loss: 1.03138267993927, Accuracy: 0.60375
Epoch 7/10, Training Loss: 0.9195077419281006, Validation Loss: 1.03811514377594, Accuracy: 0.60375
Epoch 8/10, Training Loss: 0.8673166632652283, Validation Loss: 1.080847978591919, Accuracy: 0.60375
Epoch 9/10, Training Loss: 0.8722095489501953, Validation Loss: 1.0589804649353027, Accuracy: 0.60375
Epoch 10/10, Training Loss: 0.9523080587387085, Validation Loss: 1.1240830421447754, Acc