In [1]:
import os
import tensorflow as tf
import json
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, classification_report
from sklearn.metrics import confusion_matrix
# from tensorflow.keras.models import Model
# from tensorflow.keras.layers import Input, Embedding, Conv1D, Dense, LSTM, Dropout, Bidirectional, MaxPooling1D, GlobalAveragePooling1D, AdditiveAttention, SpatialDropout1D
# from tensorflow.keras.optimizers import Adam
# from tensorflow.keras.layers import AdditiveAttention
# from tensorflow.keras.callbacks import EarlyStopping




In [17]:
url_data = 'https://raw.githubusercontent.com/TeodorRusKvi/Tekstanalyse/main/git_NLP_data/'

# Last inn 'X_train_LSTM' fra en CSV-fil
X_df = pd.read_csv(url_data+'X_tensorflow.csv')
# Konverter hele DataFrame til et NumPy array
X = X_df.to_numpy()

# Last inn 'y_train_LSTM' fra en CSV-fil
y_df = pd.read_csv(url_data+'y_liberal.csv')
# Konverter hele DataFrame til et NumPy array
y = y_df.to_numpy().flatten()

# Last inn 'y_train_LSTM' fra en CSV-fil
embeddings_GloVe = pd.read_csv(url_data+'embeddings_glove.csv')
# Konverter hele DataFrame til et NumPy array
embeddings_GloVe = embeddings_GloVe.to_numpy()


# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42)

In [19]:
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader, Dataset

# Assuming X and y are your tokenized data and labels

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_test, y_test, test_size=0.5, random_state=42, shuffle=True, stratify=y_test)

# Custom Dataset Class
class TextDataset(Dataset):
    def __init__(self, texts, labels, embeddings, max_length=100):
        self.texts = texts
        self.labels = labels
        self.embeddings = embeddings
        self.max_length = max_length
    
    def __len__(self):
        return len(self.texts)
    
    def __getitem__(self, idx):
        text = self.texts[idx]
        label = self.labels[idx]
        
        embedding = np.zeros((self.max_length, 100))  # Assuming GloVe embeddings of size 300
        for i, token in enumerate(text):
            if i >= self.max_length:
                break
            if token in self.embeddings:
                embedding[i] = self.embeddings[token]
        
        return torch.tensor(embedding, dtype=torch.float32), torch.tensor(label, dtype=torch.long)

# Initialize datasets and dataloaders
train_dataset = TextDataset(X_train, y_train, embeddings_GloVe)
val_dataset = TextDataset(X_val, y_val, embeddings_GloVe)
test_dataset = TextDataset(X_test, y_test, embeddings_GloVe)

train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

# Define the LSTM model
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(LSTMClassifier, self).__init__()
        self.lstm = nn.LSTM(input_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, hidden):
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out[:, -1, :])  # Use the last output of LSTM
        return out, hidden

input_dim = 100  # GloVe embedding size
hidden_dim = 128  # LSTM hidden units
output_dim = 2  # Liberal or conservative
learning_rate = 0.001
num_epochs = 10

model = LSTMClassifier(input_dim, hidden_dim, output_dim)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Training function
def train_model(train_loader, model, criterion, optimizer, num_epochs):
    model.train()
    for epoch in range(num_epochs):
        total_loss = 0
        for texts, labels in train_loader:
            optimizer.zero_grad()
            hidden = (torch.zeros(1, texts.size(0), hidden_dim),
                      torch.zeros(1, texts.size(0), hidden_dim))
            outputs, _ = model(texts, hidden)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()
        print(f'Epoch {epoch+1}/{num_epochs}, Loss: {total_loss/len(train_loader)}')

# Train the model
train_model(test_loader, model, criterion, optimizer, num_epochs)

Epoch 1/10, Loss: 0.6557265744834649
Epoch 2/10, Loss: 0.6500714401729771
Epoch 3/10, Loss: 0.6500520491209186
Epoch 4/10, Loss: 0.6500492936275044
Epoch 5/10, Loss: 0.6500347686595604
Epoch 6/10, Loss: 0.650027765602362
Epoch 7/10, Loss: 0.6500226181061541
Epoch 8/10, Loss: 0.650016764148337
Epoch 9/10, Loss: 0.6500046702681995
Epoch 10/10, Loss: 0.6499871523653875
