In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences




# Load dataset
def load_dataset(filename):
    """Read a comma-separated parallel corpus into a DataFrame.

    Args:
        filename: Path of the CSV file to read. The rest of this notebook
            reads the 'en' and 'zu' columns of the returned frame.

    Returns:
        pandas.DataFrame holding the contents of ``filename``.
    """
    # Read the path the caller asked for rather than a hard-coded location.
    df = pd.read_csv(filename, sep=',')
    # A short preview is enough to sanity-check the load.
    print(df.head())
    return df

# Preprocessing
def preprocess(df, num_samples=None, random_state=None):
    """Shuffle the sentence pairs and wrap every target sentence in markers.

    Args:
        df: DataFrame with a 'zu' column of target-language strings.
        num_samples: If truthy, only the first ``num_samples`` rows are kept
            (before shuffling).
        random_state: Optional seed for the shuffle, for reproducible runs.

    Returns:
        A new, shuffled DataFrame whose 'zu' values read
        ``'\\t <sentence> \\n'``. The frame passed in is not modified.
    """
    if num_samples:
        df = df[:num_samples]

    # sample(frac=1) yields a shuffled copy, leaving the caller's frame intact.
    shuffled = df.sample(frac=1, random_state=random_state)

    # '\t' marks the start and '\n' the end of a target sentence (the same
    # markers predict_sequence looks up). They are separated from the sentence
    # by single spaces so that a tokenizer splitting on ' ' sees them as
    # tokens of their own instead of as part of the first/last word.
    return shuffled.assign(zu=shuffled['zu'].map(lambda text: '\t ' + text + ' \n'))

# Create tokenizer
def create_tokenizer(data):
    """Return a Keras ``Tokenizer`` fitted on the texts in ``data``.

    The tokenizer is configured with an empty filter string, so no characters
    are stripped, and with '<OOV>' as its out-of-vocabulary token.
    """
    fitted = Tokenizer(oov_token='<OOV>', filters='')
    fitted.fit_on_texts(data)
    return fitted

# Prepare data
def prepare_data(df, num_samples=None, input_tokenizer=None, target_tokenizer=None,
                 input_maxlen=None, target_maxlen=None):
    """Turn a frame of sentence pairs into padded integer sequences.

    Args:
        df: DataFrame with 'en' (source) and 'zu' (target) text columns.
        num_samples: Forwarded to ``preprocess`` to limit the number of rows.
        input_tokenizer: Already-fitted tokenizer for the source side. When
            omitted, a new one is fitted on this frame's 'en' column.
        target_tokenizer: Already-fitted tokenizer for the target side. When
            omitted, a new one is fitted on this frame's 'zu' column.
        input_maxlen: Fixed padded length for source sequences; ``None`` pads
            to the longest sequence in this frame.
        target_maxlen: Fixed padded length for target sequences; ``None`` pads
            to the longest sequence in this frame.

    Returns:
        Tuple ``(input_seq, target_seq, input_tokenizer, target_tokenizer)``.

    Pass the tokenizers and lengths obtained from the training split when
    encoding validation or test data; otherwise each split gets its own
    vocabulary indices and padded width, which do not match the trained model.
    """
    df = preprocess(df, num_samples)
    input_data = df['en'].values
    target_data = df['zu'].values

    # Only fit new vocabularies when the caller did not supply fitted ones.
    if input_tokenizer is None:
        input_tokenizer = create_tokenizer(input_data)
    if target_tokenizer is None:
        target_tokenizer = create_tokenizer(target_data)

    input_seq = input_tokenizer.texts_to_sequences(input_data)
    target_seq = target_tokenizer.texts_to_sequences(target_data)

    # With maxlen=None nothing is truncated. When a length is given, cut from
    # the end so the beginning of each sequence is kept.
    input_seq = pad_sequences(input_seq, maxlen=input_maxlen,
                              padding='post', truncating='post')
    target_seq = pad_sequences(target_seq, maxlen=target_maxlen,
                               padding='post', truncating='post')

    return input_seq, target_seq, input_tokenizer, target_tokenizer

# Define encoder-decoder model
def build_model(input_vocab_size, target_vocab_size, enc_seq_len, dec_seq_len, latent_dim):
    """Assemble the LSTM encoder-decoder network used for training.

    The encoder embeds a source sequence of length ``enc_seq_len`` and keeps
    only the final hidden and cell state of its LSTM. The decoder embeds a
    target sequence of any length, runs an LSTM initialised with those two
    states, and applies a softmax over ``target_vocab_size`` entries at every
    step. Both embeddings and both LSTMs have ``latent_dim`` units.

    ``dec_seq_len`` is accepted but not used by this function.

    Returns:
        A Model taking ``[encoder tokens, decoder tokens]`` and producing the
        per-step softmax output of the decoder.
    """
    # Encoder: tokens -> embeddings -> final LSTM states.
    source_tokens = Input(shape=(enc_seq_len,))
    source_vectors = Embedding(
        input_dim=input_vocab_size, output_dim=latent_dim, mask_zero=True
    )(source_tokens)
    _, final_h, final_c = LSTM(units=latent_dim, return_state=True)(source_vectors)

    # Decoder: tokens -> embeddings -> LSTM seeded with the encoder states.
    target_tokens = Input(shape=(None,))
    target_vectors = Embedding(
        input_dim=target_vocab_size, output_dim=latent_dim, mask_zero=True
    )(target_tokens)
    decoder_rnn = LSTM(units=latent_dim, return_sequences=True, return_state=True)
    step_outputs, _, _ = decoder_rnn(target_vectors, initial_state=[final_h, final_c])

    # Per-step distribution over the target vocabulary.
    step_probs = Dense(units=target_vocab_size, activation='softmax')(step_outputs)

    return Model(inputs=[source_tokens, target_tokens], outputs=step_probs)

# Train model
def train_model(model, X_train, y_train, X_val, y_val, batch_size, epochs):
    """Compile ``model`` and fit it with teacher forcing.

    Args:
        model: Model taking ``[encoder input, decoder input]``.
        X_train, X_val: 2-D arrays of encoded source sequences.
        y_train, y_val: 2-D arrays of encoded target sequences.
        batch_size: Batch size passed to ``fit``.
        epochs: Number of epochs passed to ``fit``.

    Returns:
        Whatever ``model.fit`` returns (the training history), so callers can
        inspect or plot the loss and accuracy curves.
    """
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    # Teacher forcing: the decoder is fed every target token except the last
    # and is trained to predict the same sequence shifted one step left.
    history = model.fit(
        [X_train, y_train[:, :-1]], y_train[:, 1:],
        validation_data=([X_val, y_val[:, :-1]], y_val[:, 1:]),
        batch_size=batch_size, epochs=epochs)
    return history

# Prediction
def predict_sequence(input_seq, input_tokenizer, target_tokenizer, model, max_decoder_seq_len):
    """Greedily decode a translation for one encoded source sentence.

    Decoding runs the trained encoder-decoder ``model`` directly: the decoder
    input starts with the start token, and after every step the most probable
    next token is appended and the model is queried again. No separate
    inference models or module-level globals are needed.

    Args:
        input_seq: Array of shape (1, source_length) with the encoded source.
        input_tokenizer: Unused; kept so existing call sites keep working.
        target_tokenizer: Fitted tokenizer whose ``word_index`` contains the
            '\\t' start token and the '\\n' end token.
        model: Trained model taking ``[encoder input, decoder input]`` and
            returning per-step probabilities over the target vocabulary.
        max_decoder_seq_len: Maximum number of tokens to generate.

    Returns:
        The generated words joined by single spaces, without the markers.

    Raises:
        KeyError: If the target vocabulary lacks the start or end token.
    """
    word_index = target_tokenizer.word_index
    if '\t' not in word_index or '\n' not in word_index:
        raise KeyError(
            "target tokenizer has no '\\t' start / '\\n' end token; target "
            "sentences must be wrapped as '\\t <sentence> \\n' before fitting"
        )
    start_index = word_index['\t']
    end_index = word_index['\n']
    index_to_word = {index: word for word, index in word_index.items()}

    # Fixed-width decoder input, zero (padding) everywhere except the prefix
    # decoded so far. Keeping the width constant means every predict call sees
    # the same input shape.
    decoder_input = np.zeros((1, max_decoder_seq_len), dtype='int32')
    decoder_input[0, 0] = start_index

    words = []
    for step in range(max_decoder_seq_len):
        probabilities = model.predict([input_seq, decoder_input], verbose=0)
        next_index = int(np.argmax(probabilities[0, step, :]))

        # Index 0 is padding and has no word; the end token closes the sentence.
        if next_index == 0 or next_index == end_index:
            break

        words.append(index_to_word[next_index])
        if step + 1 < max_decoder_seq_len:
            decoder_input[0, step + 1] = next_index

    return ' '.join(words)

# Load dataset
dataset = load_dataset('Data/en-zu.training.csv')  # Provide path to your dataset

# Split dataset into train and validation sets
train_df, val_df = train_test_split(dataset, test_size=0.1)

# Prepare training data; the tokenizers are fitted on the training split only
X_train, y_train, input_tokenizer, target_tokenizer = prepare_data(train_df)

# Define model parameters
latent_dim = 256
enc_seq_len = X_train.shape[1]
dec_seq_len = y_train.shape[1]
input_vocab_size = len(input_tokenizer.word_index) + 1
target_vocab_size = len(target_tokenizer.word_index) + 1

# Encode the validation split with the *training* tokenizers and padded
# lengths so token indices and array widths match what the model is built for.
# The preprocessed frame is kept so row i of X_val lines up with row i of text.
val_prepared = preprocess(val_df)
X_val = pad_sequences(
    input_tokenizer.texts_to_sequences(val_prepared['en'].values),
    maxlen=enc_seq_len, padding='post', truncating='post')
y_val = pad_sequences(
    target_tokenizer.texts_to_sequences(val_prepared['zu'].values),
    maxlen=dec_seq_len, padding='post', truncating='post')

# Build model
model = build_model(input_vocab_size, target_vocab_size, enc_seq_len, dec_seq_len, latent_dim)

# Train model
batch_size = 64
epochs = 100
train_model(model, X_train, y_train, X_val, y_val, batch_size, epochs)

# Make predictions
for i in range(5):  # Make predictions for 5 random sentences
    index = np.random.randint(0, len(X_val))
    input_seq = X_val[index:index+1]
    translated_sentence = predict_sequence(input_seq, input_tokenizer, target_tokenizer, model, dec_seq_len)
    print('Input sentence:', val_prepared['en'].values[index])
    print('Translated sentence:', translated_sentence)


In [None]:
# Peek at the raw parallel corpus (a comma is read_csv's default separator).
df = pd.read_csv('Data/en-zu.training.csv')
df

In [23]:
# Import necessary libraries
# NOTE: this `load_dataset` (Hugging Face) replaces the notebook's own
# `load_dataset` helper defined in the first cell from here on.
from datasets import load_dataset
from transformers import (
    DataCollatorForSeq2Seq,
    MarianConfig,
    MarianMTModel,
    MarianTokenizer,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

# Load your dataset (a single CSV is exposed as the 'train' split)
dataset = load_dataset('csv', data_files='Data/en-zu.training.csv')

# Load pre-trained model and tokenizer
# MarianMTModel is the Marian class with the language-modelling head used for
# translation; transformers has no `MarianMTForConditionalGeneration`.
model_name = "Helsinki-NLP/opus-mt-en-zu"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Prepare your dataset
# Drop incomplete pairs, then carve out 80% train / 10% validation / 10% test.
pairs = dataset['train'].filter(
    lambda row: row['en'] is not None and row['zu'] is not None
)
train_rest = pairs.train_test_split(test_size=0.2, seed=42)
val_test = train_rest['test'].train_test_split(test_size=0.5, seed=42)


def tokenize_pairs(batch):
    """Tokenize English sources and Zulu targets with the model's tokenizer."""
    return tokenizer(
        batch['en'],
        text_target=batch['zu'],
        max_length=128,
        truncation=True,
    )


# Keep only the tokenized fields; the raw text columns are removed.
text_columns = pairs.column_names
train_dataset = train_rest['train'].map(tokenize_pairs, batched=True, remove_columns=text_columns)
val_dataset = val_test['train'].map(tokenize_pairs, batched=True, remove_columns=text_columns)
test_dataset = val_test['test'].map(tokenize_pairs, batched=True, remove_columns=text_columns)

# Pads inputs and labels per batch, since examples have different lengths.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# Define training arguments
training_args = Seq2SeqTrainingArguments(
    output_dir="fine_tuned_model",
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    predict_with_generate=True,
)

# Fine-tune the model
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)
trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print(results)

# Save the fine-tuned model
trainer.save_model("fine_tuned_model")


RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

ImportError: numpy.core.multiarray failed to import