In [None]:
import os
import random

import docx
import torch
from datasets import Dataset
from google.colab import files
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from transformers import TrainingArguments, Trainer

# Step 1: Log in using a Hugging Face API token.
# The token is read from the HF_TOKEN environment variable when it is set, so
# a real secret never has to be pasted into (and shared with) the notebook.
# The inline placeholder is kept as a fallback for backward compatibility.
login(token=os.environ.get("HF_TOKEN") or "enter your token")

# Step 2: Upload your Word file.
# The keys of the returned mapping are used below as the uploaded file names.
uploaded = files.upload()

# Function to extract text from the Word document
def extract_text_from_docx(doc_path):
    """Return the non-blank paragraphs of a Word document.

    Args:
        doc_path: Location of the .docx file, passed straight to
            ``docx.Document``.

    Returns:
        A list holding the whitespace-stripped text of every paragraph, in
        document order. Paragraphs that are empty or whitespace-only are
        left out.
    """
    stripped = (para.text.strip() for para in docx.Document(doc_path).paragraphs)
    return [chunk for chunk in stripped if chunk]

# Get the uploaded file name
# Nothing uploaded (e.g. the dialog was dismissed) would otherwise surface as
# an opaque IndexError, so fail early with an actionable message instead.
if not uploaded:
    raise ValueError("No file was uploaded; please upload a .docx file.")
doc_path = next(iter(uploaded))  # First uploaded file name
dataset = extract_text_from_docx(doc_path)

# Step 3: Split the Dataset into 80-20 Ratio
def split_dataset(data, train_ratio=0.8, seed=None):
    """Shuffle *data* and split it into a train and a test portion.

    The samples are copied before shuffling, so the caller's sequence keeps
    its original order.

    Args:
        data: Iterable of samples.
        train_ratio: Fraction of the samples that goes into the train
            portion; the train size is ``int(len(data) * train_ratio)``.
        seed: Optional seed for a reproducible shuffle. When ``None`` the
            module-level ``random`` generator is used, so a prior
            ``random.seed(...)`` call is still honoured.

    Returns:
        A ``(train, test)`` tuple of lists.
    """
    shuffled = list(data)  # Work on a copy: never reorder the caller's data
    rng = random if seed is None else random.Random(seed)
    rng.shuffle(shuffled)
    split_index = int(len(shuffled) * train_ratio)
    return shuffled[:split_index], shuffled[split_index:]

train_data, test_data = split_dataset(dataset)

# Save the train and test data; these files are read back when the data is
# tokenized for fine-tuning.
train_file_path = '/content/train_data.txt'
test_file_path = '/content/test_data.txt'
# One sample per line. Line breaks inside a sample are flattened to spaces so
# a sample cannot turn into several lines when the file is read back, and the
# encoding is explicit so non-ASCII text does not depend on the locale.
for split_path, split_samples in (
    (train_file_path, train_data),
    (test_file_path, test_data),
):
    with open(split_path, 'w', encoding='utf-8') as f:
        f.writelines(
            f"{' '.join(sample.splitlines())}\n" for sample in split_samples
        )

# Step 4: Load the tokenizer and model (using a smaller model to reduce memory usage)
model_name = "distilgpt2"  # Using a smaller model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Ensure the tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
    # The vocabulary may have grown, so the embedding matrix is resized to match.
    model.resize_token_embeddings(len(tokenizer))
# Record the pad id on the model config so it stays in sync with the
# tokenizer and is stored alongside the model when it is saved.
model.config.pad_token_id = tokenizer.pad_token_id

# Step 5: Prepare the Dataset for Fine-Tuning
def tokenize_data(file_path, tokenizer):
    """Tokenize a one-sample-per-line text file for causal-LM fine-tuning.

    Args:
        file_path: Path of a UTF-8 text file holding one sample per line.
        tokenizer: Callable tokenizer. It is invoked with the list of lines
            and must return a mapping that contains ``'input_ids'`` and
            ``'attention_mask'`` tensors.

    Returns:
        The tokenizer output with an added ``'labels'`` entry: a copy of
        ``'input_ids'`` in which every padded position (attention mask 0) is
        replaced by -100, so the loss ignores padding.

    Raises:
        ValueError: If the file contains no non-blank lines.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # The line terminator is a file artifact rather than sample text, and
        # blank lines carry no training signal, so both are dropped.
        lines = [line.rstrip('\r\n') for line in f if line.strip()]
    if not lines:
        raise ValueError(f"No text samples found in {file_path!r}")

    inputs = tokenizer(
        lines,
        return_tensors="pt",
        padding=True,
        truncation=True,
        max_length=256  # Reduce max_length to save memory
    )
    # Labels mirror the inputs for causal LM. Masking uses the attention mask
    # rather than the pad id so genuine tokens are never hidden, even if the
    # pad token happens to share an id with another special token.
    labels = inputs['input_ids'].clone()
    labels[inputs['attention_mask'] == 0] = -100
    inputs['labels'] = labels
    return inputs

train_inputs = tokenize_data(train_file_path, tokenizer)
test_inputs = tokenize_data(test_file_path, tokenizer)

# Convert to Hugging Face Dataset
# Both splits carry the same three columns, so they are built from one shared
# list of column names.
_dataset_columns = ('input_ids', 'attention_mask', 'labels')
train_dataset = Dataset.from_dict(
    {column: train_inputs[column] for column in _dataset_columns}
)
test_dataset = Dataset.from_dict(
    {column: test_inputs[column] for column in _dataset_columns}
)

# Check if CUDA is available and set the device accordingly
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Step 6: Set up TrainingArguments for standard fine-tuning
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version before changing it.
training_args = TrainingArguments(
    output_dir="/content/gpt2_finetuned",  # Directory for model output
    per_device_train_batch_size=1,  # Reduce batch size to fit in memory
    per_device_eval_batch_size=1,  # Reduce batch size to fit in memory
    gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps
    num_train_epochs=5,  # Number of epochs
    logging_dir="/content/logs",  # Directory for logging
    evaluation_strategy="epoch",  # Evaluation strategy
    save_steps=500,  # Save the model every 500 steps
    save_total_limit=2,  # Keep only the last 2 checkpoints
    # Mixed precision needs a GPU; on the CPU fallback chosen above it is
    # switched off so the script can still run (in full precision).
    fp16=(device.type == "cuda"),
)

# Step 7: Define the function for generating answers
def generate_answer(question):
    """Generate a beam-search continuation of *question*.

    Uses the module-level ``tokenizer``, ``model`` and ``device``.

    Args:
        question: Prompt text fed to the model.

    Returns:
        The first returned sequence decoded to text, with special tokens
        removed.
    """
    inputs = tokenizer(question, return_tensors="pt").to(device)
    # Inference must not run with dropout active, whatever mode the model
    # was left in by earlier training.
    model.eval()
    output = model.generate(
        **inputs,  # input_ids together with the matching attention mask
        max_length=150,
        num_beams=5,
        no_repeat_ngram_size=2,
        early_stopping=True,
        # Explicit pad id avoids generate() guessing one from the EOS token.
        pad_token_id=tokenizer.pad_token_id,
        # temperature / top_p / top_k are intentionally not passed: they only
        # take effect with do_sample=True, and this function decodes with
        # deterministic beam search.
    )
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Step 8: Test Before Fine-Tuning
# The same prompt is asked before and after training so the two answers can
# be compared side by side.
new_question = (
    "Question: What did the court conclude about the funds "
    "in the husband’s premier accounts in FCMC 173/2015?"
)
print("Testing the pretrained model...")
pre_finetuning_answer = generate_answer(new_question)
print("Answer Before Fine-Tuning:", pre_finetuning_answer)

# Step 9: Fine-tune the model
trainer_config = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
trainer = Trainer(**trainer_config)
trainer.train()

# Step 10: Test After Fine-Tuning
print("Testing the fine-tuned model...")
post_finetuning_answer = generate_answer(new_question)
print("Answer After Fine-Tuning:", post_finetuning_answer)

# Step 11: Save the fine-tuned model
trainer.save_model("/content/gpt2_finetuned")
