In [1]:
# %pip install transformers torch datasets
# %pip install accelerate -U
# %pip install sentencepiece


In [2]:
import json
import torch
from transformers import BertTokenizer, BertForSequenceClassification
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Function to parse text lines and generate JSON
def generate_json_from_text(lines):
    json_data = []
    
    for line in lines:
        parts = line.split(" ")
        date = parts[0]
        time = parts[1]
        path = parts[3]
        fast_path = parts[5]
        
        # Create dictionary for each entry
        entry = {
            "date": date,
            "time": time,
            "path": path,
            "fast_path": fast_path
        }
        
        json_data.append(entry)
    
    return json_data

# Read from the text file
with open('/Users/prabirkalwani/cd_log.txt', 'r') as file:
    text_lines = file.readlines()

# Strip any extra spaces/newlines from each line
text_lines = [line.strip() for line in text_lines]

# Generate JSON from text lines
json_output = generate_json_from_text(text_lines)

# Convert to JSON string
json_formatted_str = json.dumps(json_output, indent=4)

# Save the JSON output to a file (optional)
with open('output.json', 'w') as json_file:
    json_file.write(json_formatted_str)

# Print the JSON output (for debugging)
print(json_formatted_str)


[
    {
        "date": "2024-09-04",
        "time": "21:59:05",
        "path": "/Users/prabirkalwani",
        "fast_path": "prabir"
    },
    {
        "date": "2024-09-04",
        "time": "21:59:10",
        "path": "/Users/prabirkalwani",
        "fast_path": "Data"
    },
    {
        "date": "2024-09-04",
        "time": "21:59:13",
        "path": "/Users/prabirkalwani/Data",
        "fast_path": "Programming"
    },
    {
        "date": "2024-09-04",
        "time": "21:59:17",
        "path": "/Users/prabirkalwani/Data/Programming",
        "fast_path": "Local_Models"
    },
    {
        "date": "2024-09-04",
        "time": "21:59:20",
        "path": "/Users/prabirkalwani/Data/Programming/Local_Models",
        "fast_path": ".."
    },
    {
        "date": "2024-09-04",
        "time": "21:59:21",
        "path": "/Users/prabirkalwani/Data/Programming",
        "fast_path": ".."
    },
    {
        "date": "2024-09-04",
        "time": "21:59:22",
        "path": "/

In [4]:

# Load data
with open('output.json', 'r') as f:
    data = json.load(f)

# Extract inputs and targets
inputs = [entry['fast_path'] for entry in data]
targets = [entry['path'] for entry in data]

# Save processed data (optional)
with open('processed_data.json', 'w') as f:
    json.dump({"inputs": inputs, "targets": targets}, f, indent=4)


In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch.utils.data import Dataset, DataLoader

class PathDataset(Dataset):
    def __init__(self, inputs, targets, tokenizer, max_len=128):
        self.inputs = inputs
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        input_text = self.inputs[idx]
        target_text = self.targets[idx]
        
        inputs = self.tokenizer(input_text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        targets = self.tokenizer(target_text, padding='max_length', truncation=True, max_length=self.max_len, return_tensors='pt')
        
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': targets['input_ids'].squeeze(0)
        }


In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load processed data
with open('processed_data.json', 'r') as f:
    processed_data = json.load(f)

inputs = processed_data['inputs']
targets = processed_data['targets']

# Initialize tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")

# Create dataset and dataloader
dataset = PathDataset(inputs, targets, tokenizer)
dataloader = DataLoader(dataset, batch_size=8, shuffle=True)

# Initialize model
model = T5ForConditionalGeneration.from_pretrained("t5-small")

# Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    save_steps=10_000,
    save_total_limit=2,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset
)

# Train the model
trainer.train()

# Save the model
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")
print("Model saved successfully!")


You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
100%|██████████| 12/12 [00:06<00:00,  1.84it/s]

{'train_runtime': 6.5231, 'train_samples_per_second': 14.257, 'train_steps_per_second': 1.84, 'train_loss': 12.465843200683594, 'epoch': 3.0}
Model saved successfully!





In [11]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load the model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("fine_tuned_model")
model = T5ForConditionalGeneration.from_pretrained("fine_tuned_model")
model.eval()

def predict_path(fast_path, model, tokenizer):
    inputs = tokenizer(fast_path, return_tensors='pt', padding='max_length', truncation=True)
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        predicted_path = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_path

# Example prediction
fast_path = "pra"
predicted_path = predict_path(fast_path, model, tokenizer)
print(f"Predicted Path for '{fast_path}': {predicted_path}")


Predicted Path for 'pra': pra


In [12]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
import torch

# Example function to encode paths
def encode_paths(paths, tokenizer, max_len=128):
    return tokenizer(paths, padding='max_length', truncation=True, max_length=max_len, return_tensors='pt')

# Example data
paths = [
    {"input": "prabir", "target": "/Users/prabirkalwani"},
    {"input": "projects", "target": "/Users/prabirkalwani/projects"},
    {"input": "Data", "target": "/Users/prabirkalwani/Data"}
]

# Prepare inputs and targets
inputs = [path["input"] for path in paths]
targets = [path["target"] for path in paths]

# Tokenize
tokenizer = T5Tokenizer.from_pretrained("t5-small")
input_encodings = encode_paths(inputs, tokenizer)
target_encodings = encode_paths(targets, tokenizer)

# Custom Dataset
class PathDataset(Dataset):
    def __init__(self, encodings, targets_encodings):
        self.encodings = encodings
        self.targets_encodings = targets_encodings

    def __getitem__(self, idx):
        return {
            'input_ids': self.encodings['input_ids'][idx],
            'attention_mask': self.encodings['attention_mask'][idx],
            'labels': self.targets_encodings['input_ids'][idx]
        }

    def __len__(self):
        return len(self.encodings['input_ids'])

dataset = PathDataset(input_encodings, target_encodings)


In [13]:
from transformers import Trainer, TrainingArguments

model = T5ForConditionalGeneration.from_pretrained("t5-small")

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=10_000,
    save_total_limit=2,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)

trainer.train()
model.save_pretrained("fine_tuned_model")
tokenizer.save_pretrained("fine_tuned_model")


100%|██████████| 3/3 [00:01<00:00,  1.64it/s]

{'train_runtime': 1.8244, 'train_samples_per_second': 4.933, 'train_steps_per_second': 1.644, 'train_loss': 13.620394388834635, 'epoch': 3.0}





('fine_tuned_model/tokenizer_config.json',
 'fine_tuned_model/special_tokens_map.json',
 'fine_tuned_model/spiece.model',
 'fine_tuned_model/added_tokens.json')

In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Load model and tokenizer
tokenizer = T5Tokenizer.from_pretrained("t5-small")
model = T5ForConditionalGeneration.from_pretrained("fine_tuned_model")

device = torch.device("mps" if torch.backends.mps.is_available() else "cpu")
model.to(device)

def predict_path(fast_path, model, tokenizer):
    inputs = tokenizer(fast_path, return_tensors='pt', padding='max_length', truncation=True).to(device)
    with torch.no_grad():
        outputs = model.generate(input_ids=inputs['input_ids'], attention_mask=inputs['attention_mask'])
        predicted_path = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return predicted_path

# Example prediction
fast_path = "prabir"
predicted_path = predict_path(fast_path, model, tokenizer)
print(f"Predicted Path for '{fast_path}': {predicted_path}")


RuntimeError: Placeholder storage has not been allocated on MPS device!