In [1]:
pip install transformers



In [2]:
import pandas as pd
from transformers import GPT2Tokenizer

# Toy table used as a stand-in; swap in your own tabular data here
data = dict(
    Name=["John", "Alice", "Bob"],
    Age=[25, 30, 22],
    City=["New York", "San Francisco", "Los Angeles"],
)

df = pd.DataFrame(data)

# Serialise every row into one line of "[column]: value" pairs separated by spaces
text_data = []
for row_idx, record in df.iterrows():
    fields = [f"[{column}]: {str(value)}" for column, value in record.items()]
    text_data.append(" ".join(fields))


In [27]:
# Show the column-oriented dict that the sample DataFrame is built from
data

{'Name': ['John', 'Alice', 'Bob'],
 'Age': [25, 30, 22],
 'City': ['New York', 'San Francisco', 'Los Angeles']}

In [39]:
# Render the sample table
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Alice,30,San Francisco
2,Bob,22,Los Angeles


In [3]:
# Inspect the serialised rows: one "[column]: value ..." string per record
text_data

['[Name]: John [Age]: 25 [City]: New York',
 '[Name]: Alice [Age]: 30 [City]: San Francisco',
 '[Name]: Bob [Age]: 22 [City]: Los Angeles']

In [4]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from torch.utils.data import DataLoader

# Tokenizer for the same "gpt2-large" checkpoint that is fine-tuned below
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-large")

# Rebuild the "[column]: value" line for every row of the table
text_data = [
    " ".join(f"[{column}]: {str(value)}" for column, value in record.items())
    for _idx, record in df.iterrows()
]

# Encode row by row. With return_tensors="pt" each item is a 2-D tensor of
# shape (1, n_tokens), so the batches produced below are 3-D: (batch, 1, n_tokens).
tokenized_data = []
for line in text_data:
    tokenized_data.append(tokenizer.encode(line, return_tensors="pt"))

# Shuffled mini-batches of two rows each for the training loop
dataloader = DataLoader(tokenized_data, shuffle=True, batch_size=2)

# Example fine-tuning function
def fine_tune_gpt2(model_name, dataloader, output_dir, epochs=5, learning_rate=5e-5):
    """Fine-tune a GPT-2 checkpoint on tokenized rows and save the result.

    Each batch is used as both input and target: the language-modeling head
    computes the next-token loss when ``labels`` is supplied.

    Args:
        model_name: Hub id or local path given to
            ``GPT2LMHeadModel.from_pretrained``.
        dataloader: Iterable yielding integer tensors of token ids. Leading
            dimensions are flattened, so both ``(batch, n_tokens)`` and
            ``(batch, 1, n_tokens)`` batches are accepted.
        output_dir: Directory that receives the model and tokenizer files.
        epochs: Number of passes over ``dataloader`` (default 5).
        learning_rate: Step size for the AdamW optimizer.

    Note:
        The tokenizer written next to the model is the notebook-level
        ``tokenizer`` variable, so it must exist before this is called.
    """
    from torch.optim import AdamW

    # Load GPT-2 model
    model = GPT2LMHeadModel.from_pretrained(model_name)
    optimizer = AdamW(model.parameters(), lr=learning_rate)

    # Switch to training mode once, instead of on every batch
    model.train()

    for epoch in range(epochs):
        for inputs in dataloader:
            # Collapse any extra leading dimension into the batch dimension
            input_ids = inputs.reshape(-1, inputs.shape[-1]).to(model.device)

            optimizer.zero_grad()
            outputs = model(input_ids=input_ids, labels=input_ids)
            outputs.loss.backward()
            optimizer.step()

    # Leave the model in inference mode before persisting it
    model.eval()

    # Save the fine-tuned model
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)

# Run the fine-tuning step on the sample rows and write the result to ./output
fine_tune_gpt2(
    model_name="gpt2-large",
    dataloader=dataloader,
    output_dir="output",
)



vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/666 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.25G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [15]:
# Peek at every batch the loader yields
for batch in dataloader:
    print(batch)

tensor([[[   58,  5376,  5974,  5811,   685, 23396,  5974,  2534,   685, 14941,
           5974,  5401,  5652]],

        [[   58,  5376,  5974, 14862,   685, 23396,  5974,  1542,   685, 14941,
           5974,  2986,  6033]]])
tensor([[[   58,  5376,  5974,  1757,   685, 23396,  5974,  1679,   685, 14941,
           5974,   968,  1971]]])


In [7]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Directory the fine-tuning step saved to; point this at your own path if it differs
model_name = "output"
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)


In [35]:
def generate_text(prompt, max_length=100, temperature=0.7, top_k=100):
    """Sample a continuation of ``prompt`` with the notebook-level ``model``.

    Args:
        prompt: Text the model should continue.
        max_length: Passed to ``generate`` as ``max_length``; per the
            Transformers docs this limit counts the prompt tokens as well.
        temperature: Sampling temperature forwarded to ``generate``.
        top_k: Number of highest-probability tokens kept at each step.

    Returns:
        The decoded first generated sequence (prompt included), with special
        tokens removed.
    """
    # Calling the tokenizer also yields the attention mask that encode() omits
    encoded = tokenizer(prompt, return_tensors="pt")
    output = model.generate(
        encoded["input_ids"],
        # Passed explicitly because pad_token_id is the EOS id, so the mask
        # cannot be inferred from the ids
        attention_mask=encoded["attention_mask"],
        max_length=max_length,
        temperature=temperature,
        top_k=top_k,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True
    )
    generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_text

In [36]:
# Few-shot prompt: the three known rows, each prefixed with "User: " and joined by commas
seed_rows = [
    "User: [Name]: John [Age]: 25 [City]: New York",
    "User: [Name]: Alice [Age]: 30 [City]: San Francisco",
    "User: [Name]: Bob [Age]: 22 [City]: Los Angeles",
]
prompt = ",".join(seed_rows)
generated_text = generate_text(prompt, max_length=120)
print(generated_text)

User: [Name]: John [Age]: 25 [City]: New York,User: [Name]: Alice [Age]: 30 [City]: San Francisco,User: [Name]: Bob [Age]: 22 [City]: Los Angeles,User: [Name]: Bill [Age]: 22 [City]: New York,User: [Name]: Betty [Age]: 21 [City]: Brooklyn,User: [Name]: Danny [Age]: 17 [City]: New York,User: [Name]: Eric [Age]: 17 [City]: New York,User: [Name]: Freddie [Age]:


In [37]:
# Break the generated text into one piece per comma-separated record
x = generated_text.split(",")


In [38]:
# Inspect the comma-separated pieces of the generated text
x

['User: [Name]: John [Age]: 25 [City]: New York',
 'User: [Name]: Alice [Age]: 30 [City]: San Francisco',
 'User: [Name]: Bob [Age]: 22 [City]: Los Angeles',
 'User: [Name]: Bill [Age]: 22 [City]: New York',
 'User: [Name]: Betty [Age]: 21 [City]: Brooklyn',
 'User: [Name]: Danny [Age]: 17 [City]: New York',
 'User: [Name]: Eric [Age]: 17 [City]: New York',
 'User: [Name]: Freddie [Age]:']

In [47]:
# Drop the final piece, which is the record cut short by the max_length limit.
# Popping from the end does not depend on how many records were sampled.
# Run once per split: every re-run removes one more element.
x.pop()

'User: [Name]: Freddie [Age]:'

In [48]:
# Check the list again after the pop
x

['User: [Name]: John [Age]: 25 [City]: New York',
 'User: [Name]: Alice [Age]: 30 [City]: San Francisco',
 'User: [Name]: Bob [Age]: 22 [City]: Los Angeles',
 'User: [Name]: Bill [Age]: 22 [City]: New York',
 'User: [Name]: Betty [Age]: 21 [City]: Brooklyn',
 'User: [Name]: Danny [Age]: 17 [City]: New York',
 'User: [Name]: Eric [Age]: 17 [City]: New York']

In [54]:
import re
# One capture group per field: name and city accept word characters and
# whitespace, age accepts digits only. Incomplete records do not match.
pattern = r"User: \[Name\]: ([\w\s]+) \[Age\]: (\d+) \[City\]: ([\w\s]+)"
matches = [found.groups() for found in re.finditer(pattern, generated_text)]


In [55]:
# Build the table from the (name, age, city) tuples and cast Age from
# the captured digit string to an integer column
df = pd.DataFrame(matches, columns=["Name", "Age", "City"]).astype({"Age": int})

In [56]:
# Display the table parsed from the generated text
df

Unnamed: 0,Name,Age,City
0,John,25,New York
1,Alice,30,San Francisco
2,Bob,22,Los Angeles
3,Bill,22,New York
4,Betty,21,Brooklyn
5,Danny,17,New York
6,Eric,17,New York
