In [1]:
!pip install transformers
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import transformers
from transformers import pipeline



In [2]:
def create_output_string(row):
    formatted_strings = []

    for column, value in row.items():
        if pd.isna(value):
            value = ""

        formatted_strings.append(f"The {column} is {value}")

    output_string = " and ".join(formatted_strings)
    return output_string

In [3]:
def preprocessing():
        df = pd.read_excel('R MLS.xls')
        df.columns=df.iloc[0]
        df.drop(0,inplace=True)
        df = df.drop_duplicates()
        threshold=0.67
        null_percentages = df.isnull().mean()
        columns_to_drop = null_percentages[null_percentages > threshold].index
        df = df.drop(columns=columns_to_drop)
        numerical_columns = []

        for column in df.columns:
            if pd.to_numeric(df[column], errors='coerce').notnull().all():
                numerical_columns.append(column)

        df[numerical_columns] = df[numerical_columns].apply(pd.to_numeric, errors='coerce')
        categorical_cols = df.select_dtypes(include='object').columns
        numerical_cols = df.select_dtypes(include=['int', 'float']).columns


        for i in numerical_cols:
            df[i].fillna(df[i].median(), inplace=True)

        for i in categorical_cols:
            df[i].fillna(df[i].mode()[0], inplace=True)
        formatted_strings = []

        output_df = pd.DataFrame(df.apply(create_output_string, axis=1), columns=['Output'])
        return output_df

# New section

In [4]:
preprocessing().head()

Unnamed: 0,Output
1,The List Number is 6517305 and The Agency Name...
2,The List Number is 6564997 and The Agency Name...
3,The List Number is 6509227 and The Agency Name...
4,The List Number is 6567402 and The Agency Name...
5,The List Number is 6497979 and The Agency Name...


In [5]:
from transformers import pipeline

In [6]:
output_df=preprocessing()
output_df.iloc[0]

Output    The List Number is 6517305 and The Agency Name...
Name: 1, dtype: object

In [8]:
import torch
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, AdamW

# Load and preprocess the data
data = output_df[0:2]

data

Unnamed: 0,Output
1,The List Number is 6517305 and The Agency Name...
2,The List Number is 6564997 and The Agency Name...


In [9]:
# Prepare data for fine-tuning
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [10]:
# Add the padding token manually
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
input_ids = tokenizer.batch_encode_plus(
    data['Output'].tolist(),
    padding=True,
    truncation=True,
    return_tensors='pt'
)['input_ids']


In [11]:
# Fine-tune the GPT-2 model
config = GPT2Config.from_pretrained('gpt2')
model = GPT2LMHeadModel(config)
model.resize_token_embeddings(len(tokenizer))


Embedding(50258, 768)

In [12]:
# Set up fine-tuning parameters and optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)
loss_fn = torch.nn.CrossEntropyLoss()



In [13]:
# Fine-tune the model
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.train()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50258, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50258, bias=False)
)

In [14]:
num_epochs = 3
for epoch in range(num_epochs):
    for input_batch in input_ids:
        input_batch = input_batch.to(device)
        labels = input_batch.clone()  # Use the same input as the label for language modeling

        outputs = model(input_batch, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

In [15]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_model")

In [17]:
# Chatbot Inference
def chatbot_inference(user_query):
    # Preprocess user_query and tokenize
    input_ids = tokenizer.encode(user_query, return_tensors='pt').to(device)
    # Generate response with attention mask
    with torch.no_grad():
        output_ids = model.generate(input_ids, max_length=20, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    response = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    return response

# Example usage:
user_query = "What is the price of the property with List Number 6517305?"
response = chatbot_inference(user_query)
print(response)

What is the price of the property with List Number 6517305? and and and and The and


In [18]:
# Load the pre-trained model and tokenizer
model = GPT2LMHeadModel.from_pretrained("gpt2")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

# Function to generate response given an input text
def generate_response(input_text, max_length=50):
    # Tokenize the input text
    inputs = tokenizer(input_text, return_tensors="pt", max_length=max_length, padding=True)

    # Generate the output using the model
    with torch.no_grad():
        output = model.generate(**inputs)

    # Decode the generated output
    response = tokenizer.decode(output[0], skip_special_tokens=True)

    return response

# Example input
input_text = "What is the price of the property with List Number 6517305?"

# Generate response
response = generate_response(input_text)

# Print the response
print("Response:", response)








Using pad_token, but it is not set yet.


ValueError: ignored