In [2]:
import ast

import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import BertForTokenClassification, Trainer, TrainingArguments
from transformers import BertTokenizer
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


# Load dataset

In [3]:
# Read the training CSV and pull the utterances and their intents out as
# two parallel Python lists.
train_data = pd.read_csv('atis_train.csv')
train_text = train_data.loc[:, 'text'].to_list()
train_labels = train_data.loc[:, 'intent'].to_list()

In [4]:
# Read the test CSV and pull the utterances and their intents out as
# two parallel Python lists.
test_data = pd.read_csv('atis_test.csv')
test_text = test_data.loc[:, 'text'].to_list()
test_labels = test_data.loc[:, 'intent'].to_list()

In [5]:
# Show the first rows of each split, one frame after the other.
print(train_data.head(), test_data.head(), sep='\n')

   id       intent                                               text  \
0   0       flight  i want to fly from boston at 838 am and arrive...   
1   1       flight  what flights are available from pittsburgh to ...   
2   2  flight_time  what is the arrival time in san francisco for ...   
3   3      airfare            cheapest airfare from tacoma to orlando   
4   4      airfare  round trip fares from pittsburgh to philadelph...   

                                               slots  
0  O O O O O B-fromloc.city_name O B-depart_time....  
1  O O O O O B-fromloc.city_name O B-toloc.city_n...  
2  O O O B-flight_time I-flight_time O B-fromloc....  
3  B-cost_relative O O B-fromloc.city_name O B-to...  
4  B-round_trip I-round_trip O O B-fromloc.city_n...  
   id   intent                                               text  \
0   0   flight  i would like to find a flight from charlotte t...   
1   1  airfare  on april first i need a ticket from tacoma to ...   
2   2   flight  on april

# Combine the train and test datasets

In [6]:
# Stack the train and test rows into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# index labels of both inputs are kept and therefore repeat, which makes
# label-based lookups on the combined frame ambiguous.
# NOTE(review): 'slots' is selected only from train_data, so rows that come
# from test_data carry NaN in 'slots' — confirm whether the test slots
# should be included as well.
combined_data = pd.concat(
    [
        train_data[['text', 'intent', 'slots']],
        test_data[['text', 'intent']],
    ],
    ignore_index=True,
)

In [7]:
# Hold out 20% of the combined rows as a test set; the fixed random_state
# makes the shuffle repeatable. Tune test_size as required.
train_set, test_set = train_test_split(
    combined_data,
    test_size=0.2,
    random_state=42,
)


# Select samples relevant to return date and time information

In [8]:

# Keep only the rows whose intent is 'flight_time' or 'airfare'.
intent_mask = combined_data['intent'].isin(['flight_time', 'airfare'])
return_date_time_samples = combined_data[intent_mask]

# Initialize BERT tokenizer

In [9]:
# Load the pretrained tokenizer published under the 'bert-base-uncased'
# checkpoint name.
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased',
)

# Tokenize and label data

In [10]:
# Parallel accumulators: one token list and one tag list per utterance.
tokenized_texts = []
labels = []

for utterance in return_date_time_samples['text']:
    encoding = tokenizer.encode_plus(
        utterance,
        add_special_tokens=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    # Take the first entry of the returned ids and map each id back to its
    # token string (special tokens were requested above, so they are included).
    word_pieces = tokenizer.convert_ids_to_tokens(encoding['input_ids'][0])
    tokenized_texts.append(word_pieces)

    # Placeholder tagging: every token starts out as 'O'.
    # The logic that identifies and labels return date/time entities belongs
    # here; adapt it to the entity-labelling scheme you need.
    labels.append(['O'] * len(word_pieces))

# Build a DataFrame of tokenized texts and labels

In [11]:
# One row per utterance: its token list alongside the matching tag list.
preprocessed_data = pd.DataFrame(
    data={
        'text_tokens': tokenized_texts,
        'labels': labels,
    }
)

# Split the preprocessed data

In [12]:
# Hold out 20% of the tokenized rows for testing; the fixed random_state
# makes the shuffle repeatable. This rebinds train_set and test_set.
train_set, test_set = train_test_split(
    preprocessed_data,
    test_size=0.2,
    random_state=42,
)

# Save preprocessed data

In [13]:
# Write both splits to CSV in the working directory, leaving out the
# DataFrame index column.
train_set.to_csv(path_or_buf='train_set.csv', index=False)
test_set.to_csv(path_or_buf='test_set.csv', index=False)

# Load preprocessed train data

In [14]:
# Reload the saved training split.
# The 'text_tokens' and 'labels' columns hold Python lists, which CSV stores
# as their string form (e.g. "['[CLS]', 'what', ...]"). Parse them back with
# ast.literal_eval on load so each cell is a real list again instead of one
# long string.
train_set = pd.read_csv(
    'train_set.csv',
    converters={
        'text_tokens': ast.literal_eval,
        'labels': ast.literal_eval,
    },
)

# Fine-tune the BERT model

In [15]:
# `labels` holds one list of tags per utterance, so len(labels) is the number
# of utterances, not the number of tag classes. Size the classification head
# from the distinct tags instead, and record the tag <-> id mapping in the
# model config so predictions can be decoded back to tag names.
label_names = sorted({tag for tags in labels for tag in tags})
label2id = {name: idx for idx, name in enumerate(label_names)}
id2label = {idx: name for name, idx in label2id.items()}

model = BertForTokenClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_names),
    id2label=id2label,
    label2id=label2id,
)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Set up training arguments

In [16]:

# Hyperparameters and bookkeeping for the fine-tuning run.
training_args = TrainingArguments(
    # Where checkpoints and predictions are written.
    output_dir='./results',
    # Where the TensorBoard logs are written.
    logging_dir='./logs',
    # Optimisation settings.
    learning_rate=2e-5,
    num_train_epochs=10,
    per_device_train_batch_size=16,  # batch size on each device
    # Step intervals for logging metrics and for saving checkpoints.
    logging_steps=500,
    save_steps=1000,
)