In [3]:
# Imports for data analysis and other fun things :3
import kagglehub
import pandas as pd
import ast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, DistilBertConfig, TrainingArguments, Trainer
import torch
from torch.utils.data import TensorDataset

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Fetch the Yelp dataset through kagglehub so the notebook works on any machine;
# `path` is the local directory the later cells read the CSV files from.
DATASET_HANDLE = "mobasshir/yelpdata"
path = kagglehub.dataset_download(DATASET_HANDLE)

print("Path to dataset files:", path)

Path to dataset files: C:\Users\Nixon Showalter\.cache\kagglehub\datasets\mobasshir\yelpdata\versions\2


In [5]:
# Load the two CSV files used by the rest of the notebook:
# business metadata and the Arizona review texts.
business_csv = f"{path}/yelp_business.csv"
review_csv = f"{path}/yelp_review_arizona.csv"

bus_df = pd.read_csv(business_csv)
review_df = pd.read_csv(review_csv)

In [6]:
# Show which columns each frame provides (business frame first, then reviews)
for frame in (bus_df, review_df):
    print(frame.columns)

Index(['business_id', 'name', 'address', 'city', 'state', 'postal_code',
       'latitude', 'longitude', 'stars', 'review_count', 'is_open',
       'attributes', 'categories', 'hours'],
      dtype='object')
Index(['review_id', 'user_id', 'business_id', 'text', 'stars', 'date'], dtype='object')


In [12]:
# Clean the business data to the required specifications
print("Business Data Pre-Cleaning: ", bus_df.shape)

# Keep only businesses with more than 50 reviews.
has_enough_reviews = bus_df['review_count'] > 50

# Keep only businesses whose attributes string is longer than 3 characters.
# `Series.size` is the number of elements in the whole column (a single int), so
# the previous `bus_df['attributes'].size > 3` was always True and filtered
# nothing. `.str.len()` measures each row's string instead; missing attributes
# give NaN, and NaN > 3 is False, so those rows are dropped as well.
# NOTE(review): if the specification means "more than 3 attribute keys" rather
# than "a non-trivial attributes string", count the parsed keys here instead.
has_attributes = bus_df['attributes'].str.len() > 3

clean_bus_df = bus_df[has_enough_reviews & has_attributes]

print("Business Data Post-Cleaning: ", clean_bus_df.shape)


Business Data Pre-Cleaning:  (192609, 14)
Business Data Post-Cleaning:  (26524, 14)


In [13]:
# Collapse every review of a business into one space-separated string,
# giving a Series of review text indexed by business_id.
def _join_reviews(texts):
    """Concatenate the review strings of one business, separated by spaces."""
    return " ".join(texts)

per_bus_reviews = review_df.groupby('business_id')['text'].apply(_join_reviews)

# Inner join on business_id: only businesses that passed cleaning AND have
# review text end up in the merged frame.
merged_df = clean_bus_df.merge(per_bus_reviews, how='inner', on='business_id')

In [9]:
# Changes the attributes column from a string that looks like a dict to a dict.
# ast.literal_eval only accepts Python literals, so unlike eval() it cannot run
# code that might be embedded in the downloaded CSV.
bus_attributes = merged_df['attributes'].apply(ast.literal_eval)

# Creates a list of all of the unique attributes a business can have.
# dict.fromkeys de-duplicates in a single pass while keeping first-seen order.
unique_keys = list(dict.fromkeys(
    key
    for attributes_dict in bus_attributes
    for key in attributes_dict
))

unique_attributes = sorted(unique_keys)

print("Unique attributes:", unique_attributes)

# Creates, for each business separately, the list of attribute names it has.
# A label is included whenever the key is present in the dict, whatever its value.
target_labels = [
    [attribute for attribute in unique_attributes if attribute in attributes_dict]
    for attributes_dict in bus_attributes
]

# Preps the multilabel binarizer and the fit labels for the BERT model:
# one row per business, one 0/1 column per entry of unique_attributes.
mlb = MultiLabelBinarizer(classes=unique_attributes)
labels = mlb.fit_transform(target_labels)

Unique attributes: ['AgesAllowed', 'Alcohol', 'Ambience', 'BYOB', 'BYOBCorkage', 'BestNights', 'BikeParking', 'BusinessAcceptsBitcoin', 'BusinessAcceptsCreditCards', 'BusinessParking', 'ByAppointmentOnly', 'Caters', 'CoatCheck', 'Corkage', 'DietaryRestrictions', 'DogsAllowed', 'DriveThru', 'GoodForDancing', 'GoodForKids', 'GoodForMeal', 'HappyHour', 'HasTV', 'Music', 'NoiseLevel', 'Open24Hours', 'OutdoorSeating', 'RestaurantsAttire', 'RestaurantsCounterService', 'RestaurantsDelivery', 'RestaurantsGoodForGroups', 'RestaurantsPriceRange2', 'RestaurantsReservations', 'RestaurantsTableService', 'RestaurantsTakeOut', 'Smoking', 'WheelchairAccessible', 'WiFi']


In [10]:
# Hold out 20% of the businesses for testing; the fixed random_state makes the
# split repeatable between runs.
review_texts = merged_df['text']
train_texts, test_texts, y_train, y_test = train_test_split(
    review_texts,
    labels,
    test_size=0.2,
    random_state=9,
)

In [None]:
# Build the tokenizer, config and model from the same DistilBERT checkpoint.
# The config asks for one output per attribute and multi-label classification.
checkpoint = 'distilbert-base-uncased'
max_tokens = 256

tokenizer = DistilBertTokenizer.from_pretrained(checkpoint)
config = DistilBertConfig.from_pretrained(
    checkpoint,
    num_labels=len(unique_attributes),
    problem_type="multi_label_classification",
)
model = DistilBertForSequenceClassification.from_pretrained(checkpoint, config=config)


def _encode(texts):
    """Tokenize a Series of texts into PyTorch tensors of exactly max_tokens tokens each."""
    # Truncation plus 'max_length' padding makes every sequence the same length,
    # so the tokenizer can return rectangular tensors for the TensorDataset.
    return tokenizer(
        texts.tolist(),
        truncation=True,
        padding='max_length',
        max_length=max_tokens,
        return_tensors='pt',
    )


def _as_dataset(encodings, label_tensor):
    """Bundle input ids, attention mask and labels into one TensorDataset."""
    return TensorDataset(encodings['input_ids'], encodings['attention_mask'], label_tensor)


# Encodings for the training and testing texts
train_encodings = _encode(train_texts)
test_encodings = _encode(test_texts)

# Label matrices as float tensors (float dtype is requested explicitly here)
train_labels_tensor = torch.tensor(y_train, dtype=torch.float)
test_labels_tensor = torch.tensor(y_test, dtype=torch.float)

# Datasets whose items are (input_ids, attention_mask, labels) tuples
train_dataset = _as_dataset(train_encodings, train_labels_tensor)
test_dataset = _as_dataset(test_encodings, test_labels_tensor)


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [29]:
# Minimal training configuration: where checkpoints are written, how many
# epochs to run, and the per-device batch size (same for training and evaluation).
OUTPUT_DIR = './distilbert_results'
NUM_EPOCHS = 3
BATCH_SIZE = 16

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
)

# Collates dataset samples into the keyword batch passed to the Trainer.
# Added because training raised a ValueError without a custom collator.
def data_collator(features):
    """Stack a list of (input_ids, attention_mask, labels) samples into a batch.

    Args:
        features: sequence of samples; positions 0, 1 and 2 of each sample hold
            the input ids, attention mask and labels tensors respectively.

    Returns:
        dict with keys 'input_ids', 'attention_mask' and 'labels', each the
        torch.stack of the corresponding position across all samples.
    """
    field_names = ('input_ids', 'attention_mask', 'labels')
    return {
        name: torch.stack([sample[position] for sample in features])
        for position, name in enumerate(field_names)
    }

# Wire the model, training configuration, datasets and custom collator into a
# Trainer; the held-out test set doubles as the evaluation set.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

In [30]:
# Fine-tune the model, then score it on the evaluation dataset and print the metrics
train_output = trainer.train()

eval_results = trainer.evaluate()
print("Results:", eval_results)


Step,Training Loss


Results: {'eval_loss': 0.238334521651268, 'eval_runtime': 46.2054, 'eval_samples_per_second': 6.125, 'eval_steps_per_second': 0.39, 'epoch': 3.0}
