In [1]:
import pandas as pd

# Read Processed_combined.csv (path is relative to the kernel's working
# directory) into `df`; the cells below use its 'Transcribed text' column.
df = pd.read_csv("Processed_combined.csv")

In [2]:

# Remove rows without a transcript *before* normalising the column to str.
# Casting first turns a missing value into the literal text "nan" on
# object-dtype columns, after which neither fillna() nor dropna() can detect
# it and the placeholder would be modelled as if it were a real document.
# .copy() gives an independent frame so the column assignment below is
# unambiguous.
df = df.dropna(subset=['Transcribed text']).copy()
df['Transcribed text'] = df['Transcribed text'].astype(str)

df.head()

Unnamed: 0,Transcribed text,label
0,make video embarrassing behavior normal person...,well-being
1,machine garden thing day sun today water tomor...,well-being
2,biggest paradigm shift made happiness skill mu...,well-being
3,happiness emotional response outcome win effec...,well-being
4,mojo save ditch video tip work tip happiness h...,well-being


In [4]:
import pandas as pd
import warnings
# NOTE(review): installs a blanket "ignore" filter, so every warning raised
# after this line is hidden, not only the noisy ones. Consider narrowing the
# filter to specific categories/modules.
warnings.filterwarnings("ignore")

from bertopic import BERTopic

def analyze_topics(texts, embedding_model='paraphrase-MiniLM-L3-v2', min_topic_size=7, top_n=20):
    """
    Fit a BERTopic model on a collection of documents and show a topic summary.

    Parameters
    ----------
    texts : iterable of str
        Documents to model. Any iterable is accepted (list, generator,
        pandas Series, ...); it is materialised into a plain list before
        being handed to BERTopic.
    embedding_model : str, optional
        Forwarded unchanged to ``BERTopic(embedding_model=...)``.
    min_topic_size : int, optional
        Forwarded unchanged to ``BERTopic(min_topic_size=...)``.
    top_n : int, optional
        How many rows of the topic-info table to display.

    Returns
    -------
    model : BERTopic
        The fitted topic model.
    topics
        The first value returned by ``model.fit_transform`` (the per-document
        topic assignments, in input order).

    Raises
    ------
    ValueError
        If ``texts`` yields no documents.
    """
    # Materialise once: a list has a stable length and order, and works for
    # generators as well as for pandas objects with a non-default index.
    documents = list(texts)
    if not documents:
        raise ValueError("analyze_topics() needs at least one document")

    # Initialize and fit the BERTopic model
    model = BERTopic(
        verbose=True,
        embedding_model=embedding_model,
        min_topic_size=min_topic_size,
    )
    topics, _ = model.fit_transform(documents)

    # Get and display topic information.
    # The printed count is the number of rows in the topic-info table, so it
    # includes the -1 row whenever BERTopic reports one.
    freq = model.get_topic_info()
    print(f"Number of topics: {len(freq)}")
    # `display` is the rich-output helper available in IPython/Jupyter sessions.
    display(freq.head(top_n))

    return model, topics

# Fit the topic model on the transcript column; keep the fitted model and the
# per-row topic assignments
model, topics = analyze_topics(texts=df['Transcribed text'])

# Record each row's topic next to its text
df['topic'] = topics

# Write the annotated frame to disk, leaving the row index out of the file
df.to_csv(path_or_buf='topics.csv', index=False)

2024-12-20 00:27:03,137 - BERTopic - Embedding - Transforming documents to embeddings.
Batches: 100%|██████████| 6/6 [00:01<00:00,  3.08it/s]
2024-12-20 00:27:07,785 - BERTopic - Embedding - Completed ✓
2024-12-20 00:27:07,786 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2024-12-20 00:27:18,877 - BERTopic - Dimensionality - Completed ✓
2024-12-20 00:27:18,882 - BERTopic - Cluster - Start clustering the reduced embeddings
2024-12-20 00:27:18,898 - BERTopic - Cluster - Completed ✓
2024-12-20 00:27:18,903 - BERTopic - Representation - Extracting topics from clusters using representation models.
2024-12-20 00:27:18,965 - BERTopic - Representation - Completed ✓


Number of topics: 6


Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,67,-1_thing_people_time_make,"[thing, people, time, make, life, back, friend...",[hey guy back channel valerina home jeanette k...
1,0,29,0_life_people_time_thing,"[life, people, time, thing, day, cat, happines...",[guy danny thinking past couple month honestly...
2,1,28,1_gon_na_thing_good,"[gon, na, thing, good, day, yeah, eat, video, ...",[lovely people today chat kind heavy man real ...
3,2,26,2_money_data_business_make,"[money, data, business, make, start, job, thin...",[past year created online business generated m...
4,3,23,3_card_socrates_move_knight,"[card, socrates, move, knight, queen, game, pl...",[talk card game playing card game specifically...
5,4,11,4_meditation_film_thing_time,"[meditation, film, thing, time, life, felt, de...",[wheaton actor writer producer generalized dis...


In [None]:
# Import necessary packages
import pandas as pd
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from sklearn.model_selection import train_test_split
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support, accuracy_score

# Reload the data so this cell does not depend on state from earlier cells.
df = pd.read_csv("Processed_combined.csv")
# Drop rows without a transcript *before* casting to str: casting first turns
# a missing value into the literal text "nan" on object-dtype columns, which
# fillna()/dropna() can no longer detect, and the classifier would then be
# trained on that placeholder text.
df = df.dropna(subset=['Transcribed text']).copy()
df['Transcribed text'] = df['Transcribed text'].astype(str)


# Give the transcript column a short name ('label' already has the name the
# rest of the cell expects, so it needs no entry here)
df = df.rename(columns={'Transcribed text': 'text'})

# Assign an integer id to every distinct label and replace the label column
# with those ids
label_dict = {name: code for code, name in enumerate(df['label'].unique())}
df['label'] = df['label'].map(label_dict)

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split
# repeatable
train_df, eval_df = train_test_split(df, random_state=42, test_size=0.1)

# Load the pretrained tokenizer for the 'bert-base-uncased' checkpoint.
# tokenize_data() below reads this module-level `tokenizer`, and it is saved
# next to the model at the end of the cell.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization function to preprocess the text data
def tokenize_data(example):
    """
    Tokenize ``example['text']`` with the module-level ``tokenizer``.

    Every sequence is truncated and padded to exactly 128 tokens; the
    tokenizer's output is returned unchanged.
    """
    encode_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(example['text'], **encode_options)

# Wrap each split in a Hugging Face Dataset and tokenize it batch-wise
train_dataset = Dataset.from_pandas(train_df).map(tokenize_data, batched=True)
eval_dataset = Dataset.from_pandas(eval_df).map(tokenize_data, batched=True)

# Load BERT with a sequence classification head.
# The label <-> id mapping is written into the model config so that it is
# saved together with the weights; otherwise `label_dict` exists only in this
# kernel session and a reloaded model could not translate predicted ids back
# to label names.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=len(label_dict),
    label2id={str(label): idx for label, idx in label_dict.items()},
    id2label={idx: str(label) for label, idx in label_dict.items()},
)

# Metric computation: we define this to get more details on the performance of the model
def compute_metrics(pred):
    """
    Turn a prediction object into a dict of evaluation metrics.

    ``pred`` must expose ``label_ids`` (the reference labels) and
    ``predictions`` (an array whose last axis is reduced with argmax to get
    the predicted class). Returns accuracy together with weighted-average
    F1, precision and recall.
    """
    y_true = pred.label_ids
    y_pred = pred.predictions.argmax(axis=-1)

    # The fourth value returned by the sklearn helper is not used here.
    weighted_scores = precision_recall_fscore_support(y_true, y_pred, average='weighted')
    precision, recall, f1 = weighted_scores[:3]
    acc = accuracy_score(y_true, y_pred)

    return dict(accuracy=acc, f1=f1, precision=precision, recall=recall)

# Training arguments
training_args = TrainingArguments(
    # Relative to the working directory, like the input CSV. The previous
    # '/results' and '/logs' pointed at the filesystem root, which is usually
    # not writable.
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=16,
    # Warm up over the first 10% of the optimisation steps. A fixed 500-step
    # warm-up is longer than an entire 3-epoch run at batch size 8 whenever the
    # training split has fewer than roughly 1,300 rows, in which case the
    # learning rate would never reach its configured value.
    warmup_ratio=0.1,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # Evaluate and checkpoint on the same schedule; load_best_model_at_end
    # needs the two strategies to match.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Bundle the model, hyper-parameters, data splits and metric callback into a
# Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
trainer.train()

# Save the fine-tuned model and its tokenizer into the SAME directory so the
# pair can be reloaded from one path. (The model used to be written to
# '/bert_finetuned' at the filesystem root while the tokenizer went to the
# relative 'bert_finetuned', splitting the two artifacts.)
model.save_pretrained('bert_finetuned')
tokenizer.save_pretrained('bert_finetuned')

print("Fine-tuned BERT model and tokenizer saved!")

In [None]:
# Let's check the model's final performance!
# Requires `trainer` from the training cell above. evaluate() is called with
# no arguments, so it presumably scores the eval_dataset the Trainer was built
# with and reports the compute_metrics values — TODO confirm against the
# printed keys.
results = trainer.evaluate()
print(results)