In [2]:
import numpy as np
import pandas as pd
import torch
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments


In [3]:
# Read the comment dataset, then shift the raw sentiment labels {-1, 0, 1}
# onto the contiguous class ids {0, 1, 2} used for model training.
# NOTE(review): the path below is machine-specific; adjust it for your environment.
df_cleaned = pd.read_csv(
    "E:/Tanveer Data/YouTubeInsightAI/backend/Data/data.csv"
)
label_mapping = {-1: 0, 0: 1, 1: 2}
# Series.map with a dict turns any value missing from the dict into NaN.
df_cleaned["label"] = df_cleaned["label"].map(label_mapping)

# Sanity check: rows per class after the remap
print("Label distribution after remapping:\n", df_cleaned["label"].value_counts())

Label distribution after remapping:
 label
2    101336
1     72460
0     43755
Name: count, dtype: int64


In [9]:
# Model input is the comment text column; the target is the remapped label column.
X, y = df_cleaned["comment"], df_cleaned["label"]


In [11]:
import pandas as pd
import torch
from transformers import BertTokenizer, BertModel
from tqdm import tqdm

# Reload the dataset and shift labels from {-1, 0, 1} to class ids {0, 1, 2}
label_mapping = {-1: 0, 0: 1, 1: 2}
df = pd.read_csv("E:/Tanveer Data/YouTubeInsightAI/backend/Data/data.csv")
df["label"] = df["label"].map(label_mapping)

# Plain Python lists: the comment texts and their class ids
texts, labels = df["comment"].tolist(), df["label"].tolist()

# Run on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained tokenizer and encoder for the 'bert-base-uncased' checkpoint;
# the encoder is switched to evaluation mode and moved onto the chosen device.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = BertModel.from_pretrained("bert-base-uncased")
bert_model.eval().to(device)

# Function to get CLS embeddings
def get_bert_cls_embeddings(text_list, batch_size=32):
    """
    Encode texts with BERT and return the first-token ([CLS]) vector of each.

    Uses the notebook-level ``tokenizer``, ``bert_model`` and ``device``.
    Texts are tokenized batch by batch (truncated to 128 tokens, padded within
    the batch), passed through the model with gradient tracking disabled, and
    the last hidden state at sequence position 0 is collected on the CPU.

    Parameters:
    - text_list (list of str): Texts to encode
    - batch_size (int): Number of texts per forward pass; must be >= 1

    Returns:
    - numpy.ndarray: 2-D array with one row per input text; for an empty
      ``text_list`` the shape is (0, bert_model.config.hidden_size)

    Raises:
    - ValueError: If ``batch_size`` is smaller than 1
    """
    if batch_size < 1:
        # Fail early with a clear message instead of a cryptic range()/concatenate error.
        raise ValueError(f"batch_size must be a positive integer, got {batch_size!r}")

    if len(text_list) == 0:
        # np.concatenate refuses an empty sequence of arrays, so return an
        # explicitly empty result with the model's embedding width.
        return np.empty((0, bert_model.config.hidden_size), dtype=np.float32)

    cls_embeddings = []

    for start in tqdm(range(0, len(text_list), batch_size), desc="Encoding BERT"):
        batch = text_list[start:start + batch_size]
        encodings = tokenizer(batch, return_tensors='pt', truncation=True, padding=True, max_length=128)
        input_ids = encodings['input_ids'].to(device)
        attention_mask = encodings['attention_mask'].to(device)

        # Inference only: no autograd graph is needed.
        with torch.no_grad():
            outputs = bert_model(input_ids=input_ids, attention_mask=attention_mask)

        # Position 0 of every sequence is the [CLS] token; move it to host memory.
        cls_embeddings.append(outputs.last_hidden_state[:, 0, :].cpu().numpy())

    return np.concatenate(cls_embeddings, axis=0)

# Embed every comment and keep the class ids alongside as a NumPy array
X_embeddings = get_bert_cls_embeddings(texts)
y = np.asarray(labels)

# Expect (number of comments, embedding width)
print("Shape of BERT embeddings:", X_embeddings.shape)


Encoding BERT:   2%|▏         | 168/6799 [10:50<7:08:00,  3.87s/it]


KeyboardInterrupt: 

In [13]:
import mlflow
import pandas as pd
import torch
from transformers import BertTokenizer
import numpy as np

# MLflow run URI of the logged 'bert_model' artifact, loaded through the
# generic pyfunc interface
logged_model = "runs:/c4185e984d1144a7a6a47833428cede8/bert_model"
loaded_model = mlflow.pyfunc.load_model(model_uri=logged_model)

# Tokenizer for the 'bert-base-uncased' checkpoint
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

def classify_comments(comments):
    """
    Count how many comments the loaded model assigns to each sentiment.

    Uses the notebook-level ``tokenizer`` and ``loaded_model``.

    Parameters:
    - comments (list of str): List of text comments to classify

    Returns:
    - dict: Counts keyed by "negative", "neutral" and "positive"; all zero
      when ``comments`` is empty
    """
    # Model class index -> sentiment name. Indices 0/1/2 stand for the
    # original labels -1/0/1 (negative/neutral/positive).
    index_to_sentiment = {0: "negative", 1: "neutral", 2: "positive"}
    sentiment_counts = {name: 0 for name in index_to_sentiment.values()}

    # Nothing to classify: skip the tokenizer and the model entirely.
    if len(comments) == 0:
        return sentiment_counts

    # Step 1: Tokenize the comments (truncated to 128 tokens, padded within the batch)
    encodings = tokenizer(comments, padding=True, truncation=True, max_length=128, return_tensors='pt')

    # Step 2: Score the batch and take the highest-scoring class per comment
    # NOTE(review): only input_ids are sent to the model; the attention mask is
    # dropped -- confirm this matches the input the logged pyfunc model expects.
    predictions = loaded_model.predict(pd.DataFrame(encodings['input_ids']))
    preds = torch.argmax(torch.tensor(predictions), dim=1)

    # Step 3: Tally the predicted classes by sentiment name
    for class_index in preds.tolist():
        sentiment_counts[index_to_sentiment[class_index]] += 1

    return sentiment_counts


# Example usage: three sample comments run through classify_comments
comments_list = [
    "I love this product! It's amazing.",
    "This is the worst experience I've had.",
    "It was okay, neither good nor bad.",
]

# Per-sentiment counts for the sample comments
result = classify_comments(comments_list)
print(result)


ModuleNotFoundError: No module named 'packaging.requirements'