In [1]:
import pandas as pd

# Set the path to the dataset in the server
file_path = "/Disk1/vish/openvla/archive-2/twcs/twcs.csv"

# Load the CSV file
df = pd.read_csv(file_path)

# Show first few rows to confirm it's loaded
print(df.shape)
df.head()

(2811774, 7)


Unnamed: 0,tweet_id,author_id,inbound,created_at,text,response_tweet_id,in_response_to_tweet_id
0,1,sprintcare,False,Tue Oct 31 22:10:47 +0000 2017,@115712 I understand. I would like to assist y...,2.0,3.0
1,2,115712,True,Tue Oct 31 22:11:45 +0000 2017,@sprintcare and how do you propose we do that,,1.0
2,3,115712,True,Tue Oct 31 22:08:27 +0000 2017,@sprintcare I have sent several private messag...,1.0,4.0
3,4,sprintcare,False,Tue Oct 31 21:54:49 +0000 2017,@115712 Please send us a Private Message so th...,3.0,5.0
4,5,115712,True,Tue Oct 31 21:49:35 +0000 2017,@sprintcare I did.,4.0,6.0


In [2]:
import numpy as np

# Define expanded category assignment function
def expanded_category(text):
    text_lower = text.lower()

    if any(kw in text_lower for kw in ["charge", "bill", "payment", "refund", "invoice", "overcharged", "fee"]):
        return "billing"
    elif any(kw in text_lower for kw in ["error", "issue", "not working", "bug", "crash", "glitch", "slow", "update"]):
        return "technical"
    elif any(kw in text_lower for kw in ["login", "account", "password", "access", "locked", "reset", "credentials"]):
        return "account"
    elif any(kw in text_lower for kw in ["delivery", "arrive", "late", "shipping", "delayed", "tracking", "package"]):
        return "delivery"
    elif any(kw in text_lower for kw in ["product", "item", "order", "quality", "broken", "damaged", "warranty"]):
        return "product"
    elif any(kw in text_lower for kw in ["help", "support", "assistance", "customer service", "agent", "representative"]):
        return "support"
    else:
        return "general"

# Apply to inbound tweets only
df["category"] = df.apply(lambda row: expanded_category(row["text"]) if row["inbound"] else np.nan, axis=1)

# Display a few newly categorized inbound tweets
inbound_expanded = df[df["inbound"]][["text", "category"]].head(94)
inbound_expanded


Unnamed: 0,text,category
1,@sprintcare and how do you propose we do that,general
2,@sprintcare I have sent several private messag...,general
4,@sprintcare I did.,general
6,@sprintcare is the worst customer service,support
8,@sprintcare You gonna magically change your co...,general
...,...,...
162,Any help here @AdobeCare? https://t.co/x50e57UG4E,support
164,@AdobeCare nope still down.,general
166,@AdobeCare yes ive done that multiple times. i...,general
168,@AdobeCare ok,general


In [3]:
from sklearn.model_selection import train_test_split

# Filter dataset: inbound tweets only with valid category
filtered_df = df[(df["inbound"]) & (df["category"].notnull())]

# Features and target
X = filtered_df["text"]
y = filtered_df["category"]

# Split into train and test sets (80/20 split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Show split sizes
len(X_train), len(X_test)


(1230274, 307569)

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix

# Create a pipeline: TF-IDF vectorizer + Logistic Regression
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english', max_features=1000)),
    ('clf', LogisticRegression(max_iter=200, random_state=42))
])

# Train the model
pipeline.fit(X_train, y_train)

# Predict on test set
y_pred = pipeline.predict(X_test)

# Evaluation results
report = classification_report(y_test, y_pred, output_dict=False)
conf_matrix = confusion_matrix(y_test, y_pred)

(report, conf_matrix)


('              precision    recall  f1-score   support\n\n     account       0.96      0.91      0.93     11263\n     billing       0.99      0.73      0.84     19279\n    delivery       0.97      0.91      0.94     17358\n     general       0.93      1.00      0.96    138406\n     product       0.95      0.90      0.92     12432\n     support       0.95      0.96      0.95     82242\n   technical       0.97      0.87      0.92     26589\n\n    accuracy                           0.95    307569\n   macro avg       0.96      0.90      0.92    307569\nweighted avg       0.95      0.95      0.94    307569\n',
 array([[ 10234,     11,     57,    460,     71,    395,     35],
        [   275,  14138,    229,   2844,    234,   1158,    401],
        [     0,     12,  15804,    900,    171,    452,     19],
        [     0,     22,      0, 137716,      0,    528,    140],
        [     0,      9,      0,    676,  11193,    523,     31],
        [     0,     49,      1,   3395,      0,  78617,

In [5]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

     account       0.96      0.91      0.93     11263
     billing       0.99      0.73      0.84     19279
    delivery       0.97      0.91      0.94     17358
     general       0.93      1.00      0.96    138406
     product       0.95      0.90      0.92     12432
     support       0.95      0.96      0.95     82242
   technical       0.97      0.87      0.92     26589

    accuracy                           0.95    307569
   macro avg       0.96      0.90      0.92    307569
weighted avg       0.95      0.95      0.94    307569



In [13]:
# Import libraries
from transformers import DistilBertTokenizerFast, DistilBertForSequenceClassification, Trainer, TrainingArguments, EarlyStoppingCallback
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
from torch.utils.data import Dataset
import torch
import numpy as np
import pandas as pd
import os

# File path to CSV dataset
file_path = "/Disk1/vish/openvla/archive-2/twcs/twcs.csv"

# Loading dataset with limited rows for faster training
df = pd.read_csv(file_path, nrows=100000)

# Define a function to assign a category based on text
def assign_category(text):
    text = str(text).lower()
    if any(kw in text for kw in ["charge", "bill", "payment", "refund", "overcharged", "fee"]):
        return "billing"
    elif any(kw in text for kw in ["error", "issue", "not working", "bug", "crash", "glitch", "slow", "update"]):
        return "technical"
    elif any(kw in text for kw in ["login", "account", "password", "access", "locked", "reset", "credentials"]):
        return "account"
    elif any(kw in text for kw in ["delivery", "arrive", "late", "shipping", "delayed", "tracking", "package"]):
        return "delivery"
    elif any(kw in text for kw in ["product", "item", "order", "quality", "broken", "damaged", "warranty"]):
        return "product"
    elif any(kw in text for kw in ["help", "support", "assistance", "customer service", "agent", "representative"]):
        return "support"
    else:
        return "general"

# Filter for inbound messages and assign categories
df = df[df["inbound"] == True].copy()
df["category"] = df["text"].apply(assign_category)
df = df[["text", "category"]].dropna().reset_index(drop=True)

# Encode label classes to numerical IDs
le = LabelEncoder()
df["label"] = le.fit_transform(df["category"])

label_names = le.classes_

# Split dataset into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(
    df["text"], df["label"], 
    test_size=0.2, 
    random_state=42, 
    stratify=df["label"]
)

# Reset index after split
X_train = X_train.reset_index(drop=True)
X_val = X_val.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
y_val = y_val.reset_index(drop=True)

# Tokenize the text inputs
model_name = "distilbert-base-uncased"

tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

train_encodings = tokenizer(
    list(X_train), truncation=True, padding='max_length', max_length=128
)
val_encodings = tokenizer(
    list(X_val), truncation=True, padding='max_length', max_length=128
)

# Prepare custom dataset class for PyTorch
class TweetDataset(Dataset):
    """Custom dataset for DistilBERT fine-tuning."""
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item
    
    def __len__(self):
        return len(self.labels)


# Create training and validation dataset instances
train_dataset = TweetDataset(train_encodings, y_train)
val_dataset = TweetDataset(val_encodings, y_val)

# Loading DistilBERT with custom number of output classes
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", 
    num_labels=len(label_names)
)

# Define training arguments with early stopping
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=10,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_steps=20,
    report_to='none',
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
    fp16=True
)

# Early stopping callback
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=3,
    early_stopping_threshold=0.01
)

# Initialize trainer
trainer = Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    args=training_args,
    callbacks=[early_stopping],
)

# Train the model
trainer.train()

# Evaluate the trained model
preds = trainer.predict(val_dataset)
y_pred = preds.predictions.argmax(-1)

print(classification_report(y_val, y_pred, target_names=label_names))

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.1424,0.140418
2,0.0557,0.11771
3,0.0562,0.105712
4,0.064,0.090887
5,0.0309,0.120557
6,0.0775,0.069395
7,0.0231,0.066477
8,0.0,0.06718
9,0.031,0.071311




              precision    recall  f1-score   support

     account       1.00      1.00      1.00       406
     billing       1.00      0.99      1.00       710
    delivery       1.00      0.99      0.99       651
     general       1.00      1.00      1.00      4779
     product       0.99      1.00      0.99       446
     support       1.00      1.00      1.00      3106
   technical       1.00      0.99      0.99       892

    accuracy                           1.00     10990
   macro avg       1.00      0.99      0.99     10990
weighted avg       1.00      1.00      1.00     10990

