In [None]:
!pip install pandas scikit-learn torch transformers

Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
!pip install xgboost scikit-learn pandas




In [None]:
import pandas as pd
import ast
import numpy as np  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import xgboost as xgb

# Load the preprocessed training data; the relative path is resolved against
# the notebook's current working directory.
df = pd.read_csv("processed_train.csv")

def safe_eval(x):
    """Parse a stringified Python list literal into a real list.

    Args:
        x: Cell value to parse, normally a string such as "[1, 2]". Non-string
            values (e.g. NaN or None from an empty CSV cell) are tolerated.

    Returns:
        list: The parsed items when `x` is a list/tuple literal; an empty list
        when `x` cannot be parsed or does not describe a sequence.
    """
    try:
        parsed = ast.literal_eval(x)
    # Only swallow genuine parse failures; a bare `except:` would also hide
    # KeyboardInterrupt / SystemExit and make the cell impossible to interrupt.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # Callers concatenate the results with `+`, so always hand back a list.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Decode both stringified list columns into real Python lists.
for list_col in ('expanded emotion cause evidence', 'expanded emotion cause span'):
    df[list_col] = df[list_col].apply(safe_eval)

# Every (conv_id, clause id) pair that some row cites as cause evidence or as
# part of a cause span.
cause_clauses = {
    (conv_id, cid)
    for conv_id, evidence, span in zip(
        df['conv_id'],
        df['expanded emotion cause evidence'],
        df['expanded emotion cause span'],
    )
    for cid in evidence + span
}

def get_annotation(row):
    """Derive the rule-based label for one clause row.

    Args:
        row: Mapping-like row exposing 'emotion', 'conv_id' and 'clause_number'.

    Returns:
        str: 'both', 'emotion', 'cause' or 'neither', depending on whether the
        clause carries a non-neutral emotion and/or its
        (conv_id, clause_number) pair appears in the module-level
        `cause_clauses` set.
    """
    emotion = row['emotion']
    # pd.read_csv turns empty cells into NaN, which compares unequal to both ''
    # and 'neutral'; require an actual string so missing values do not count
    # as an emotion.
    is_emotion = isinstance(emotion, str) and emotion not in ('', 'neutral')
    is_cause = (row['conv_id'], row['clause_number']) in cause_clauses
    if is_emotion and is_cause:
        return 'both'
    elif is_emotion:
        return 'emotion'
    elif is_cause:
        return 'cause'
    else:
        return 'neither'

# --- Gold labels ------------------------------------------------------------
# Rule-derived label per clause: 'both' / 'emotion' / 'cause' / 'neither'.
df['annotation'] = df.apply(get_annotation, axis=1)

# --- Metadata features ------------------------------------------------------
df['clause_len'] = df['clause'].apply(lambda x: len(x.split()))
# Empty CSV cells load as NaN, so test for a real string instead of comparing
# against '' (NaN != '' is True and would be counted as an emotion).
df['is_emotion_present'] = df['emotion'].apply(
    lambda x: int(isinstance(x, str) and x not in ('', 'neutral'))
)

le_speaker = LabelEncoder()
le_emotion = LabelEncoder()
df['speaker_enc'] = le_speaker.fit_transform(df['speaker'])
df['emotion_enc'] = le_emotion.fit_transform(df['emotion'])

meta_features = df[['clause_len', 'turn', 'speaker_enc', 'emotion_enc', 'is_emotion_present']]

le_target = LabelEncoder()
y = le_target.fit_transform(df['annotation'])

# --- Split first, then fit the transformers ---------------------------------
# Splitting row positions (rather than an already-built feature matrix) lets
# the TF-IDF vocabulary/IDF weights and the scaler statistics be learned from
# training rows only, so the held-out rows do not leak into the features.
row_positions = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, stratify=y, random_state=42
)

tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))
tfidf.fit(df['clause'].iloc[train_idx])
X_text = tfidf.transform(df['clause'])

scaler = StandardScaler()
scaler.fit(meta_features.iloc[train_idx])
X_meta = scaler.transform(meta_features)

# CSR so that the matrix can be row-indexed with the split positions.
X_combined = hstack([X_text, X_meta]).tocsr()
X_train, X_test = X_combined[train_idx], X_combined[test_idx]

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# --- Model -------------------------------------------------------------------
# `scale_pos_weight` was dropped: XGBoost reports it as unused for the
# multi-class objective, so it never affected training.
params = {
    'objective': 'multi:softprob',
    'num_class': len(le_target.classes_),
    'eval_metric': 'mlogloss',
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'seed': 42
}

# NOTE(review): the held-out fold also drives early stopping, so the report
# below is mildly optimistic; carve out a separate validation fold if an
# unbiased estimate is needed.
evals = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=10
)

# Predict with the best early-stopped iteration explicitly, so the result does
# not depend on the installed XGBoost version's default.
best_range = (0, bst.best_iteration + 1)

y_pred_prob = bst.predict(dtest, iteration_range=best_range)
y_pred = y_pred_prob.argmax(axis=1)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=le_target.classes_))

# --- Export ------------------------------------------------------------------
# Keep the rule-derived labels next to the predictions instead of overwriting
# them; 'annotation' still carries the model prediction, as before.
df['annotation_gold'] = df['annotation']
df['annotation'] = le_target.inverse_transform(
    bst.predict(xgb.DMatrix(X_combined), iteration_range=best_range).argmax(axis=1)
)
df.to_csv("annotated_output3.csv", index=False)
print("✅ Saved: annotated_output3.csv")


Parameters: { "scale_pos_weight" } are not used.



[0]	train-mlogloss:1.35010	eval-mlogloss:1.35214
[10]	train-mlogloss:1.07435	eval-mlogloss:1.09673
[20]	train-mlogloss:0.90856	eval-mlogloss:0.94675
[30]	train-mlogloss:0.80498	eval-mlogloss:0.85581
[40]	train-mlogloss:0.73621	eval-mlogloss:0.79773
[50]	train-mlogloss:0.68801	eval-mlogloss:0.75751
[60]	train-mlogloss:0.65347	eval-mlogloss:0.72939
[70]	train-mlogloss:0.62945	eval-mlogloss:0.71087
[80]	train-mlogloss:0.60908	eval-mlogloss:0.69581
[90]	train-mlogloss:0.59361	eval-mlogloss:0.68474
[100]	train-mlogloss:0.58066	eval-mlogloss:0.67617
[110]	train-mlogloss:0.57033	eval-mlogloss:0.67016
[120]	train-mlogloss:0.56184	eval-mlogloss:0.66583
[130]	train-mlogloss:0.55414	eval-mlogloss:0.66247
[140]	train-mlogloss:0.54755	eval-mlogloss:0.65952
[150]	train-mlogloss:0.54154	eval-mlogloss:0.65745
[160]	train-mlogloss:0.53665	eval-mlogloss:0.65601
[170]	train-mlogloss:0.53204	eval-mlogloss:0.65503
[180]	train-mlogloss:0.52773	eval-mlogloss:0.65418
[190]	train-mlogloss:0.52379	eval-mlogloss

In [None]:
import pandas as pd
import ast
import numpy as np  
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from scipy.sparse import hstack
import xgboost as xgb

# Load the preprocessed test data; the relative path is resolved against the
# notebook's current working directory. This rebinds `df`.
df = pd.read_csv("processed_test.csv")

def safe_eval(x):
    """Parse a stringified Python list literal into a real list.

    Args:
        x: Cell value to parse, normally a string such as "[1, 2]". Non-string
            values (e.g. NaN or None from an empty CSV cell) are tolerated.

    Returns:
        list: The parsed items when `x` is a list/tuple literal; an empty list
        when `x` cannot be parsed or does not describe a sequence.
    """
    try:
        parsed = ast.literal_eval(x)
    # Only swallow genuine parse failures; a bare `except:` would also hide
    # KeyboardInterrupt / SystemExit and make the cell impossible to interrupt.
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        return []
    # Callers concatenate the results with `+`, so always hand back a list.
    if isinstance(parsed, (list, tuple)):
        return list(parsed)
    return []

# Decode both stringified list columns into real Python lists.
for list_col in ('expanded emotion cause evidence', 'expanded emotion cause span'):
    df[list_col] = df[list_col].apply(safe_eval)

# Every (conv_id, clause id) pair that some row cites as cause evidence or as
# part of a cause span.
cause_clauses = {
    (conv_id, cid)
    for conv_id, evidence, span in zip(
        df['conv_id'],
        df['expanded emotion cause evidence'],
        df['expanded emotion cause span'],
    )
    for cid in evidence + span
}

def get_annotation(row):
    """Derive the rule-based label for one clause row.

    Args:
        row: Mapping-like row exposing 'emotion', 'conv_id' and 'clause_number'.

    Returns:
        str: 'both', 'emotion', 'cause' or 'neither', depending on whether the
        clause carries a non-neutral emotion and/or its
        (conv_id, clause_number) pair appears in the module-level
        `cause_clauses` set.
    """
    emotion = row['emotion']
    # pd.read_csv turns empty cells into NaN, which compares unequal to both ''
    # and 'neutral'; require an actual string so missing values do not count
    # as an emotion.
    is_emotion = isinstance(emotion, str) and emotion not in ('', 'neutral')
    is_cause = (row['conv_id'], row['clause_number']) in cause_clauses
    if is_emotion and is_cause:
        return 'both'
    elif is_emotion:
        return 'emotion'
    elif is_cause:
        return 'cause'
    else:
        return 'neither'

# NOTE(review): this cell fits a fresh vectorizer, scaler, encoders and model
# on processed_test.csv instead of applying the ones fitted on the training
# file — confirm that retraining on the test file is intended.

# --- Gold labels ------------------------------------------------------------
# Rule-derived label per clause: 'both' / 'emotion' / 'cause' / 'neither'.
df['annotation'] = df.apply(get_annotation, axis=1)

# --- Metadata features ------------------------------------------------------
df['clause_len'] = df['clause'].apply(lambda x: len(x.split()))
# Empty CSV cells load as NaN, so test for a real string instead of comparing
# against '' (NaN != '' is True and would be counted as an emotion).
df['is_emotion_present'] = df['emotion'].apply(
    lambda x: int(isinstance(x, str) and x not in ('', 'neutral'))
)

le_speaker = LabelEncoder()
le_emotion = LabelEncoder()
df['speaker_enc'] = le_speaker.fit_transform(df['speaker'])
df['emotion_enc'] = le_emotion.fit_transform(df['emotion'])

meta_features = df[['clause_len', 'turn', 'speaker_enc', 'emotion_enc', 'is_emotion_present']]

le_target = LabelEncoder()
y = le_target.fit_transform(df['annotation'])

# --- Split first, then fit the transformers ---------------------------------
# Splitting row positions (rather than an already-built feature matrix) lets
# the TF-IDF vocabulary/IDF weights and the scaler statistics be learned from
# training rows only, so the held-out rows do not leak into the features.
row_positions = np.arange(len(df))
train_idx, test_idx, y_train, y_test = train_test_split(
    row_positions, y, test_size=0.2, stratify=y, random_state=42
)

tfidf = TfidfVectorizer(max_features=1500, ngram_range=(1, 2))
tfidf.fit(df['clause'].iloc[train_idx])
X_text = tfidf.transform(df['clause'])

scaler = StandardScaler()
scaler.fit(meta_features.iloc[train_idx])
X_meta = scaler.transform(meta_features)

# CSR so that the matrix can be row-indexed with the split positions.
X_combined = hstack([X_text, X_meta]).tocsr()
X_train, X_test = X_combined[train_idx], X_combined[test_idx]

dtrain = xgb.DMatrix(X_train, label=y_train)
dtest = xgb.DMatrix(X_test, label=y_test)

# --- Model -------------------------------------------------------------------
# `scale_pos_weight` was dropped: XGBoost reports it as unused for the
# multi-class objective, so it never affected training.
params = {
    'objective': 'multi:softprob',
    'num_class': len(le_target.classes_),
    'eval_metric': 'mlogloss',
    'max_depth': 8,
    'learning_rate': 0.05,
    'subsample': 0.85,
    'colsample_bytree': 0.85,
    'seed': 42
}

# NOTE(review): the held-out fold also drives early stopping, so the report
# below is mildly optimistic; carve out a separate validation fold if an
# unbiased estimate is needed.
evals = [(dtrain, 'train'), (dtest, 'eval')]
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=500,
    evals=evals,
    early_stopping_rounds=20,
    verbose_eval=10
)

# Predict with the best early-stopped iteration explicitly, so the result does
# not depend on the installed XGBoost version's default.
best_range = (0, bst.best_iteration + 1)

y_pred_prob = bst.predict(dtest, iteration_range=best_range)
y_pred = y_pred_prob.argmax(axis=1)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=le_target.classes_))

# --- Export ------------------------------------------------------------------
# Keep the rule-derived labels next to the predictions instead of overwriting
# them; 'annotation' still carries the model prediction, as before.
df['annotation_gold'] = df['annotation']
df['annotation'] = le_target.inverse_transform(
    bst.predict(xgb.DMatrix(X_combined), iteration_range=best_range).argmax(axis=1)
)
df.to_csv("annotated_output3_test.csv", index=False)
print("✅ Saved: annotated_output3_test.csv")


Parameters: { "scale_pos_weight" } are not used.



[0]	train-mlogloss:1.35058	eval-mlogloss:1.35436
[10]	train-mlogloss:1.06342	eval-mlogloss:1.10588
[20]	train-mlogloss:0.88930	eval-mlogloss:0.95643
[30]	train-mlogloss:0.77784	eval-mlogloss:0.86629
[40]	train-mlogloss:0.70171	eval-mlogloss:0.80702
[50]	train-mlogloss:0.64775	eval-mlogloss:0.76735
[60]	train-mlogloss:0.60904	eval-mlogloss:0.73951
[70]	train-mlogloss:0.58045	eval-mlogloss:0.72176
[80]	train-mlogloss:0.55538	eval-mlogloss:0.70678
[90]	train-mlogloss:0.53553	eval-mlogloss:0.69629
[100]	train-mlogloss:0.51920	eval-mlogloss:0.68734
[110]	train-mlogloss:0.50533	eval-mlogloss:0.68271
[120]	train-mlogloss:0.49331	eval-mlogloss:0.67972
[130]	train-mlogloss:0.48324	eval-mlogloss:0.67689
[140]	train-mlogloss:0.47306	eval-mlogloss:0.67412
[150]	train-mlogloss:0.46481	eval-mlogloss:0.67331
[160]	train-mlogloss:0.45723	eval-mlogloss:0.67263
[170]	train-mlogloss:0.45017	eval-mlogloss:0.67271
[180]	train-mlogloss:0.44335	eval-mlogloss:0.67324
[182]	train-mlogloss:0.44215	eval-mlogloss