In [1]:
import pandas as pd
import numpy as np
import random
import os
import re
import subprocess
import spacy
import matplotlib.pyplot as plt
from sklearn.metrics import auc, roc_curve, confusion_matrix, classification_report
from gensim.models import KeyedVectors
from sklearn.metrics import accuracy_score,f1_score,balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold,RandomizedSearchCV
from xgboost import XGBClassifier

In [2]:
data = pd.read_excel('/Users/rachael/Downloads/train_df_labelled.xlsx')
#the data verified by the third person will be used as the train set finally
file =  data[['content','stance']]
file['stance'].value_counts()
label_stance= file['stance'].value_counts()
label_proportions_stance= file['stance'].value_counts(normalize=True)
print("Counts of each label_stance:")
print(label_stance)
print("\nProportions of each label_stance:")
print(label_proportions_stance)

Counts of each label_stance:
stance
Aganist    186
Neither    140
Support     82
Name: count, dtype: int64

Proportions of each label_stance:
stance
Aganist    0.455882
Neither    0.343137
Support    0.200980
Name: proportion, dtype: float64


In [3]:
def remove_urls(text):
    # 定义URL的正则表达式
    url_pattern = r'https?://\S+|www\.\S+'
    # 使用re.sub()函数替换URL为空字符串
    no_url_text = re.sub(url_pattern, '', text)
    return no_url_text

def clean_text(text):
    # Remove anything that is not a letter or space
    clean_text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Optional: Convert text to lower case
    clean_text = clean_text.lower()
    return clean_text

In [4]:
file['content'] = file['content'].apply(remove_urls) 
file['content'] = file['content'].apply(clean_text) 
file['stance'] = file['stance'].replace({'Support':0,'Aganist': 1, 'Neither': 2})
y = file['stance']
X = file['content']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file['content'] = file['content'].apply(remove_urls)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file['content'] = file['content'].apply(clean_text)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  file['stance'] = file['stance'].replace({'Support':0,'Aganist': 1, 'Neither': 2})


In [5]:
nlp = spacy.load("nl_core_news_sm")
#tokenize
def tokenize(text):
    doc = nlp(text)
    return [token.text for token in doc]

#load word2vec embeddings
model_path = '/Users/rachael/Downloads/39/model.bin'
word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)
vector_size = word_vectors.vector_size #100
#convert sentences feature to the average of word vectors
def sentence_to_avg_vec(tokens, model, vector_size):
    vec = np.zeros(vector_size)
    count = 0
    for token in tokens:
        if token in model:
            vec += model[token]
            count += 1
    if count != 0:
        vec /= count
    return vec

In [6]:
#define parameter range 
param_dist = {
    'learning_rate': [item/100 for item in range(1, 31)],
    'max_depth': [item for item in range(3, 11)],
    'subsample': [item/10 for item in range(5, 11)],
    'colsample_bytree': [item/10 for item in range(5, 11)],
    'lambda': [item/10 for item in range(0, 11)],
    'alpha': [item/10 for item in range(0, 11)],
}

best_score = 0
best_params = {}
test_scores = []
best_params_list = []
f1_scores = []
balanced_accuracies =[]

In [7]:

# 分层K折交叉验证
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# 循环每个折叠
for train_index, test_index in outer_cv.split(X, y):
    trainX, testX = X.iloc[train_index], X.iloc[test_index]
    trainy, testy = y.iloc[train_index], y.iloc[test_index]

    # 确保传递字符串数据
    trainX = trainX.astype(str)
    testX = testX.astype(str)

    # 进行tokenize
    X_train_tokenized = trainX.apply(tokenize)
    X_test_tokenized = testX.apply(tokenize)

     #map word embedding to tokenized train and test data
    X_train_embeddings = np.array([sentence_to_avg_vec(tokens, word_vectors, vector_size) for tokens in X_train_tokenized])
    X_test_embeddings= np.array([sentence_to_avg_vec(tokens, word_vectors, vector_size) for tokens in X_test_tokenized])

    xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
    randomized_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_dist,
        n_iter=100,  # 采样次数
        scoring='f1_macro',
        n_jobs=-1,  # 使用所有可用的核心
        cv=inner_cv,
        random_state=42,
        verbose=1  # 设置详细程度
    )

    # 拟合随机搜索
    randomized_search.fit(X_train_embeddings, trainy)

    # 获取最佳模型
    best_model = randomized_search.best_estimator_
    best_params = randomized_search.best_params_
    best_score = randomized_search.best_score_

    # 在测试集上进行预测
    test_predictions = best_model.predict(X_test_embeddings)
    balanced_acc = f1_score(testy, test_predictions,average='macro')

    # 存储结果
    balanced_accuracies.append(balanced_acc)
    best_params_list.append(best_params)

    print(f"Fold Balanced Accuracy: {balanced_acc}")
    print(f"Best parameters: {best_params}")

# 汇总结果

mean_balanced_acc = np.mean(balanced_accuracies)
std_balanced_acc = np.std(balanced_accuracies)

print(f"Mean Balanced Accuracy: {mean_balanced_acc}")
print(f"Standard deviation of Balanced Accuracies: {std_balanced_acc}")
print("Best parameters for each fold:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i + 1}: {params}")

Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fold Balanced Accuracy: 0.3969572533654812
Best parameters: {'subsample': 0.6, 'max_depth': 9, 'learning_rate': 0.22, 'lambda': 0.8, 'colsample_bytree': 0.7, 'alpha': 0.3}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fold Balanced Accuracy: 0.5304420618822242
Best parameters: {'subsample': 0.6, 'max_depth': 5, 'learning_rate': 0.13, 'lambda': 0.1, 'colsample_bytree': 0.8, 'alpha': 0.2}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fold Balanced Accuracy: 0.4920696211788796
Best parameters: {'subsample': 0.5, 'max_depth': 9, 'learning_rate': 0.28, 'lambda': 0.9, 'colsample_bytree': 0.5, 'alpha': 0.5}
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fold Balanced Accuracy: 0.522610789291759
Best parameters: {'subsample': 0.7, 'max_depth': 9, 'learning_rate': 0.15, 'lambda': 0.5, 'colsample_bytree': 0.8, 'alpha': 0.9}
Fitting 3 folds for each of 100 candidates, totalling 300 fit

In [8]:
from collections import Counter
best_params_counter = Counter(tuple(sorted(params.items())) for params in best_params_list)
most_common_params = dict(best_params_counter.most_common(1)[0][0])

print("Most common best parameters:", most_common_params)

Most common best parameters: {'alpha': 0.3, 'colsample_bytree': 0.7, 'lambda': 0.8, 'learning_rate': 0.22, 'max_depth': 9, 'subsample': 0.6}


In [6]:
most_common_params = {'subsample': 0.6, 'max_depth': 5, 'learning_rate': 0.13, 'lambda': 0.1, 'colsample_bytree': 0.8, 'alpha': 0.2}


In [7]:
X_sentences = [item for item in list(X)]
X_tokenized = X.apply(tokenize)
X_embeddings = np.array([sentence_to_avg_vec(tokens, word_vectors, 100) for tokens in X_tokenized])
final_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42,**most_common_params)
final_model.fit(X_embeddings, y)

In [8]:
test = pd.read_excel('/Users/rachael/Downloads/test_df_labelled.xlsx')
#process test data 
test = test[['content','stance']]
test['content'] = test['content'].apply(remove_urls) 
test['content'] = test['content'].apply(clean_text) 
test['stance'] = test['stance'].replace({'Support':0,'Aganist': 1, 'Neither': 2})
test_tokenized = test['content'].apply(tokenize)
test_embedding = np.array([sentence_to_avg_vec(tokens,word_vectors,100) for tokens in test_tokenized])

#predict 
test_predictions = final_model.predict(test_embedding)
conf_matrix = confusion_matrix(test['stance'], test_predictions)
class_report = classification_report(test['stance'], test_predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.40      0.18      0.25        76
           1       0.54      0.67      0.60       191
           2       0.55      0.54      0.54       140

    accuracy                           0.53       407
   macro avg       0.50      0.46      0.46       407
weighted avg       0.52      0.53      0.52       407



In [10]:
# Stratified K-fold for maintaining label distribution, shuffle=True ensures different data splits in each iteration
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42)
randomized_search = RandomizedSearchCV(
        estimator=xgb,
        param_distributions=param_dist,
        n_iter=100,  # Number of parameter settings that are sampled
        scoring='f1_macro',
        n_jobs=-1,  # Use all available cores
        cv=inner_cv,
        random_state=42,
        verbose=1  # Set verbosity to get more information during the search
    )

# Loop over each fold
for train_index, test_index in outer_cv.split(X, y):
    trainX, testX = X.iloc[train_index], X.iloc[test_index]
    trainy, testy = y.iloc[train_index], y.iloc[test_index]

    # 将 y_train 和 y_test 转换为 DataFrame
    y_train_df = pd.DataFrame(trainy).reset_index(drop=True)
    y_test_df = pd.DataFrame(testy).reset_index(drop=True)

    # 拼接 X_train 和 y_train 以及 X_test 和 y_test
    train_df = pd.concat([trainX.reset_index(drop=True), y_train_df], axis=1)
    test_df = pd.concat([testX.reset_index(drop=True), y_test_df], axis=1)

    # 标准化处理文本
    train_df['content'] = train_df['content'].apply(lambda x: ' '.join(x.split()))
    sentiment_0 = train_df[train_df['stance'] == 0]
    sentiment_1 = train_df[train_df['stance'] == 1]
    sentiment_2 = train_df[train_df['stance'] == 2]

    # 数据增强
    output_file_path0 = '/Users/rachael/Desktop/data/sentiment_0.txt'
    with open(output_file_path0, 'w', encoding='utf-8') as f:
        for _, row in sentiment_0.iterrows():
            f.write(f"{row['stance']}\t{row['content']}\n")

    subprocess.run([
        'python', '/Users/rachael/Desktop/data/augment.py',
        '--input', output_file_path0,
        '--output', '/Users/rachael/Desktop/data/augmented_train0.txt',
        '--num_aug', '6',
        '--alpha_sr', '0.1',
        '--alpha_rd', '0.1',
        '--alpha_ri', '0.1',
        '--alpha_rs', '0.1'
    ])

    output_file_path1 = '/Users/rachael/Desktop/data/sentiment_1.txt'
    with open(output_file_path1, 'w', encoding='utf-8') as f:
        for _, row in sentiment_1.iterrows():
            f.write(f"{row['stance']}\t{row['content']}\n")

    subprocess.run([
        'python', '/Users/rachael/Desktop/data/augment.py',
        '--input', output_file_path1,
        '--output', '/Users/rachael/Desktop/data/augmented_train1.txt',
        '--num_aug', '2',
        '--alpha_sr', '0.1',
        '--alpha_rd', '0.1',
        '--alpha_ri', '0.1',
        '--alpha_rs', '0.1'
    ])

    output_file_path2 = '/Users/rachael/Desktop/data/sentiment_2.txt'
    with open(output_file_path2, 'w', encoding='utf-8') as f:
        for _, row in sentiment_2.iterrows():
            f.write(f"{row['stance']}\t{row['content']}\n")

    subprocess.run([
        'python', '/Users/rachael/Desktop/data/augment.py',
        '--input', output_file_path2,
        '--output', '/Users/rachael/Desktop/data/augmented_train2.txt',
        '--num_aug', '3',
        '--alpha_sr', '0.1',
        '--alpha_rd', '0.1',
        '--alpha_ri', '0.1',
        '--alpha_rs', '0.1'
    ])

    # Load augmented data
    file4 = pd.read_csv('/Users/rachael/Desktop/data/augmented_train0.txt', delimiter='\t', header=None, names=['stance', 'content'])
    file5 = pd.read_csv('/Users/rachael/Desktop/data/augmented_train1.txt', delimiter='\t', header=None, names=['stance', 'content'])
    file6 = pd.read_csv('/Users/rachael/Desktop/data/augmented_train2.txt', delimiter='\t', header=None, names=['stance', 'content'])
    augmented_train = pd.concat([file4, file5, file6])

    #process train data
    augmented_train['content'] = augmented_train['content'].astype(str)
    y_train = augmented_train['stance']
    X_train = augmented_train['content']
    X_train_tokenized = X_train.apply(tokenize)
    X_train_embeddings = np.array([sentence_to_avg_vec(tokens, word_vectors, vector_size) for tokens in X_train_tokenized])
    #X_train_embeddings = scaler.fit_transform(X_train_embeddings1)

    #prepare test data
    test_df['content'] = test_df['content'].astype(str)
    y_test = test_df['stance']
    X_test = test_df['content']
    X_test_tokenized = X_test.apply(tokenize)
    X_test_embeddings = np.array([sentence_to_avg_vec(tokens, word_vectors, vector_size) for tokens in X_test_tokenized])

    # Fit Randomized Search
    randomized_search.fit(X_train_embeddings, y_train)


    # Get the best model from Randomized Search
    best_model = randomized_search.best_estimator_
    best_params = randomized_search.best_params_
    best_score = randomized_search.best_score_

    # Evaluate on the test set
    test_predictions = best_model.predict(X_test_embeddings)
    test_f1_score = f1_score(y_test, test_predictions,average = 'macro')

    f1_scores.append(test_f1_score)
    best_params_list.append(best_params)

    print(f"Fold F1-score: {test_f1_score}")
    print(f"Best parameters: {best_params}")

# Summarize the results
mean_f1_score = np.mean(f1_scores)
std_f1_score = np.std(f1_scores)

print(f"Mean F1-score: {mean_f1_score}")
print(f"Standard deviation of F1-scores: {std_f1_score}")
print("Best parameters for each fold:")
for i, params in enumerate(best_params_list):
    print(f"Fold {i + 1}: {params}")

generated augmented sentences with eda for /Users/rachael/Desktop/data/sentiment_0.txt to /Users/rachael/Desktop/data/augmented_train0.txt with num_aug=6
generated augmented sentences with eda for /Users/rachael/Desktop/data/sentiment_1.txt to /Users/rachael/Desktop/data/augmented_train1.txt with num_aug=2
generated augmented sentences with eda for /Users/rachael/Desktop/data/sentiment_2.txt to /Users/rachael/Desktop/data/augmented_train2.txt with num_aug=3
Fitting 3 folds for each of 100 candidates, totalling 300 fits
Fold F1-score: 0.45846560846560847
Best parameters: {'subsample': 0.5, 'max_depth': 9, 'learning_rate': 0.1, 'lambda': 0.3, 'colsample_bytree': 0.9, 'alpha': 0.0}
generated augmented sentences with eda for /Users/rachael/Desktop/data/sentiment_0.txt to /Users/rachael/Desktop/data/augmented_train0.txt with num_aug=6
generated augmented sentences with eda for /Users/rachael/Desktop/data/sentiment_1.txt to /Users/rachael/Desktop/data/augmented_train1.txt with num_aug=2
gene

In [8]:
from collections import Counter
best_params_counter = Counter(tuple(sorted(params.items())) for params in best_params_list)
most_common_params = dict(best_params_counter.most_common(1)[0][0])

print("Most common best parameters:", most_common_params)

NameError: name 'best_params_list' is not defined

In [9]:
most_common_params = {'subsample': 0.6, 'max_depth': 8, 'learning_rate': 0.25, 'lambda': 0.4, 'colsample_bytree': 0.8, 'alpha': 0.1}


In [10]:
X_sentences = [item for item in list(X)]
X_tokenized = X.apply(tokenize)
X_embeddings = np.array([sentence_to_avg_vec(tokens, word_vectors, 100) for tokens in X_tokenized])
final_model = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss', random_state=42,**most_common_params)
final_model.fit(X_embeddings, y)

In [11]:
test = pd.read_excel('/Users/rachael/Downloads/test_df_labelled.xlsx')
#process test data 
test = test[['content','stance']]
test['content'] = test['content'].apply(remove_urls) 
test['content'] = test['content'].apply(clean_text) 
test['stance'] = test['stance'].replace({'Support':0,'Aganist': 1, 'Neither': 2})
test_tokenized = test['content'].apply(tokenize)
test_embedding = np.array([sentence_to_avg_vec(tokens,word_vectors,100) for tokens in test_tokenized])

#predict 
test_predictions = final_model.predict(test_embedding)
conf_matrix = confusion_matrix(test['stance'], test_predictions)
class_report = classification_report(test['stance'], test_predictions)
print(class_report)

              precision    recall  f1-score   support

           0       0.33      0.20      0.25        76
           1       0.54      0.62      0.58       191
           2       0.52      0.53      0.52       140

    accuracy                           0.51       407
   macro avg       0.47      0.45      0.45       407
weighted avg       0.50      0.51      0.50       407

