In [1]:
! pip install torch



In [1]:
import os
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import torch

In [2]:
# Place the data and training index files in the same directory as the executing script
data_folder_path = 'PA1-data'  # Specify the path to the folder containing data
txt_file_path = 'training_new.txt'  # Specify the path to the file containing training data indices

In [3]:
# Read 1095 text files
documents = []
labels = []

for filename in os.listdir(data_folder_path):
    with open(os.path.join(data_folder_path, filename), 'r', encoding='utf-8') as file:
        documents.append(file.read())

        # Extract the class ID from the file name without considering the extension
        class_id = int(filename.split('_')[0].replace('.txt', ''))
        labels.append(class_id)

In [4]:
# Create a DataFrame for easier handling
df = pd.DataFrame({'text': documents, 'label': labels})

# Read class and training data information
training_data_dict = {}
training_data_index = []
with open(txt_file_path, 'r') as file:
    for line in file:
        line = line.strip()
        elements = line.split(' ')

        class_id = int(elements[0])
        training_data = [int(idx) for idx in elements[1:]]
        training_data_dict[class_id] = training_data
        training_data_index.append(training_data)

In [5]:
# Convert the list of lists to a flat list
training_data_index_flat = [item for sublist in training_data_index for item in sublist]

# Create the training dataset
training_data = []

for class_id, data_list in training_data_dict.items():
    for idx in data_list:
        training_data.append({'text': df.iloc[idx]['text'], 'label': class_id})

# Convert training data to a DataFrame
train_df = pd.DataFrame(training_data)

In [34]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('google/bert_uncased_L-2_H-128_A-2')
# model = BertModel.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased', num_hidden_layers=6)
model = BertModel.from_pretrained('google/bert_uncased_L-2_H-128_A-2')



In [24]:
# Function to extract [CLS] embeddings for a list of texts
def extract_cls_embeddings(texts):
    inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embeddings = outputs.last_hidden_state[:, 0, :].numpy()
    return cls_embeddings


# Extract [CLS] embeddings for training data
X_train_texts = train_df['text'].tolist()
y_train_labels = train_df['label'].tolist()
X_train_embeddings = extract_cls_embeddings(X_train_texts)

In [25]:
# Train SVM model
svm_model = SVC(kernel='linear', random_state=3)
svm_model.fit(X_train_embeddings.reshape(len(X_train_texts), -1), y_train_labels)

In [26]:
# Use the remaining documents as the testing dataset, excluding those in training_data_index
test_df = df[~df['label'].isin(training_data_index_flat)]
X_test_texts = test_df['text'].tolist()

In [27]:
# Set batch size for processing test data
batch_size = 32
num_batches = len(X_test_texts) // batch_size

In [28]:
# Store predicted labels for each batch
predicted_labels_test_svm = []

# Process test data in batches
for i in range(num_batches):
    start_idx = i * batch_size
    end_idx = (i + 1) * batch_size
    X_test_embeddings_batch = extract_cls_embeddings(X_test_texts[start_idx:end_idx])

    # Predict labels for each batch
    predicted_labels_batch = svm_model.predict(X_test_embeddings_batch.reshape(len(X_test_embeddings_batch), -1))
    predicted_labels_test_svm.extend(predicted_labels_batch)

# Handle the last batch (which may have fewer elements than batch_size)
if num_batches * batch_size < len(X_test_texts):
    start_idx = num_batches * batch_size
    X_test_embeddings_last_batch = extract_cls_embeddings(X_test_texts[start_idx:])
    predicted_labels_last_batch = svm_model.predict(X_test_embeddings_last_batch.reshape(len(X_test_embeddings_last_batch), -1))
    predicted_labels_test_svm.extend(predicted_labels_last_batch)

# Add predicted labels to the test data DataFrame
test_df['predicted_label'] = predicted_labels_test_svm


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['predicted_label'] = predicted_labels_test_svm


In [31]:
# Create a DataFrame for the predicted results
# result_df_svm = pd.DataFrame({'Id': test_df.index, 'Value': predicted_labels_test_svm})
result_df_svm = pd.DataFrame({'Id': test_df['label'], 'Value': predicted_labels_test_svm})
# Sort the DataFrame by the 'Id' column
result_df_svm = result_df_svm.sort_values(by='Id')

# Save the DataFrame to a CSV file
result_df_svm.to_csv('predicted_results_SVM_04.csv', index=False)

In [32]:
result_df_svm

Unnamed: 0,Id,Value
111,17,4
1073,18,2
842,20,5
869,21,8
939,22,5
...,...,...
60,1091,3
120,1092,1
74,1093,7
248,1094,11
