# Loading in the Dataframe

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import pipeline
import re
import nltk
from sklearn.model_selection import train_test_split
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
from pyspark.ml import Pipeline
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenpinyu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load your collected CSV
data_path = r"/Users/chenpinyu/Desktop/spark/notebooks/Dirty_structured_output.csv"
df = pd.read_csv(data_path)

In [3]:
df

Unnamed: 0,aid,title,summary,main_category,categories,published
0,http://arxiv.org/abs/2504.02678v1,Valley and Spin Polarized States in Bernal Bil...,We present the results for the evolution of th...,cond-mat.str-el,"cond-mat.str-el,cond-mat.mes-hall",2025-04-03T15:15:42Z
1,http://arxiv.org/abs/2504.15872v1,Multiscale detection of practically significan...,In many change point problems it is reasonable...,stat.ME,"stat.ME,math.ST,stat.TH",2025-04-22T13:16:44Z
2,http://arxiv.org/abs/2504.17570v1,Atemporality from Conservation Laws of Physics...,Recent results have shown that singularities c...,gr-qc,"gr-qc,astro-ph.HE,hep-th",2025-04-24T13:59:42Z
3,http://arxiv.org/abs/2504.16897v1,Assessing SSL/TLS Certificate Centralization: ...,SSL/TLS is a fundamental technology in the net...,cs.NI,cs.NI,2025-04-23T17:26:18Z
4,http://arxiv.org/abs/2504.11344v1,Interpretable Hybrid-Rule Temporal Point Proce...,Temporal Point Processes (TPPs) are widely use...,cs.LG,"cs.LG,cs.AI,stat.ML",2025-04-15T16:15:16Z
...,...,...,...,...,...,...
7697,http://arxiv.org/abs/2504.17321v1,Dargana: fine-tuning EarthPT for dynamic tree ...,"We present Dargana, a fine-tuned variant of th...",physics.geo-ph,"physics.geo-ph,cs.LG",2025-04-24T07:23:27Z
7698,http://arxiv.org/abs/2504.01347v1,MEEK: Re-thinking Heterogeneous Parallel Error...,Heterogeneous parallel error detection is an a...,cs.AR,cs.AR,2025-04-02T04:32:49Z
7699,http://arxiv.org/abs/2504.02741v1,A Complete Classification of Fourier Summation...,We completely classify Fourier summation formu...,math.CA,"math.CA,math.MG,math.NT",2025-04-03T16:28:18Z
7700,http://arxiv.org/abs/2503.24223v1,Jordanian deformation of the non-compact and $...,"Using a Drinfeld twist of Jordanian type, we c...",hep-th,"hep-th,cond-mat.stat-mech,cond-mat.str-el,quan...",2025-03-31T15:38:04Z


# Data Cleaning

#### Dropping duplicate rows

In [4]:
## Drop duplicate rows from the dataset
df=df.drop_duplicates()
## Checking if some duplicates still remain based on "aid"(if the output is empty all duplicates have been removed)
value_counts_result = df["aid"].value_counts()
filtered_result = value_counts_result[value_counts_result > 1]
print(filtered_result)

Series([], Name: count, dtype: int64)


#### Dropping rows with empty values

In [5]:
df = df[df['title'].notnull() & df['summary'].notnull()]
df = df[df['categories'].notnull()]
df = df.reset_index(drop=True)

#### Removing subcategories after "."

In [6]:
df['main_category'] = df['main_category'].apply(lambda x: x.split('.')[0] if pd.notnull(x) else x)

#### Combining Title and Summary into one column

In [7]:
df['text'] = df['title'] + ' ' + df['summary']

#### Train test split

In [8]:
train, test = train_test_split(df[['text', 'main_category']], test_size=0.2, random_state=42)

#### Convert to spark Dataframe

In [9]:
spark = SparkSession.builder \
    .appName("CategoryPrediction") \
    .config("spark.executor.heartbeatInterval", "300s") \
    .config("spark.network.timeout", "600s") \
    .config("spark.sql.broadcastTimeout", "1200") \
    .config("spark.python.worker.timeout", "600") \
    .config("spark.driver.memory", "16g") \
    .getOrCreate() #attempt by chatgpt to fix error i get in second cell below this

train_df = spark.createDataFrame(train)
test_df = spark.createDataFrame(test)
train_df = train_df.repartition(4)

25/05/01 11:13:40 WARN Utils: Your hostname, Pins-MacBook-Pro.local resolves to a loopback address: 127.0.0.1; using 10.46.219.58 instead (on interface en0)
25/05/01 11:13:40 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/05/01 11:13:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [10]:
regex_tokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="\\W")
stopwords_remover = StopWordsRemover(inputCol="words", outputCol="filtered")
hashing_tf = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=1000)
idf = IDF(inputCol="rawFeatures", outputCol="features")

In [11]:
preprocessing_pipeline = Pipeline(stages=[regex_tokenizer, stopwords_remover, hashing_tf, idf])
preprocessor = preprocessing_pipeline.fit(train_df)

train_preprocessed = preprocessor.transform(train_df)
test_preprocessed = preprocessor.transform(test_df)

                                                                                

#### This was just to test whether tokenizer for example even works for a very small data example 

#### Model

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

# assume train_preprocessed & test_preprocessed are your Spark DataFrames
spark = SparkSession.builder.appName("CategoryPrediction").getOrCreate()
train_pd = train_preprocessed.select("text", "main_category").toPandas()
test_pd = test_preprocessed.select("text", "main_category").toPandas()

# encode labels
label_encoder = LabelEncoder()
train_pd['label'] = label_encoder.fit_transform(train_pd['main_category'])
test_pd['label'] = label_encoder.transform(test_pd['main_category'])

# Hugging Face Dataset transformation
train_dataset = Dataset.from_pandas(train_pd[['text', 'label']])
test_dataset = Dataset.from_pandas(test_pd[['text', 'label']])

# Tokenizer
tokenizer = AutoTokenizer.from_pretrained("facebook/bart-large")

def preprocess_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=256)

train_dataset = train_dataset.map(preprocess_function, batched=True)
test_dataset = test_dataset.map(preprocess_function, batched=True)

# Model definition
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained("facebook/bart-large", num_labels=num_labels)

# Trainer parameters
training_args = TrainingArguments(
    output_dir="./results",
    #evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_strategy="epoch",  
    save_total_limit=1,     # keep the model of the latest epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# model training
trainer.train()

# **Model saving和 Tokenizer**
output_model_path = "./saved_model"
output_tokenizer_path = "./saved_tokenizer"

model.save_pretrained(output_model_path)
tokenizer.save_pretrained(output_tokenizer_path)

print(f"Model Saved to: {output_model_path}")
print(f"Tokenizer Saved to: {output_tokenizer_path}")


import joblib
output_label_encoder_path = "./label_encoder.joblib"
joblib.dump(label_encoder, output_label_encoder_path)
print(f"LabelEncoder Saved to: {output_label_encoder_path}")



In [12]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import numpy as np
from sklearn.metrics import classification_report
import joblib
from datasets import Dataset
import pandas as pd


# Spark DataFrames -> Pandas DataFrames
train_pd = train_preprocessed.select("text", "main_category").toPandas()
test_pd = test_preprocessed.select("text", "main_category").toPandas()

model_path = "./saved_model"
tokenizer_path = "./saved_tokenizer"
label_encoder_path = "./label_encoder.joblib"

# load model and tokenizer
loaded_model = AutoModelForSequenceClassification.from_pretrained(model_path).to("mps")
loaded_tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

# load LabelEncoder
loaded_label_encoder = joblib.load(label_encoder_path)
label_names = loaded_label_encoder.classes_

# transform to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_pd[['text', 'main_category']])

# Tokenize test data
encoded_test = loaded_tokenizer(test_dataset['text'], padding=True, truncation=True, return_tensors="pt")

# transfer the model to GPU( if applicable)
device = torch.device("cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu")
encoded_test = encoded_test.to(device)

# predicting
loaded_model.eval()

# create DataLoader
from torch.utils.data import DataLoader, TensorDataset

#  encoded_test -> TensorDataset
input_ids = encoded_test['input_ids']
attention_mask = encoded_test['attention_mask']
test_data = TensorDataset(input_ids, attention_mask)
test_loader = DataLoader(test_data, batch_size=4)  # adjust batch_size

all_predictions = []
all_true_labels = []

for batch in test_loader:
    batch = tuple(t.to(device) for t in batch)
    b_input_ids, b_attention_mask = batch

    with torch.no_grad():
        outputs = loaded_model(input_ids=b_input_ids, attention_mask=b_attention_mask)
        logits = outputs.logits

    predictions = torch.argmax(logits, dim=-1).cpu().numpy()
    all_predictions.extend(predictions)

y_pred_numeric = np.array(all_predictions)
y_true_numeric = loaded_label_encoder.transform(test_pd['main_category'])

# index label -> String
y_pred_labels = loaded_label_encoder.inverse_transform(y_pred_numeric)
y_true_labels = loaded_label_encoder.inverse_transform(y_true_numeric)

print(classification_report(y_true_labels, y_pred_labels, target_names=label_names))

You passed along `num_labels=3` with an incompatible id to label map: {'0': 'LABEL_0', '1': 'LABEL_1', '2': 'LABEL_2', '3': 'LABEL_3', '4': 'LABEL_4', '5': 'LABEL_5', '6': 'LABEL_6', '7': 'LABEL_7', '8': 'LABEL_8', '9': 'LABEL_9', '10': 'LABEL_10', '11': 'LABEL_11', '12': 'LABEL_12', '13': 'LABEL_13', '14': 'LABEL_14', '15': 'LABEL_15', '16': 'LABEL_16', '17': 'LABEL_17', '18': 'LABEL_18', '19': 'LABEL_19'}. The number of labels will be overwritten to 20.


              precision    recall  f1-score   support

    astro-ph       0.89      0.88      0.88        83
    cond-mat       0.83      0.74      0.78       125
          cs       0.94      0.89      0.91       624
        econ       0.43      0.38      0.40         8
        eess       0.52      0.68      0.59        63
       gr-qc       0.48      0.68      0.57        19
      hep-ex       0.67      0.75      0.71         8
     hep-lat       0.00      0.00      0.00         8
      hep-ph       0.74      0.85      0.79        34
      hep-th       0.57      0.50      0.53        24
        math       0.83      0.87      0.85       261
     math-ph       0.00      0.00      0.00         7
        nlin       0.00      0.00      0.00         6
     nucl-ex       0.00      0.00      0.00         5
     nucl-th       0.60      0.86      0.71         7
     physics       0.67      0.67      0.67        90
       q-bio       0.50      0.70      0.58        10
       q-fin       0.50    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


#### Clean text (Ignore)

In [None]:
"""def clean_text(text):
    if isinstance(text, str):
        # Replace newlines with spaces
        text = text.replace('\n', ' ')
        # Remove extra whitespace
        text = ' '.join(text.split())
        return text.lower() #lowercase
    return text"""

In [None]:
"""df['text'] = df['text'].apply(clean_text)
df"""

In [None]:
"""from sklearn.preprocessing import LabelEncoder
# 5. Encode categories
le = LabelEncoder()
df['label'] = le.fit_transform(df['main_category'])

# 6. Save cleaned data
clean_df = df[['text', 'label', 'main_category']]
clean_df.to_csv("cleaned_arxiv_data.csv", index=False)

# Class distribution
print("\nClass distribution:")
print(clean_df['main_category'].value_counts())
# Label mapping
print("\nLabel mapping:")
for idx, cat in enumerate(le.classes_):
    print(f"{idx}: {cat}")"""

#### Ignore 

In [None]:
from transformers import pipeline

classifier = pipeline("zero-shot-classification",
                      model="facebook/bart-large-mnli")

In [None]:
import torch
print(torch.cuda.is_available())

In [None]:
print(torch.version.cuda)

In [None]:
print(torch.cuda.get_device_name(0))