In [25]:
!pip install -U "huggingface_hub[cli]"
!pip install transformers datasets scikit-learn pandas torch
!pip install simpletransformers




# **imports and initial steps**

In [26]:
import pandas as pd
import nltk
import os # for movies folders

try:
  nltk.data.find('tokenizers/punkt')
  nltk.data.find('tokenizers/punkt_tab')
except LookupError:
  nltk.download('punkt')
  nltk.download('punkt_tab')

def segment_into_sentences(text):
  if pd.isna(text) or text.strip() == "":
    return []
  return nltk.sent_tokenize(str(text))

all_sentences_and_labels = [] # store final dataframe

# **PROCESSING BOOKS**

In [27]:
books_csv_path = "/Users/augustincoman/University/Text Mining/final project/archive-3/book-reviews.csv"
book_text_column = 'ReviewContent'

try:
  df_books = pd.read_csv(books_csv_path, encoding='latin1')
  print(f"loaded book reviews from: {books_csv_path}")

  for index, row in df_books.iterrows():
    # print("entered print")
    review_text = row[book_text_column]
    # print("got review text")
    sentences = segment_into_sentences(review_text)
    # print("got sentences")
    for sentence in sentences:
      # print(" entered inner for loop")
      all_sentences_and_labels.append({'text': sentence, 'label': 'book'})
      # print("added text to df")
  print(f"processed {len(df_books)} book reviews.")

except FileNotFoundError:
  print(f"file not found: {books_csv_path}")
except KeyError:
  print(f"column not found: {book_text_column}")
except Exception as e:
  print(f"error: {e}")

loaded book reviews from: /Users/augustincoman/University/Text Mining/final project/archive-3/book-reviews.csv
processed 5000 book reviews.


# **PROCESSING MOVIES**

In [28]:
base_movie_review_folder_path = "/Users/augustincoman/University/Text Mining/final project/movie-training-set-polarity"
review_folders = ['pos', 'neg']

movie_reviews_processed_count = 0
for folder_name in review_folders:
  folder_path = os.path.join(base_movie_review_folder_path, folder_name)
  if not os.path.isdir(folder_path):
      print(f"Warning: Movie review folder not found: {folder_path}")
      continue

  print(f"Processing movie reviews from: {folder_path}")
  for filename in os.listdir(folder_path):
      if filename.endswith(".txt"):
          file_path = os.path.join(folder_path, filename)
          try:
              with open(file_path, 'r', encoding='utf-8') as f:
                  review_text = f.read()

              sentences = segment_into_sentences(review_text)
              for sentence in sentences:
                  all_sentences_and_labels.append({'text': sentence, 'label': 'movie'})
              movie_reviews_processed_count += 1
          except Exception as e:
              print(f"Error processing file {file_path}: {e}")
print(f"Processed {movie_reviews_processed_count} movie reviews.")

Processing movie reviews from: /Users/augustincoman/University/Text Mining/final project/movie-training-set-polarity/pos
Processing movie reviews from: /Users/augustincoman/University/Text Mining/final project/movie-training-set-polarity/neg
Processed 2000 movie reviews.


# **PROCESSING SPORTS ARTICLES**

In [29]:
sports_data_hf_path = "hf://datasets/datadreamer-dev/cnn_dailymail_sports/data/train-00000-of-00001.parquet"

try:
    print(f"Loading sports data from: {sports_data_hf_path}")
    df_sports = pd.read_parquet(sports_data_hf_path)
    print("Successfully loaded sports data.")

    print(f"Columns in sports dataset: {df_sports.columns.tolist()}")

    sports_text_column = 'article'

    if sports_text_column not in df_sports.columns:
        print(f"ERROR: Column '{sports_text_column}' not found in the sports DataFrame.")
        print(f"Please inspect the columns printed above and set 'sports_text_column' correctly.")
    else:
        processed_sports_entries = 0
        for index, row in df_sports.iterrows():
            article_text = row[sports_text_column]
            sentences = segment_into_sentences(article_text)
            for sentence in sentences:
                all_sentences_and_labels.append({'text': sentence, 'label': 'sports'})
            processed_sports_entries += 1
        print(f"Processed {processed_sports_entries} sports articles/entries.")

except ImportError:
    print("ERROR: 'pyarrow' or 'fastparquet' might be needed to read parquet files.")
    print("Please install it if you haven't: !pip install pyarrow fastparquet")
except Exception as e:
    print(f"An error occurred while loading or processing sports data: {e}")


Loading sports data from: hf://datasets/datadreamer-dev/cnn_dailymail_sports/data/train-00000-of-00001.parquet
Successfully loaded sports data.
Columns in sports dataset: ['article', 'highlights', 'id']
Processed 47 sports articles/entries.


# **CREATING PANDAS DATAFRAME**

In [30]:
if not all_sentences_and_labels:
    print("No data was processed. The list 'all_sentences_and_labels' is empty.")
    print("Please check the file paths and processing logic for your datasets.")
    final_df = pd.DataFrame(columns=['text', 'label']) # create empty DataFrame
else:
    final_df = pd.DataFrame(all_sentences_and_labels)

    # suffle dataframe
    final_df = final_df.sample(frac=1, random_state=42).reset_index(drop=True)

    print("\n--- Combined Dataset ---")
    print(f"Total sentences collected: {len(final_df)}")
    if not final_df.empty:
        print("Label distribution:")
        print(final_df['label'].value_counts())
        print("\nFirst 5 rows of the combined dataset:")
        print(final_df.head())
    else:
        print("The final DataFrame is empty.")


--- Combined Dataset ---
Total sentences collected: 99243
Label distribution:
label
movie     71532
book      26305
sports     1406
Name: count, dtype: int64

First 5 rows of the combined dataset:
                                                text  label
0  the most consistently effective gag revolves a...  movie
1            Glad they didn't sugar coat the ending.   book
2  in essence , the entire segment works like a t...  movie
3  it basically starts off with keanu uploading i...  movie
4  i was hoping for a little more out of jaw mohr...  movie


# **PREP DATAFRAME**

In [31]:
from sklearn.model_selection import train_test_split

# check for final_df
if 'final_df' not in globals() or final_df.empty:
    print("Error: 'final_df' is not defined or is empty. Please complete Step 2 first.")
else:
    print("Original final_df head:")
    print(final_df.head())
    print(f"\nOriginal label counts:\n{final_df['label'].value_counts()}")

    # map label names
    unique_labels_list = final_df['label'].unique().tolist()
    label2id = {label: i for i, label in enumerate(unique_labels_list)}
    id2label = {i: label for i, label in enumerate(unique_labels_list)}
    num_labels = len(unique_labels_list)

    print("\nLabel to ID mapping:", label2id)
    print("ID to Label mapping:", id2label)
    print(f"Number of unique labels: {num_labels}")

    final_df['labels'] = final_df['label'].map(label2id) # Creates the numerical 'labels' column

    df_for_bert = final_df[['text', 'labels']].copy()

    print("\nDataFrame prepared for simpletransformers (first 5 rows):")
    print(df_for_bert.head())

Original final_df head:
                                                text  label
0  the most consistently effective gag revolves a...  movie
1            Glad they didn't sugar coat the ending.   book
2  in essence , the entire segment works like a t...  movie
3  it basically starts off with keanu uploading i...  movie
4  i was hoping for a little more out of jaw mohr...  movie

Original label counts:
label
movie     71532
book      26305
sports     1406
Name: count, dtype: int64

Label to ID mapping: {'movie': 0, 'book': 1, 'sports': 2}
ID to Label mapping: {0: 'movie', 1: 'book', 2: 'sports'}
Number of unique labels: 3

DataFrame prepared for simpletransformers (first 5 rows):
                                                text  labels
0  the most consistently effective gag revolves a...       0
1            Glad they didn't sugar coat the ending.       1
2  in essence , the entire segment works like a t...       0
3  it basically starts off with keanu uploading i...       0
4  i

In [32]:
if 'df_for_bert' in globals() and not df_for_bert.empty:
    # split into training and eval
    train_df, eval_df = train_test_split(
        df_for_bert,
        test_size=0.15,
        random_state=42,
        stratify=df_for_bert['labels']
    )

    print(f"\nTraining DataFrame shape: {train_df.shape}")
    print(f"Evaluation DataFrame shape: {eval_df.shape}")

    print("\nTraining DataFrame label distribution:")
    print(train_df['labels'].value_counts())
    print("\nEvaluation DataFrame label distribution:")
    print(eval_df['labels'].value_counts())
else:
    print("\nSkipping train/eval split as df_for_bert is not ready.")


Training DataFrame shape: (84356, 2)
Evaluation DataFrame shape: (14887, 2)

Training DataFrame label distribution:
labels
0    60802
1    22359
2     1195
Name: count, dtype: int64

Evaluation DataFrame label distribution:
labels
0    10730
1     3946
2      211
Name: count, dtype: int64


# **STEP 4 MODEL CONFIGURATION**

In [33]:
from simpletransformers.classification import ClassificationModel, ClassificationArgs
import pandas as pd
import sklearn
import torch

import logging
logging.basicConfig(level=logging.INFO)
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.WARNING)

In [34]:
model_args = ClassificationArgs()

# main args
model_args.num_train_epochs = 3  # train epochs
model_args.train_batch_size = 16 # batch size
model_args.learning_rate = 4e-5  # learn rate
model_args.max_seq_length = 256  # max seq len

# eval and saving
model_args.evaluate_during_training = True # eval while training
model_args.evaluate_during_training_steps = 100 # eval every 100 steps
model_args.overwrite_output_dir = True    # overwrite in output dir
model_args.output_dir = "outputs/"        # dir to save
model_args.best_model_dir = "outputs/best_model/" # special dir for best model

# early stopping
model_args.use_early_stopping = True
model_args.early_stopping_delta = 0.01
model_args.early_stopping_metric = 'eval_loss'
model_args.early_stopping_metric_minimize = True
model_args.early_stopping_patience = 3 # nr of evals for improving

print("Model Arguments:")
print(model_args)

Model Arguments:
ClassificationArgs(adafactor_beta1=None, adafactor_clip_threshold=1.0, adafactor_decay_rate=-0.8, adafactor_eps=(1e-30, 0.001), adafactor_relative_step=True, adafactor_scale_parameter=True, adafactor_warmup_init=True, adam_betas=(0.9, 0.999), adam_epsilon=1e-08, best_model_dir='outputs/best_model/', cache_dir='cache_dir/', config={}, cosine_schedule_num_cycles=0.5, custom_layer_parameters=[], custom_parameter_groups=[], dataloader_num_workers=0, do_lower_case=False, dynamic_quantize=False, early_stopping_consider_epochs=False, early_stopping_delta=0.01, early_stopping_metric='eval_loss', early_stopping_metric_minimize=True, early_stopping_patience=3, encoding=None, eval_batch_size=100, evaluate_during_training=True, evaluate_during_training_silent=True, evaluate_during_training_steps=100, evaluate_during_training_verbose=False, evaluate_each_epoch=True, fp16=True, gradient_accumulation_steps=1, learning_rate=4e-05, local_rank=-1, logging_steps=50, loss_type=None, loss_

In [35]:
# check for GPU
use_cuda = torch.cuda.is_available()
if use_cuda:
    print("GPU is available. Setting use_cuda=True.")
else:
    print("GPU not available. Setting use_cuda=False.")

if 'num_labels' not in globals():
    print("Error: 'num_labels' is not defined. Please ensure Step 3 (Data Preparation) was completed successfully.")
else:
    print(f"Number of labels for the model: {num_labels}")
    try:
        model = ClassificationModel(
            model_type='bert',
            model_name='bert-base-uncased', # model usde
            num_labels=num_labels,
            args=model_args,
            use_cuda=use_cuda # using GPU
        )
        print("\nClassificationModel initialized successfully.")
    except Exception as e:
        print(f"Error initializing ClassificationModel: {e}")

GPU not available. Setting use_cuda=False.
Number of labels for the model: 3


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



ClassificationModel initialized successfully.


# **TRAINING**

In [36]:
if 'model' in globals() and 'train_df' in globals() and 'eval_df' in globals():
    print("Starting model training...")

    _, training_history = model.train_model(
        train_df,
        eval_df=eval_df
    )

    print("Model training complete.")
    print("Training history (includes training and evaluation losses per logging step):")

    if hasattr(model, 'results'):
        print("Evaluation results during training:")
        print(model.results)

else:
    print("Error: 'model', 'train_df', or 'eval_df' is not defined. Please complete previous steps.")

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.


Starting model training...


  0%|          | 0/168 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling 

Model training complete.
Training history (includes training and evaluation losses per logging step):
Evaluation results during training:
{'mcc': 0.8801801371878566, 'eval_loss': 0.16240851521091973}


In [37]:
import pandas as pd
from sklearn.metrics import classification_report

if 'best_model' not in globals():
    print("Warning: 'best_model' not loaded. Attempting to use the 'model' from the training step.")
    if 'model' in globals():
        eval_model_for_test = model
    else:
        print("Error: No trained model available for test set evaluation.")
        eval_model_for_test = None 
else:
    eval_model_for_test = best_model

# test set
test_set_path = 'sentiment-topic-test.tsv' 

if eval_model_for_test and 'label2id' in globals() and 'id2label' in globals() and 'num_labels' in globals():
    try:
        df_official_test = pd.read_csv(test_set_path, sep='\t')
        print(f"\nSuccessfully loaded official test set from: {test_set_path}")
        print(f"Official test set shape: {df_official_test.shape}")
        print(df_official_test.head()) 

        test_sentences = df_official_test['sentence'].tolist()
        true_topic_strings = df_official_test['topic'].tolist()

        # convert true topic strings to numerical IDs using the SAME label2id from training
        true_numerical_labels = [label2id[label] for label in true_topic_strings]

        print("\nPredicting on official test set...")
        predicted_numerical_labels, raw_outputs = eval_model_for_test.predict(test_sentences)
        
        print("\nClassification Report on Official Test Set:")
        # making sure target names are in correct order for nr lables
        target_names_ordered = [id2label[i] for i in range(num_labels)]
        print(classification_report(true_numerical_labels, predicted_numerical_labels, target_names=target_names_ordered))

        # add the predictions back to the DataFrame for analysis
        df_official_test['predicted_numerical_label'] = predicted_numerical_labels
        df_official_test['predicted_topic_string'] = [id2label[pred] for pred in predicted_numerical_labels]
        print("\nOfficial test set with predictions (first 5 rows):")
        print(df_official_test.head())

    except FileNotFoundError:
        print(f"ERROR: Official test file not found at {test_set_path}")
    except KeyError as e:
        print(f"ERROR: A required column is missing or a label in the test set was not in the training data's label2id map: {e}")
    except Exception as e:
        print(f"An error occurred during official test set evaluation: {e}")
else:
    print("\nError: Model or label mappings not ready for official test set evaluation.")

INFO:simpletransformers.classification.classification_utils: Converting to features started. Cache is not used.



Successfully loaded official test set from: sentiment-topic-test.tsv
Official test set shape: (18, 4)
   sentence_id                                           sentence sentiment  \
0            0  The stadium was alive with the roar of the cro...  positive   
1            1  That last-minute goal had me jumping out of my...  positive   
2            2  I couldn’t put the book down; it swept me into...  positive   
3            3  The story had its moments, though some parts f...   neutral   
4            4  I enjoyed the way the timelines shifted, even ...   neutral   

    topic  
0  sports  
1  sports  
2    book  
3    book  
4    book  

Predicting on official test set...


0it [00:00, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
1it [00:05,  5.10s/it]
100%|██████████| 1/1 [00:00<00:00,  1.12it/s]


Classification Report on Official Test Set:
              precision    recall  f1-score   support

       movie       0.75      1.00      0.86         6
        book       1.00      1.00      1.00         6
      sports       1.00      0.67      0.80         6

    accuracy                           0.89        18
   macro avg       0.92      0.89      0.89        18
weighted avg       0.92      0.89      0.89        18


Official test set with predictions (first 5 rows):
   sentence_id                                           sentence sentiment  \
0            0  The stadium was alive with the roar of the cro...  positive   
1            1  That last-minute goal had me jumping out of my...  positive   
2            2  I couldn’t put the book down; it swept me into...  positive   
3            3  The story had its moments, though some parts f...   neutral   
4            4  I enjoyed the way the timelines shifted, even ...   neutral   

    topic  predicted_numerical_label predicted_


