Multi-Label Classification with Longformer

In [4]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [5]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [6]:
%cd /content/drive/MyDrive/MasterCourse/Multilabel Effects

/content/drive/MyDrive/MasterCourse/Multilabel Effects


In [7]:
import pandas as pd
# Load data
df = pd.read_csv("cleaned_experience_reports.csv")
df.head()

Unnamed: 0,title,report_text,effects_terms,token_length
0,Experience:1050 Âµg 1cP-LSD - The matrix,After 1P-LSD and other LSD derivatives were pr...,"['brightness alteration', 'cognitive disconnec...",628
1,Experience:26mg - Stage 3 Trip,"Walking to a friends house, I popped a gel cap...","['anxiety', 'geometry', 'nausea', 'time distor...",461
2,Experience:26mg - I begged the shroom aliens t...,I took 26mg of 4-aco-dmt alone in my bedroom a...,"['anxiety', 'autonomous entities', 'internal h...",722
3,Experience:25mg (insufflated) - Simultaneously...,Sometimes people who should know better do stu...,"['3 dimensional textures', 'autonomous entitie...",1905
4,Experience:25mg - A labyrinth of organs and a ...,"Me and my best friend are 18, the antisocial n...","['and sociability enhancement', 'brightness al...",927


In [8]:
df.shape

(231, 4)

In [9]:
from collections import Counter

# count frequencies
all_effects = [effect for effects in df['effects_terms'] for effect in effects]
effect_counts = Counter(all_effects)

# Total unique effects before filtering
print(f"Total unique effect labels (before filtering): {len(effect_counts)}")

# count rare effects (appear in < 5 reports)
threshold = 10
rare_effects = [effect for effect, count in effect_counts.items() if count < threshold]

print(f"Rare effects (<{threshold} uses): {len(rare_effects)} found")

Total unique effect labels (before filtering): 43
Rare effects (<10 uses): 12 found


In [11]:
import ast

# If not already done — convert stringified lists to actual lists
df['effects_terms'] = df['effects_terms'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Step 1: Filter out rare effects
df['filtered_effects'] = df['effects_terms'].apply(
    lambda effects: [e for e in effects if e not in rare_effects]
)

# Step 2: Drop rows with no remaining effects
before_drop = len(df)
df = df[df['filtered_effects'].map(len) > 0].reset_index(drop=True)
after_drop = len(df)

print(f"Dropped {before_drop - after_drop} rows with no valid effects")
print(f"Final dataset size: {after_drop} rows")

Dropped 0 rows with no valid effects
Final dataset size: 231 rows


In [12]:
df.shape

(231, 5)

In [13]:
import re

def clean_label(label):
    label = label.strip().lower().replace('_', ' ').replace('-', ' ')
    label = re.sub(r'\s+', ' ', label)
    return label

# Ensure effects_terms are listified
import ast
df['effects_terms'] = df['effects_terms'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Apply cleaning
df['effects_terms'] = df['effects_terms'].apply(
    lambda labels: sorted(list(set(clean_label(l) for l in labels if l and len(l) < 50 and len(l.split()) < 8)))
)


In [14]:
# Recount frequencies after cleaning
from collections import Counter

all_effects = [effect for effects in df['effects_terms'] for effect in effects]
effect_counts = Counter(all_effects)

threshold = 30
rare_effects = [effect for effect, count in effect_counts.items() if count < threshold]
print(f"❌ Rare effects (<{threshold} uses): {len(rare_effects)} found")

# Filter + drop empty
df['filtered_effects'] = df['effects_terms'].apply(
    lambda effects: [e for e in effects if e not in rare_effects]
)
df = df[df['filtered_effects'].map(len) > 0].reset_index(drop=True)

# Count again
from itertools import chain
unique_filtered_effects = sorted(set(chain.from_iterable(df['filtered_effects'])))
print(f"🎯 Total unique filtered effect labels: {len(unique_filtered_effects)}")

❌ Rare effects (<30 uses): 264 found
🎯 Total unique filtered effect labels: 19


In [15]:
df.shape

(222, 5)

In [40]:
df.head()

Unnamed: 0,title,report_text,effects_terms,token_length,filtered_effects
0,Experience:1050 Âµg 1cP-LSD - The matrix,After 1P-LSD and other LSD derivatives were pr...,"[brightness alteration, cognitive disconnectio...",628,"[drifting, external hallucination, geometry, m..."
1,Experience:26mg - Stage 3 Trip,"Walking to a friends house, I popped a gel cap...","[anxiety, geometry, nausea, time distortion, v...",461,"[anxiety, geometry, nausea, time distortion]"
2,Experience:26mg - I begged the shroom aliens t...,I took 26mg of 4-aco-dmt alone in my bedroom a...,"[anxiety, autonomous entities, internal halluc...",722,"[anxiety, autonomous entities, internal halluc..."
3,Experience:25mg (insufflated) - Simultaneously...,Sometimes people who should know better do stu...,"[3 dimensional textures, autonomous entities, ...",1905,"[autonomous entities, drifting, internal hallu..."
4,Experience:25mg - A labyrinth of organs and a ...,"Me and my best friend are 18, the antisocial n...","[and sociability enhancement, brightness alter...",927,"[cognitive euphoria, nausea, physical euphoria..."


In [16]:
# Check missing values across all columns
print(df.isnull().sum())

# Specifically check in 'filtered_effects' and 'report_text'
print("\nMissing in 'filtered_effects':", df['filtered_effects'].isnull().sum())
print("Missing in 'report_text':", df['report_text'].isnull().sum())

title                0
report_text         12
effects_terms        0
token_length         0
filtered_effects     0
dtype: int64

Missing in 'filtered_effects': 0
Missing in 'report_text': 12


In [17]:
# Drop rows where filtered_effects or report_text is missing
df = df[df['filtered_effects'].notnull() & df['report_text'].notnull()].reset_index(drop=True)
print(f"✅ Final dataset shape after dropping missing: {df.shape}")

✅ Final dataset shape after dropping missing: (210, 5)


In [18]:
# Check missing values across all columns
print(df.isnull().sum())

title               0
report_text         0
effects_terms       0
token_length        0
filtered_effects    0
dtype: int64


In [19]:
# Step 1: Build label-to-index mapping
from collections import defaultdict

final_labels = sorted(set(effect for row in df['filtered_effects'] for effect in row))
effect2idx = {effect: idx for idx, effect in enumerate(final_labels)}
print(f"✅ Final effect2idx dictionary created with {len(effect2idx)} labels.")

# Step 2: Create multi-hot vectors
def make_multihot(effects):
    vector = [0] * len(effect2idx)
    for effect in effects:
        vector[effect2idx[effect]] = 1
    return vector

df['effect_cat_list'] = df['filtered_effects'].apply(make_multihot)

✅ Final effect2idx dictionary created with 19 labels.


In [20]:
df[['filtered_effects', 'effect_cat_list']].head()

Unnamed: 0,filtered_effects,effect_cat_list
0,"[drifting, external hallucination, geometry, m...","[0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, ..."
1,"[anxiety, geometry, nausea, time distortion]","[1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, ..."
2,"[anxiety, autonomous entities, internal halluc...","[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, ..."
3,"[autonomous entities, drifting, internal hallu...","[0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, ..."
4,"[cognitive euphoria, nausea, physical euphoria...","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, ..."


In [21]:
# Use only the retained labels in final effect counts
filtered_effect_counts = {label: count for label, count in effect_counts.items() if label in final_labels}

# Make label-to-index dict
encode_effect_types = { key: idx for idx, (key, value) in enumerate(filtered_effect_counts.items()) }

# Save to JSON
import json
with open("effect_types_encoded.json", "w") as fp:
    json.dump(encode_effect_types, fp)

print("📁 effect_types_encoded.json saved successfully.")


📁 effect_types_encoded.json saved successfully.


In [22]:
# Save to CSV (optional: add timestamp or threshold info)
df.to_csv("final_cleaned_experience_reports.csv", index=False)

print("📁 Dataset saved as final_cleaned_experience_reports.csv")

📁 Dataset saved as final_cleaned_experience_reports.csv
