In [6]:
!pip install scikit-multilearn



In [17]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.adapt import MLkNN
from sklearn.metrics import hamming_loss, accuracy_score
from sklearn.neighbors import NearestNeighbors
from sklearn.neighbors import KNeighborsClassifier

In [8]:
# Utility: parse the tag id string from works CSV into a list of tag id strings
import pandas as _pd

def parse_tag_string(s):
    """Split a '+'-delimited tag id string into a list of tag id strings.

    Parameters
    ----------
    s : object
        Raw cell value. It is converted with ``str`` before splitting, so
        non-string scalars are accepted. A missing value (as detected by
        ``pandas.isna``) yields an empty list.

    Returns
    -------
    list of str
        The non-empty pieces found between '+' separators, in their
        original order.
    """
    if _pd.isna(s):
        return []
    # filter(None, ...) discards the empty strings produced by leading,
    # trailing, or doubled '+' separators.
    return list(filter(None, str(s).split('+')))

# Optional helper to inspect NearestNeighbors (debugging shadowing issues)
def debug_nearestneighbors():
    """Print diagnostics for NearestNeighbors shadowing or version problems.

    Prints the installed scikit-learn and scikit-multilearn versions, the
    object currently bound to the notebook-level name ``NearestNeighbors``,
    and the signature of its ``__init__`` (or the reason it could not be
    obtained).

    Returns
    -------
    None
    """
    # Imported locally: the visible notebook cells only use
    # ``from sklearn... import`` / ``from skmultilearn... import`` statements,
    # which do not bind the top-level package names, and ``inspect`` is not
    # imported anywhere. Without these imports the function raises NameError.
    import inspect

    import sklearn
    import skmultilearn

    print('scikit-learn version:', sklearn.__version__)
    # Fall back to a placeholder in case the installed scikit-multilearn
    # release does not expose a __version__ attribute.
    print('scikit-multilearn version:', getattr(skmultilearn, '__version__', 'unknown'))
    # Deliberately reads the notebook-level name so that any shadowing of
    # NearestNeighbors is visible in the printed object.
    print('NearestNeighbors object:', NearestNeighbors)
    try:
        print('NearestNeighbors.__init__ signature:', inspect.signature(NearestNeighbors.__init__))
    except Exception as e:
        # Best-effort diagnostic: report the failure instead of aborting.
        print('Could not get signature:', e)

In [9]:
def train_mknn(X, Y, k=5):
    """
    Fit a cosine-metric nearest-neighbour index on X.

    X: np.ndarray (num_works, num_features)
    Y: np.ndarray (num_works, num_tags) - each row is 0/1 per tag
    k: value passed as n_neighbors to NearestNeighbors

    Returns a tuple (fitted NearestNeighbors model, Y). Y is handed back
    unchanged so the caller can keep it next to the fitted index.
    """
    # fit() returns the estimator itself, so construction and fitting chain.
    neighbor_index = NearestNeighbors(n_neighbors=k, metric="cosine").fit(X)
    return neighbor_index, Y


def predict_tags_mknn(knn, Y, word_count_vector, tags_work, threshold=0.2):
    """
    Suggest tags for one work from the tags of its nearest neighbours.

    knn: fitted NearestNeighbors model
    Y: label matrix from training (num_works, num_tags)
    word_count_vector: np.ndarray (num_features,)
    tags_work: list of strings, tag names
    threshold: minimum score for a tag to be predicted

    Returns a tuple (predicted, tag_scores): the entries of tags_work whose
    score is >= threshold, and the per-tag mean over the neighbours' rows
    of Y.
    """
    # Query with a one-row batch; the returned distances are not used.
    _, neighbor_idx = knn.kneighbors([word_count_vector])

    # Mean over the neighbours' label rows gives, per tag, the fraction of
    # neighbours that carry it.
    tag_scores = Y[neighbor_idx[0]].mean(axis=0)

    # Keep the name of every tag whose score reaches the threshold.
    predicted = []
    for position in range(len(tag_scores)):
        if tag_scores[position] >= threshold:
            predicted.append(tags_work[position])

    return predicted, tag_scores

In [10]:
# Description: an AI project focused on generating potential tags for AO3 fanfiction.

# Tag table: every name is coerced to str.
aspects_df = pd.read_csv('tags-20210226.csv')
aspects_df['name'] = aspects_df['name'].astype(str)
tags_name = aspects_df["name"]

# Works table: split each '+'-delimited tag id string into a list of id strings.
aspects_df2 = pd.read_csv('works-20210226.csv')
aspects_df2["tag_id_list"] = aspects_df2["tags"].apply(parse_tag_string)

# Lookup from stringified tag id to tag name. The id column is used directly
# rather than stored in a global called `id`, which would shadow the builtin
# id() for the rest of the notebook session.
tag_id_to_name = dict(zip(aspects_df["id"].astype(str), tags_name))
def ids_to_names(id_list):
    """Translate tag ids to tag names using the tag_id_to_name lookup.

    Ids that are not keys of tag_id_to_name are skipped; the order of the
    remaining names follows id_list.
    """
    names = []
    for tag_id in id_list:
        if tag_id in tag_id_to_name:
            names.append(tag_id_to_name[tag_id])
    return names

# Resolve each work's tag ids to names and store them as a single
# comma-separated string per work (despite the column name, the stored
# value is a string, not a list).
aspects_df2["tag_name_list"] = aspects_df2["tag_id_list"].apply(
    lambda tag_ids: ",".join(ids_to_names(tag_ids))
)

# Series used by the modelling cells.
tags_work = aspects_df2["tag_name_list"]
word_count = aspects_df2["word_count"]
print(tags_work)

1      Explicit,Dubious Consent,Rimming,Dealfic,M/M,N...
2      Explicit,Star Trek,Star Trek: The Original Ser...
3      Avatar: The Last Airbender,Alternate Universe,...
4      Teen And Up Audiences,F/M,Gen,Graphic Depictio...
                             ...                        
994    Teen And Up Audiences,Hurt/Comfort,M/M,Choose ...
995    Dream,Gen,Graphic Depictions Of Violence,Teen ...
996    Teen And Up Audiences,Dream,M/M,Choose Not To ...
997    Teen And Up Audiences,Dream,Gen,Graphic Depict...
998    Mature,Hurt/Comfort,Romance,M/M,No Archive War...
Name: tag_name_list, Length: 999, dtype: object


In [11]:
from sklearn.preprocessing import MultiLabelBinarizer

# Create a multi-label binarizer for tags.
mlb = MultiLabelBinarizer()

# tags_work holds one comma-separated string per work. "".split(",") returns
# [''], so a work with no resolved tag names would otherwise contribute a
# spurious empty-string class; empty pieces are dropped so such a work gets
# an all-zero label row instead.
y_tags = mlb.fit_transform(
    [[tag for tag in tags.split(",") if tag] for tags in tags_work]
)

# Use word_count as the only feature for now; column_stack turns it into a
# 2-D array with one column, and more feature columns can be added to the list.
X_features = np.column_stack([word_count.values])

# Hold out 30% of the works for evaluation; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_features, y_tags, test_size=0.30, random_state=42
)


In [19]:
# Uncomment to check for shadowing or incompatible library versions.
#debug_nearestneighbors()

# Fit a 5-nearest-neighbour classifier on the training split. Note that this
# is scikit-learn's KNeighborsClassifier, not skmultilearn's MLkNN, even
# though the variable keeps the `mlknn_` prefix.
mlknn_classifier = KNeighborsClassifier(n_neighbors=5).fit(x_train, y_train)

# Predict the held-out split and report the Hamming loss.
y_pred = mlknn_classifier.predict(x_test)
print('Hamming loss:', hamming_loss(y_test, y_pred))

Hamming loss: 0.016994871794871795
