In [1]:
import csv
import random

def generate_fictional_piece_names(num_pieces=100000):
    genres = ["Concerto", "Symphony", "Sonata", "Suite", "Overture", "Opera", "Quartet", "Trio", "Cantata", "Etude", "Prelude", "Rhapsody", "Serenade", "Nocturne"]
    instruments = ["Piano", "Violin", "Cello", "Flute", "Clarinet", "Harp", "Guitar", "Oboe", "Bassoon", "Horn", "Trumpet", "Trombone"]
    keys = ["C Major", "G Major", "D Major", "A Major", "E Major", "B Major", "F Major", "B-Flat Major", "E-Flat Major", "A-Flat Major", "D-Flat Major", "G-Flat Major", 
            "C Minor", "G Minor", "D Minor", "A Minor", "E Minor", "B Minor", "F Minor", "B-Flat Minor", "E-Flat Minor", "A-Flat Minor", "D-Flat Minor", "G-Flat Minor"]
    opus_numbers = [f"Op. {i}" for i in range(1, 201)]
    k_numbers = [f"K. {i}" for i in range(1, 501)]
    
    piece_names = []
    
    for _ in range(num_pieces):
        genre = random.choice(genres)
        instrument = random.choice(instruments)
        number = random.randint(1, 30)
        key = random.choice(keys)
        opus = random.choice(opus_numbers)
        k_number = random.choice(k_numbers)
        
        piece_name = f"{instrument} {genre} No. {number} in {key}, {opus}, {k_number}"
        piece_names.append(piece_name)
    
    return piece_names

def generate_variations(piece_name):
    parts = piece_name.split(", ")
    title = parts[0]
    details = parts[1:] if len(parts) > 1 else []
    
    variations = set()
    variations.add(piece_name)  # Original full name

    if len(details) == 2:
        # Drop either opus or K number, not both
        variations.add(f"{title}, {details[1]}")  # Drop opus number
        variations.add(f"{title}, {details[0]}")  # Drop K number
        variations.add(f"{title} ({details[0]})")  # Title with opus number in brackets
        variations.add(f"{details[0]}, {title}")  # Opus number first, then title
        variations.add(f"{details[1]}, {title}")  # K number first, then title
        variations.add(f"{title}")  # Only title
    elif len(details) == 1:
        variations.add(f"{details[0]}, {title}")  # Reorder
        variations.add(f"{title}")  # Only title
        variations.add(f"{details[0]}")  # Only details part

    return list(variations)

def generate_csv(piece_names, filename="classical_pieces.csv"):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Piece Name 1", "Piece Name 2"])
        count = 0
        while count < 200000:
            piece_name = random.choice(piece_names)
            variations = generate_variations(piece_name)
            if len(variations) > 1:
                piece_pair = random.sample(variations, 2)
                # Ensure neither of the names is just an opus or K number alone
                if not (piece_pair[0].startswith("Op.") or piece_pair[0].startswith("K.")) and not (piece_pair[1].startswith("Op.") or piece_pair[1].startswith("K.")):
                    writer.writerow(piece_pair)
                    count += 1

piece_names = generate_fictional_piece_names()
generate_csv(piece_names)


In [2]:
import csv
import random

def generate_fictional_piece_names(num_pieces=100000):
    genres = ["Concerto", "Symphony", "Sonata", "Suite", "Overture", "Opera", "Quartet", "Trio", "Cantata", "Etude", "Prelude", "Rhapsody", "Serenade", "Nocturne"]
    instruments = ["Piano", "Violin", "Cello", "Flute", "Clarinet", "Harp", "Guitar", "Oboe", "Bassoon", "Horn", "Trumpet", "Trombone"]
    keys = ["C Major", "G Major", "D Major", "A Major", "E Major", "B Major", "F Major", "B-Flat Major", "E-Flat Major", "A-Flat Major", "D-Flat Major", "G-Flat Major", 
            "C Minor", "G Minor", "D Minor", "A Minor", "E Minor", "B Minor", "F Minor", "B-Flat Minor", "E-Flat Minor", "A-Flat Minor", "D-Flat Minor", "G-Flat Minor"]
    opus_numbers = [f"Op. {i}" for i in range(1, 201)]
    k_numbers = [f"K. {i}" for i in range(1, 501)]
    
    piece_names = []
    
    for _ in range(num_pieces):
        genre = random.choice(genres)
        instrument = random.choice(instruments)
        number = random.randint(1, 30)
        key = random.choice(keys)
        opus = random.choice(opus_numbers)
        k_number = random.choice(k_numbers)
        
        piece_name = f"{instrument} {genre} No. {number} in {key}, {opus}, {k_number}"
        piece_names.append(piece_name)
    
    return piece_names

def generate_similar_but_different_pairs(piece_names, num_pairs=200000):
    keys = ["C Major", "G Major", "D Major", "A Major", "E Major", "B Major", "F Major", "B-Flat Major", "E-Flat Major", "A-Flat Major", "D-Flat Major", "G-Flat Major", 
            "C Minor", "G Minor", "D Minor", "A Minor", "E Minor", "B Minor", "F Minor", "B-Flat Minor", "E-Flat Minor", "A-Flat Minor", "D-Flat Minor", "G-Flat Minor"]
    opus_numbers = [f"Op. {i}" for i in range(1, 201)]
    k_numbers = [f"K. {i}" for i in range(1, 501)]
    
    pairs = []
    for _ in range(num_pairs):
        piece_name = random.choice(piece_names)
        
        # Split the piece name into components
        parts = piece_name.split(", ")
        title_parts = parts[0].split(" ")
        
        # Ensure changing the number after the genre
        number_index = title_parts.index("No.") + 1  # Index of the number after "No."
        new_number = random.randint(1, 30)
        while new_number == int(title_parts[number_index]):
            new_number = random.randint(1, 30)
        title_parts[number_index] = str(new_number)

        # Optionally, make additional changes to ensure diversity
        change_attribute = random.choice(["key", "opus", "k_number"])
        
        if change_attribute == "key":
            key_index = title_parts.index("in") + 1  # Index of the key after "in"
            new_key = random.choice(keys)
            while new_key == " ".join(title_parts[key_index:key_index + 2]):
                new_key = random.choice(keys)
            title_parts[key_index:key_index + 2] = new_key.split()
        
        elif change_attribute == "opus":
            new_opus = random.choice(opus_numbers)
            while new_opus == parts[1]:
                new_opus = random.choice(opus_numbers)
            parts[1] = new_opus
        
        elif change_attribute == "k_number":
            new_k_number = random.choice(k_numbers)
            while new_k_number == parts[2]:
                new_k_number = random.choice(k_numbers)
            parts[2] = new_k_number
        
        # Reconstruct the similar but different piece name
        similar_piece_name = " ".join(title_parts)
        similar_piece_full_name = f"{similar_piece_name}, {parts[1]}, {parts[2]}"
        
        pairs.append((piece_name, similar_piece_full_name))
    
    return pairs

def generate_csv(pairs, filename="different_classical_pieces.csv"):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Piece Name 1", "Piece Name 2"])
        writer.writerows(pairs)

# Generate the piece names
piece_names = generate_fictional_piece_names()

# Generate the pairs of similar but different pieces
pairs = generate_similar_but_different_pairs(piece_names)

# Save to CSV
generate_csv(pairs)


In [3]:
import pandas as pd

# Load the CSV files
csv1 = pd.read_csv('classical_pieces.csv')
csv2 = pd.read_csv('different_classical_pieces.csv')

# Append a new column with value 1 to the first CSV
csv1['new_column'] = 1

# Append a new column with value 0 to the second CSV
csv2['new_column'] = 0

# Concatenate the DataFrames
merged_csv = pd.concat([csv1, csv2], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_csv.to_csv('path_to_merged_csv.csv', index=False)

In [4]:
df = merged_csv

In [7]:
import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example
from sklearn.metrics import classification_report
from tqdm import tqdm
import random
import warnings

warnings.filterwarnings("ignore")

random.seed(42)

df['combined_text'] = df['Piece Name 1'] + " ||| " + df['Piece Name 2']

train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['new_column']
)

def create_training_data(dataframe):
    training_data = []
    for _, row in dataframe.iterrows():
        text = row['combined_text']
        label = 'SAME' if row['new_column'] == 1 else 'DIFFERENT'
        cats = {'SAME': label == 'SAME', 'DIFFERENT': label == 'DIFFERENT'}
        training_data.append((text, cats))
    return training_data

train_data = create_training_data(train_df)
valid_data = create_training_data(valid_df)

nlp = spacy.blank("en")

if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe("textcat", last=True)
else:
    textcat = nlp.get_pipe("textcat")

textcat.add_label("SAME")
textcat.add_label("DIFFERENT")

other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes):
    optimizer = nlp.begin_training()
    n_iter = 10

    print("Starting training...")
    for epoch in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        progress = tqdm(batches, total=max(1, len(train_data) // 4 + 1), desc=f"Epoch {epoch+1}/{n_iter}")
        for batch in progress:
            texts, annotations = zip(*batch)
            examples = []
            for text, cats in zip(texts, annotations):
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, {"cats": cats})
                examples.append(example)
            nlp.update(
                examples,
                sgd=optimizer,
                drop=0.2,
                losses=losses
            )
            progress.set_postfix(loss=losses.get('textcat', 0.0))
        print(f"Epoch {epoch +1} completed. Loss: {losses['textcat']:.4f}")

print("\nEvaluating the model on the validation set...")

valid_texts = [text for text, _ in valid_data]
true_labels = [1 if cats['SAME'] else 0 for _, cats in valid_data]

docs = list(nlp.pipe(valid_texts))
pred_labels = []
for doc in docs:
    if doc.cats['SAME'] > doc.cats['DIFFERENT']:
        pred_labels.append(1)
    else:
        pred_labels.append(0)

report = classification_report(true_labels, pred_labels, target_names=['Different', 'Same'])
print(report)

model_path = "classical_piece_classifier"

nlp.to_disk(model_path)
print(f"Model saved to {model_path}")

def predict_same_piece(piece1, piece2, nlp_model):
    combined_text = piece1 + " ||| " + piece2
    doc = nlp_model(combined_text)
    return 1 if doc.cats['SAME'] > doc.cats['DIFFERENT'] else 0


Starting training...


Epoch 1/10:  14%|█▉            | 11238/80001 [06:41<40:55, 28.01it/s, loss=44.4]


Epoch 1 completed. Loss: 44.4298


Epoch 2/10:  14%|█▌         | 11238/80001 [06:39<40:46, 28.11it/s, loss=4.27e-8]


Epoch 2 completed. Loss: 0.0000


Epoch 3/10:  14%|█▊           | 11238/80001 [07:01<42:57, 26.68it/s, loss=0.914]


Epoch 3 completed. Loss: 0.9142


Epoch 4/10:  14%|█▊           | 11238/80001 [06:40<40:51, 28.05it/s, loss=0.317]


Epoch 4 completed. Loss: 0.3175


Epoch 5/10:  14%|█▍        | 11238/80001 [07:05<43:21, 26.43it/s, loss=3.45e-12]


Epoch 5 completed. Loss: 0.0000


Epoch 6/10:  14%|█▍        | 11238/80001 [06:59<42:47, 26.78it/s, loss=1.24e-15]


Epoch 6 completed. Loss: 0.0000


Epoch 7/10:  14%|█▍        | 11238/80001 [07:07<43:36, 26.28it/s, loss=4.96e-18]


Epoch 7 completed. Loss: 0.0000


Epoch 8/10:  14%|█▍        | 11238/80001 [07:46<47:32, 24.11it/s, loss=2.81e-16]


Epoch 8 completed. Loss: 0.0000


Epoch 9/10:  14%|█▊           | 11238/80001 [08:25<51:30, 22.25it/s, loss=0.357]


Epoch 9 completed. Loss: 0.3570


Epoch 10/10:  14%|█▌         | 11238/80001 [08:05<49:30, 23.15it/s, loss=0.0013]


Epoch 10 completed. Loss: 0.0013

Evaluating the model on the validation set...
              precision    recall  f1-score   support

   Different       1.00      1.00      1.00     40000
        Same       1.00      1.00      1.00     40000

    accuracy                           1.00     80000
   macro avg       1.00      1.00      1.00     80000
weighted avg       1.00      1.00      1.00     80000

Model saved to classical_piece_classifier
