In [1]:
import csv
import random

def generate_fictional_piece_names(num_pieces=100000):
    genres = ["Concerto", "Symphony", "Sonata", "Suite", "Overture", "Opera", "Quartet", "Trio", "Cantata", "Etude", "Prelude", "Rhapsody", "Serenade", "Nocturne"]
    instruments = ["Piano", "Violin", "Cello", "Flute", "Clarinet", "Harp", "Guitar", "Oboe", "Bassoon", "Horn", "Trumpet", "Trombone"]
    keys = ["C Major", "G Major", "D Major", "A Major", "E Major", "B Major", "F Major", "B-Flat Major", "E-Flat Major", "A-Flat Major", "D-Flat Major", "G-Flat Major", 
            "C Minor", "G Minor", "D Minor", "A Minor", "E Minor", "B Minor", "F Minor", "B-Flat Minor", "E-Flat Minor", "A-Flat Minor", "D-Flat Minor", "G-Flat Minor"]
    opus_numbers = [f"Op. {i}" for i in range(1, 201)]
    k_numbers = [f"K. {i}" for i in range(1, 501)]
    
    piece_names = []
    
    for _ in range(num_pieces):
        genre = random.choice(genres)
        instrument = random.choice(instruments)
        number = random.randint(1, 30)
        key = random.choice(keys)
        opus = random.choice(opus_numbers)
        k_number = random.choice(k_numbers)
        
        piece_name = f"{instrument} {genre} No. {number} in {key}, {opus}, {k_number}"
        piece_names.append(piece_name)
    
    return piece_names

def generate_variations(piece_name):
    parts = piece_name.split(", ")
    title = parts[0]
    details = parts[1:] if len(parts) > 1 else []
    
    variations = set()
    variations.add(piece_name)  # Original full name

    if len(details) == 2:
        # Drop either opus or K number, not both
        variations.add(f"{title}, {details[1]}")  # Drop opus number
        variations.add(f"{title}, {details[0]}")  # Drop K number
        variations.add(f"{title} ({details[0]})")  # Title with opus number in brackets
        variations.add(f"{details[0]}, {title}")  # Opus number first, then title
        variations.add(f"{details[1]}, {title}")  # K number first, then title
        variations.add(f"{title}")  # Only title
    elif len(details) == 1:
        variations.add(f"{details[0]}, {title}")  # Reorder
        variations.add(f"{title}")  # Only title
        variations.add(f"{details[0]}")  # Only details part

    return list(variations)

def generate_csv(piece_names, filename="classical_pieces.csv"):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Piece Name 1", "Piece Name 2"])
        count = 0
        while count < 200000:
            piece_name = random.choice(piece_names)
            variations = generate_variations(piece_name)
            if len(variations) > 1:
                piece_pair = random.sample(variations, 2)
                # Ensure neither of the names is just an opus or K number alone
                if not (piece_pair[0].startswith("Op.") or piece_pair[0].startswith("K.")) and not (piece_pair[1].startswith("Op.") or piece_pair[1].startswith("K.")):
                    writer.writerow(piece_pair)
                    count += 1

piece_names = generate_fictional_piece_names()
generate_csv(piece_names)


In [2]:
import csv
import random

def generate_fictional_piece_names(num_pieces=100000):
    genres = ["Concerto", "Symphony", "Sonata", "Suite", "Overture", "Opera", "Quartet", "Trio", "Cantata", "Etude", "Prelude", "Rhapsody", "Serenade", "Nocturne"]
    instruments = ["Piano", "Violin", "Cello", "Flute", "Clarinet", "Harp", "Guitar", "Oboe", "Bassoon", "Horn", "Trumpet", "Trombone"]
    keys = ["C Major", "G Major", "D Major", "A Major", "E Major", "B Major", "F Major", "B-Flat Major", "E-Flat Major", "A-Flat Major", "D-Flat Major", "G-Flat Major", 
            "C Minor", "G Minor", "D Minor", "A Minor", "E Minor", "B Minor", "F Minor", "B-Flat Minor", "E-Flat Minor", "A-Flat Minor", "D-Flat Minor", "G-Flat Minor"]
    opus_numbers = [f"Op. {i}" for i in range(1, 201)]
    k_numbers = [f"K. {i}" for i in range(1, 501)]
    
    piece_names = []
    
    for _ in range(num_pieces):
        genre = random.choice(genres)
        instrument = random.choice(instruments)
        number = random.randint(1, 30)
        key = random.choice(keys)
        opus = random.choice(opus_numbers)
        k_number = random.choice(k_numbers)
        
        piece_name = f"{instrument} {genre} No. {number} in {key}, {opus}, {k_number}"
        piece_names.append(piece_name)
    
    return piece_names

def generate_similar_but_different_pairs(piece_names, num_pairs=200000):
    keys = ["C Major", "G Major", "D Major", "A Major", "E Major", "B Major", "F Major", "B-Flat Major", "E-Flat Major", "A-Flat Major", "D-Flat Major", "G-Flat Major", 
            "C Minor", "G Minor", "D Minor", "A Minor", "E Minor", "B Minor", "F Minor", "B-Flat Minor", "E-Flat Minor", "A-Flat Minor", "D-Flat Minor", "G-Flat Minor"]
    opus_numbers = [f"Op. {i}" for i in range(1, 201)]
    k_numbers = [f"K. {i}" for i in range(1, 501)]
    
    pairs = []
    for _ in range(num_pairs):
        piece_name = random.choice(piece_names)
        
        # Split the piece name into components
        parts = piece_name.split(", ")
        title_parts = parts[0].split(" ")
        
        # Ensure changing the number after the genre
        number_index = title_parts.index("No.") + 1  # Index of the number after "No."
        new_number = random.randint(1, 30)
        while new_number == int(title_parts[number_index]):
            new_number = random.randint(1, 30)
        title_parts[number_index] = str(new_number)

        # Optionally, make additional changes to ensure diversity
        change_attribute = random.choice(["key", "opus", "k_number"])
        
        if change_attribute == "key":
            key_index = title_parts.index("in") + 1  # Index of the key after "in"
            new_key = random.choice(keys)
            while new_key == " ".join(title_parts[key_index:key_index + 2]):
                new_key = random.choice(keys)
            title_parts[key_index:key_index + 2] = new_key.split()
        
        elif change_attribute == "opus":
            new_opus = random.choice(opus_numbers)
            while new_opus == parts[1]:
                new_opus = random.choice(opus_numbers)
            parts[1] = new_opus
        
        elif change_attribute == "k_number":
            new_k_number = random.choice(k_numbers)
            while new_k_number == parts[2]:
                new_k_number = random.choice(k_numbers)
            parts[2] = new_k_number
        
        # Reconstruct the similar but different piece name
        similar_piece_name = " ".join(title_parts)
        similar_piece_full_name = f"{similar_piece_name}, {parts[1]}, {parts[2]}"
        
        pairs.append((piece_name, similar_piece_full_name))
    
    return pairs

def generate_csv(pairs, filename="different_classical_pieces.csv"):
    with open(filename, mode='w', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(["Piece Name 1", "Piece Name 2"])
        writer.writerows(pairs)

# Generate the piece names
piece_names = generate_fictional_piece_names()

# Generate the pairs of similar but different pieces
pairs = generate_similar_but_different_pairs(piece_names)

# Save to CSV
generate_csv(pairs)


In [3]:
import pandas as pd

# Load the CSV files
csv1 = pd.read_csv('classical_pieces.csv')
csv2 = pd.read_csv('different_classical_pieces.csv')

# Append a new column with value 1 to the first CSV
csv1['new_column'] = 1

# Append a new column with value 0 to the second CSV
csv2['new_column'] = 0

# Concatenate the DataFrames
merged_csv = pd.concat([csv1, csv2], ignore_index=True)

# Save the merged DataFrame to a new CSV file
merged_csv.to_csv('path_to_merged_csv.csv', index=False)

In [17]:
df = merged_csv

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import pandas as pd

# Combine both columns of strings into a single dataframe
X = df[['Piece Name 1', 'Piece Name 2']]
y = df['new_column']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the training data
X_train_combined = X_train['Piece Name 1'] + " " + X_train['Piece Name 2']
X_train_tfidf = vectorizer.fit_transform(X_train_combined)

# Transform the test data
X_test_combined = X_test['Piece Name 1'] + " " + X_test['Piece Name 2']
X_test_tfidf = vectorizer.transform(X_test_combined)


In [7]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
classifier.fit(X_train_tfidf, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_tfidf)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


Accuracy: 0.9931
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.99      0.99     39949
           1       0.99      1.00      0.99     40051

    accuracy                           0.99     80000
   macro avg       0.99      0.99      0.99     80000
weighted avg       0.99      0.99      0.99     80000



In [9]:
from gensim.models import Word2Vec
import nltk
from nltk.tokenize import word_tokenize

# Combine all text data for training Word2Vec
all_text = pd.concat([df['Piece Name 1'], df['Piece Name 2']])

# Tokenize the text
nltk.download('punkt')
tokenized_text = [word_tokenize(text.lower()) for text in all_text]

# Train the Word2Vec model
word2vec_model = Word2Vec(sentences=tokenized_text, vector_size=100, window=5, min_count=1, workers=4)

# Save the model for later use
word2vec_model.save("word2vec_model.model")


[nltk_data] Downloading package punkt to /Users/seangong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [10]:
import numpy as np

def get_sentence_vector(sentence, model):
    words = word_tokenize(sentence.lower())
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Apply the function to both columns
X1 = np.array([get_sentence_vector(text, word2vec_model) for text in df['Piece Name 1']])
X2 = np.array([get_sentence_vector(text, word2vec_model) for text in df['Piece Name 2']])

# Combine the vectors (e.g., concatenation, difference, or other methods)
X = np.concatenate([X1, X2], axis=1)

y = df['new_column']


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the base classifier
base_classifier = DecisionTreeClassifier(max_depth=1)

# Initialize the AdaBoost classifier
adaboost_classifier = AdaBoostClassifier(base_estimator=base_classifier, n_estimators=100, random_state=42)

# Train the classifier
adaboost_classifier.fit(X_train, y_train)

# Predict on the test set
y_pred = adaboost_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")
print(f"Confusion Matrix:\n{conf_matrix}")




Accuracy: 1.0
Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     39949
           1       1.00      1.00      1.00     40051

    accuracy                           1.00     80000
   macro avg       1.00      1.00      1.00     80000
weighted avg       1.00      1.00      1.00     80000

Confusion Matrix:
[[39949     0]
 [    0 40051]]


In [24]:
def get_sentence_vector(sentence, model):
    words = word_tokenize(sentence.lower())
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if word_vectors:
        return np.mean(word_vectors, axis=0)
    else:
        return np.zeros(model.vector_size)

# Example strings to predict
string1 = "Piano Concerto 1"
string2 = ""

# Convert the strings into vectors
vector1 = get_sentence_vector(string1, word2vec_model)
vector2 = get_sentence_vector(string2, word2vec_model)

# Combine the vectors (same method as used during training)
combined_vector = np.concatenate([vector1, vector2])

# Predict the label
prediction = adaboost_classifier.predict([combined_vector])

print(f"Prediction: {prediction[0]}")

Prediction: 1


In [9]:
df

Unnamed: 0,Piece Name 1,Piece Name 2,new_column
0,"Horn Overture No. 28 in A-Flat Major, Op. 146,...","Horn Overture No. 28 in A-Flat Major, Op. 146",1
1,Clarinet Quartet No. 15 in G-Flat Major,"Clarinet Quartet No. 15 in G-Flat Major, K. 179",1
2,"Flute Sonata No. 18 in D-Flat Minor, Op. 103","Flute Sonata No. 18 in D-Flat Minor, Op. 103, ...",1
3,"Horn Rhapsody No. 4 in G Major, Op. 105","Horn Rhapsody No. 4 in G Major, K. 117",1
4,Horn Quartet No. 20 in D-Flat Major (Op. 56),Horn Quartet No. 20 in D-Flat Major,1
...,...,...,...
399995,"Horn Serenade No. 13 in A Minor, Op. 57, K. 388","Horn Serenade No. 15 in A Minor, Op. 177, K. 388",0
399996,"Trombone Nocturne No. 26 in E Minor, Op. 69, K...","Trombone Nocturne No. 5 in B Major, Op. 69, K....",0
399997,"Cello Serenade No. 19 in E Major, Op. 192, K. 364","Cello Serenade No. 23 in E Major, Op. 52, K. 364",0
399998,"Flute Prelude No. 12 in B-Flat Minor, Op. 36, ...","Flute Prelude No. 8 in B-Flat Minor, Op. 175, ...",0


In [8]:
import spacy
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pandas as pd
import numpy as np

# Load spaCy's language model
nlp = spacy.load("en_core_web_md")  # Use a medium or large model for better embeddings

# Combine both columns of strings into a single dataframe
X = df[['Piece Name 1', 'Piece Name 2']]
y = df['new_column']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a function to generate spaCy embeddings
def get_embedding(text):
    doc = nlp(text)
    # Get the mean of the token vectors in the document
    return doc.vector

# Combine and vectorize the training data
X_train_combined = (X_train['Piece Name 1'] + " " + X_train['Piece Name 2']).apply(get_embedding)
X_test_combined = (X_test['Piece Name 1'] + " " + X_test['Piece Name 2']).apply(get_embedding)

# Convert the series of embeddings to a numpy array
X_train_vectors = np.array(X_train_combined.tolist())
X_test_vectors = np.array(X_test_combined.tolist())

# Initialize the classifier
classifier = RandomForestClassifier(n_estimators=100, random_state=42)

# Train the classifier
classifier.fit(X_train_vectors, y_train)

# Predict on the test set
y_pred = classifier.predict(X_test_vectors)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Classification Report:\n{report}")


KeyboardInterrupt: 

In [22]:
# Uncomment the following lines to install necessary packages if not already installed
# !pip install spacy==3.5.0
# !pip install pandas
# !pip install scikit-learn
# !pip install tqdm

import pandas as pd
from sklearn.model_selection import train_test_split
import spacy
from spacy.util import minibatch, compounding
from spacy.training import Example
from sklearn.metrics import classification_report
from tqdm import tqdm
import random
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Initialize random seed for reproducibility
random.seed(42)

# --------------------- 1. Data Preparation ---------------------


# Concatenate 'Piece Name 1' and 'Piece Name 2' into a single text input
df['combined_text'] = df['Piece Name 1'] + " ||| " + df['Piece Name 2']

# Split the data into training and validation sets (80% train, 20% validation)
train_df, valid_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df['new_column']
)

# --------------------- 2. Creating Training Examples ---------------------

# Function to convert DataFrame rows to spaCy training format
def create_training_data(dataframe):
    training_data = []
    for _, row in dataframe.iterrows():
        text = row['combined_text']
        label = 'SAME' if row['new_column'] == 1 else 'DIFFERENT'
        cats = {'SAME': label == 'SAME', 'DIFFERENT': label == 'DIFFERENT'}
        training_data.append((text, cats))
    return training_data

train_data = create_training_data(train_df)
valid_data = create_training_data(valid_df)

# --------------------- 3. Setting Up the spaCy Pipeline ---------------------

# Load a blank English model
nlp = spacy.blank("en")

# Add the text categorizer to the pipeline
if "textcat" not in nlp.pipe_names:
    textcat = nlp.add_pipe("textcat", last=True)
else:
    textcat = nlp.get_pipe("textcat")

# Add labels to the text categorizer
textcat.add_label("SAME")
textcat.add_label("DIFFERENT")

# --------------------- 4. Training the Model ---------------------

# Disable other pipelines during training to speed up
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "textcat"]
with nlp.disable_pipes(*other_pipes):
    # Initialize the optimizer
    optimizer = nlp.begin_training()
    n_iter = 10  # Number of training iterations (epochs)

    print("Starting training...")
    for epoch in range(n_iter):
        random.shuffle(train_data)
        losses = {}
        # Create minibatches using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4.0, 32.0, 1.001))
        # Initialize progress bar
        progress = tqdm(batches, total=max(1, len(train_data) // 4 + 1), desc=f"Epoch {epoch+1}/{n_iter}")
        for batch in progress:
            texts, annotations = zip(*batch)
            examples = []
            for text, cats in zip(texts, annotations):
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, {"cats": cats})
                examples.append(example)
            nlp.update(
                examples,            # Batch of Example objects
                sgd=optimizer,      # Optimizer
                drop=0.2,            # Dropout rate
                losses=losses
            )
            progress.set_postfix(loss=losses.get('textcat', 0.0))
        print(f"Epoch {epoch +1} completed. Loss: {losses['textcat']:.4f}")

# --------------------- 5. Evaluating the Model ---------------------

print("\nEvaluating the model on the validation set...")

# Prepare the validation texts and true labels
valid_texts = [text for text, _ in valid_data]
true_labels = [1 if cats['SAME'] else 0 for _, cats in valid_data]

# Get predictions
docs = list(nlp.pipe(valid_texts))
pred_labels = []
for doc in docs:
    if doc.cats['SAME'] > doc.cats['DIFFERENT']:
        pred_labels.append(1)
    else:
        pred_labels.append(0)

# Generate and print classification report
report = classification_report(true_labels, pred_labels, target_names=['Different', 'Same'])
print(report)

# --------------------- 6. Saving the Model ---------------------

# Define the path to save the model
model_path = "classical_piece_classifier"

# Save the trained model to disk
nlp.to_disk(model_path)
print(f"Model saved to {model_path}")

# --------------------- 7. Using the Model for Predictions ---------------------

# Load the saved model (optional, if you want to load later)
# nlp = spacy.load(model_path)

def predict_same_piece(piece1, piece2, nlp_model):
    """
    Predict whether two piece names refer to the same classical piece.

    Args:
        piece1 (str): Name of the first piece.
        piece2 (str): Name of the second piece.
        nlp_model (spacy.Language): The trained spaCy model.

    Returns:
        int: 1 if the same piece, 0 otherwise.
    """
    combined_text = piece1 + " ||| " + piece2
    doc = nlp_model(combined_text)
    return 1 if doc.cats['SAME'] > doc.cats['DIFFERENT'] else 0




Starting training...


Epoch 1/10:   0%|               | 226/80001 [00:01<10:49, 122.82it/s, loss=23.9]


KeyboardInterrupt: 

In [16]:
# Example usage
example_piece_a = "Horn Overture No. 2123123238724364181264 jor, Op. 146"
example_piece_b = "Horn Overture No. 2123123123234 in A-Flat Major, Op. 146"

result = predict_same_piece(example_piece_a, example_piece_b, nlp)
print("\nPrediction for the example:")
print(f"Pieces are the same: {'Yes' if result == 1 else 'No'}")


Prediction for the example:
Pieces are the same: Yes
