In [None]:
import os
import json
import matplotlib.pyplot as plt
from collections import defaultdict, Counter


In [None]:
def aggregate_genre_data(json_dir):
    """Tally genre tags across every ``*.json`` file in a directory.

    Each file is expected to hold a JSON object; genres are read from its
    ``spotify_metadata.genre`` entry.

    Args:
        json_dir: Path of the directory to scan (non-recursive).

    Returns:
        collections.Counter mapping each genre to the number of times it was
        listed. Files without genre information contribute nothing.
    """
    genre_counter = Counter()

    # Sorted so the Counter's insertion order (and anything plotted from it)
    # does not depend on the filesystem's listing order.
    for json_file in sorted(os.listdir(json_dir)):
        if not json_file.endswith('.json'):
            continue

        file_path = os.path.join(json_dir, json_file)
        # Explicit encoding: the platform default is not UTF-8 everywhere.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)

        # `or` also covers keys that are present but set to null.
        metadata = data.get('spotify_metadata') or {}
        genres = metadata.get('genre') or []
        if isinstance(genres, str):
            # A bare string would otherwise be counted one character at a time.
            genres = [genres]
        genre_counter.update(genres)

    return genre_counter

# Directory holding the per-song JSON files.
json_output_dir = 'archive-new/json'  # Update with your directory path

# Genre -> occurrence tally over every JSON file in that directory.
genre_data = aggregate_genre_data(json_dir=json_output_dir)


In [None]:
def plot_genre_distribution(genre_data, top_n=None):
    """Draw a bar chart of song counts per genre, most frequent genre first.

    Args:
        genre_data: Mapping of genre name to count (for example the Counter
            returned by ``aggregate_genre_data``).
        top_n: If given, only the ``top_n`` most frequent genres are drawn.
            The default (None) draws every genre.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Rank by count so the chart reads left-to-right from most to least
    # common; sorted() is stable, so ties keep their original order.
    ranked = sorted(genre_data.items(), key=lambda item: item[1], reverse=True)
    if top_n is not None:
        ranked = ranked[:top_n]

    genres = [genre for genre, _ in ranked]
    song_counts = [count for _, count in ranked]

    plt.figure(figsize=(12, 6))
    plt.bar(genres, song_counts, color='skyblue')
    plt.xlabel('Genres')
    plt.ylabel('Number of Songs')
    plt.title('Distribution of Genres Across Songs')
    # Right-align rotated labels so each one ends under its own bar.
    plt.xticks(rotation=45, ha='right')
    plt.tight_layout()
    plt.show()

# Render the genre tally computed by aggregate_genre_data above.
plot_genre_distribution(genre_data=genre_data)


In [None]:
import pandas as pd
import json
from collections import Counter
from pathlib import Path

# Load JSON files into a DataFrame (one row per file, nested keys flattened
# into dotted column names by json_normalize).
genre_column = 'spotify_metadata.genre'
json_files = sorted(Path('archive-new/json').glob('*.json'))  # sorted: stable row order
data = []

for file in json_files:
    with open(file, 'r', encoding='utf-8') as f:
        data.append(json.load(f))

df = pd.json_normalize(data)

# Analysis of genres
genre_counts = Counter()
if genre_column in df.columns:
    for genres in df[genre_column]:
        if isinstance(genres, str):
            # A bare string would otherwise be counted character by character.
            genre_counts[genres] += 1
        elif isinstance(genres, (list, tuple)):
            genre_counts.update(genres)
        # Anything else (e.g. NaN for songs without genre data) is skipped.

print("Genre Counts:\n", genre_counts)

# Average Spotify metadata by genre
if genre_column in df.columns:
    # numeric_only=True: the frame also holds text/list columns, which
    # GroupBy.mean rejects with a TypeError on pandas >= 2.0.
    genre_analysis = (
        df.explode(genre_column)
        .groupby(genre_column)
        .mean(numeric_only=True)
    )
else:
    # No files loaded, or none of them carried genre metadata.
    genre_analysis = pd.DataFrame()
print("Genre Analysis:\n", genre_analysis)


In [None]:
from textblob import TextBlob

def analyze_sentiment(lyric_list):
    """Return the TextBlob polarity of one lyric section.

    Args:
        lyric_list: The section's lyrics, either as an iterable of line
            strings (joined with single spaces) or as one string. Missing
            values (None, NaN) and empty inputs are accepted.

    Returns:
        ``TextBlob(...).sentiment.polarity`` for the section text, or None
        when there are no lyrics to score.
    """
    # DataFrame cells for songs lacking this section are NaN (a float), which
    # is truthy and would otherwise reach ' '.join() and raise TypeError.
    if lyric_list is None or isinstance(lyric_list, float):
        return None
    if not lyric_list:
        return None

    if isinstance(lyric_list, str):
        # Joining a string would insert a space between every character.
        lyrics = lyric_list
    else:
        lyrics = ' '.join(lyric_list)
    return TextBlob(lyrics).sentiment.polarity

# Score every lyric section independently and store each result in its own
# "<section>_sentiment" column.
for part in ('Verse', 'Chorus', 'Pre-Chorus', 'Bridge'):
    section_lyrics = df[f'lyrics.{part}']
    df[f'{part}_sentiment'] = section_lyrics.apply(analyze_sentiment)

# Quick look at the verse/chorus scores next to the song titles.
sentiment_preview = df[['song_title', 'Verse_sentiment', 'Chorus_sentiment']]
print(sentiment_preview)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Select the Spotify audio-feature columns to correlate.
feature_names = ['energy', 'danceability', 'loudness', 'tempo', 'valence']
feature_columns = [f'spotify_metadata.{name}' for name in feature_names]
spotify_features = df[feature_columns]

# Pairwise correlation between those features.
corr_matrix = spotify_features.corr()

# Annotated heatmap of the correlation matrix.
sns.heatmap(corr_matrix, annot=True)
plt.show()


In [None]:
import json
import os

# Directory containing JSON files
json_files_dir = 'archive-new/json'

# List to store MIDI file paths
midi_file_paths = []

# Iterate over all JSON files in the directory. Sorted so the resulting path
# list (and the processing order downstream) is the same on every run.
for file in sorted(os.listdir(json_files_dir)):
    if file.endswith('.json'):
        # Explicit UTF-8, matching how the same files are read into the
        # DataFrame; the platform default encoding is not UTF-8 everywhere.
        with open(os.path.join(json_files_dir, file), 'r', encoding='utf-8') as json_file:
            data = json.load(json_file)
            midi_file_path = data.get('midi_file_path')
            # Skip songs whose record has no (or an empty) MIDI path.
            if midi_file_path:
                midi_file_paths.append(midi_file_path)


In [None]:
from music21 import converter, instrument, note, chord, tempo, key
from tqdm import tqdm

def extract_midi_features(midi_file):
    """Parse one MIDI file with music21 and summarise its musical content.

    Args:
        midi_file: Path of the MIDI file, passed to ``converter.parse``.

    Returns:
        On success, a dict with:
            'file': the path that was passed in,
            'key': ``str()`` of the result of ``analyze('key')``,
            'tempo': ``.number`` of the first metronome-mark boundary, or
                None when the score has none,
            'instruments': ``str()`` of each non-empty part's instrument,
            'notes': pitch of every Note, as strings,
            'chords': each Chord's normal order joined with '.'.
        On failure (while parsing or analysing), a dict with 'error' (the
        exception message) and 'file'.
    """
    # The whole extraction is guarded, not only the parse: an analysis error
    # on one file should be recorded for that file instead of aborting the
    # batch that calls this function.
    try:
        midi_data = converter.parse(midi_file)

        # Extract key
        key_signature = midi_data.analyze('key')

        # Extract tempo
        try:
            midi_tempo = midi_data.metronomeMarkBoundaries()[0][2].number
        except IndexError:
            midi_tempo = None

        # Extract instruments
        instruments = []
        parts = instrument.partitionByInstrument(midi_data)
        if parts:  # partitionByInstrument may return None
            for part in parts:
                if part:
                    instruments.append(str(part.getInstrument()))

        # Extract notes and chords
        notes = []
        chords = []
        for element in midi_data.recurse():
            if isinstance(element, note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                chords.append('.'.join(str(n) for n in element.normalOrder))
    except Exception as e:
        return {'error': str(e), 'file': midi_file}

    return {
        # Keep the source path so each row can be traced back to its file.
        'file': midi_file,
        'key': str(key_signature),
        'tempo': midi_tempo,
        'instruments': instruments,
        'notes': notes,
        'chords': chords
    }

# Run the extractor over every collected MIDI path, with a progress bar.
midi_features = list(
    map(extract_midi_features, tqdm(midi_file_paths, desc='Processing MIDI files'))
)

# One row per MIDI file, one column per extracted feature.
midi_df = pd.DataFrame(midi_features)


In [None]:
import os
import json
from music21 import converter, instrument, note, chord, key
from tqdm import tqdm

def extract_midi_features(midi_file):
    """Parse one MIDI file with music21 and summarise its musical content.

    Args:
        midi_file: Path of the MIDI file, passed to ``converter.parse``.

    Returns:
        On success, a dict with:
            'file': the path that was passed in,
            'key': ``str()`` of the result of ``analyze('key')``,
            'tempo': ``.number`` of the first metronome-mark boundary, or
                None when the score has none,
            'instruments': ``str()`` of each non-empty part's instrument,
            'notes': pitch of every Note, as strings,
            'chords': each Chord's normal order joined with '.'.
        On failure (while parsing or analysing), a dict with 'error' (the
        exception message) and 'file'.
    """
    # Guard the analysis as well as the parse. The calling loop records the
    # returned dict per file, so a file that cannot be analysed is stored as
    # an error entry instead of crashing the run and being retried forever.
    try:
        midi_data = converter.parse(midi_file)

        # Extract key
        key_signature = midi_data.analyze('key')

        # Extract tempo
        try:
            midi_tempo = midi_data.metronomeMarkBoundaries()[0][2].number
        except IndexError:
            midi_tempo = None

        # Extract instruments
        instruments = []
        parts = instrument.partitionByInstrument(midi_data)
        if parts:  # partitionByInstrument may return None
            for part in parts:
                if part:
                    instruments.append(str(part.getInstrument()))

        # Extract notes and chords
        notes = []
        chords = []
        for element in midi_data.recurse():
            if isinstance(element, note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                chords.append('.'.join(str(n) for n in element.normalOrder))
    except Exception as e:
        return {'error': str(e), 'file': midi_file}

    return {
        'file': midi_file,
        'key': str(key_signature),
        'tempo': midi_tempo,
        'instruments': instruments,
        'notes': notes,
        'chords': chords
    }

def save_to_json(data, file_path):
    """Write ``data`` to ``file_path`` as indented JSON without risking the old file.

    The JSON is first written to ``<file_path>.tmp`` and then moved over the
    target with ``os.replace``. If serialisation fails or the write is
    interrupted, the previous contents of ``file_path`` are left untouched
    rather than truncated.

    Args:
        data: JSON-serialisable object.
        file_path: Destination path.

    Raises:
        TypeError: If ``data`` is not JSON-serialisable (from ``json.dump``).
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = f'{os.fspath(file_path)}.tmp'
    try:
        with open(tmp_path, 'w', encoding='utf-8') as f:
            json.dump(data, f, indent=4)
        os.replace(tmp_path, file_path)
    except BaseException:
        # Includes KeyboardInterrupt (stopping the cell): drop the partial
        # temp file and let the error propagate.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise

def load_processed_files(file_path):
    """Read previously saved results from a JSON file.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        The decoded JSON content, or an empty dict when the file does not
        exist.
    """
    if not os.path.exists(file_path):
        return {}

    with open(file_path, 'r') as cache_file:
        cached = json.load(cache_file)
    return cached

# midi_file_paths comes from the cell that collects paths from the JSON
# records. To run this cell on its own, define it first, e.g.:
# midi_file_paths = ['path_to_your_midi_files']

# Resume from the on-disk cache when there is one.
processed_files = load_processed_files('processed_files.json')

# Anything other than a path -> features mapping is unusable; start fresh.
if not isinstance(processed_files, dict):
    processed_files = {}

# Extract features for every path not already cached.
for midi_file in tqdm(midi_file_paths, desc='Processing MIDI files'):
    if midi_file in processed_files:
        continue
    features = extract_midi_features(midi_file)
    processed_files[midi_file] = features
    # Persist after every file so an interrupted run loses at most one result.
    save_to_json(processed_files, 'processed_files.json')



In [None]:
import pandas as pd
# One row per cached entry; the dict keys (file paths) are not used as the index.
midi_df = pd.DataFrame([entry for entry in processed_files.values()])

In [None]:
# Inspect the column labels of midi_df.
midi_df.columns

In [None]:
# Preview the first 10 rows of midi_df.
midi_df.head(10)

In [None]:
import json
from music21 import converter, instrument, note, chord, tempo, key
from tqdm import tqdm
import os

# File to keep track of processed files
processed_files_path = 'processed_files.json'

# Function to extract MIDI features
def extract_midi_features(midi_file):
    """Parse one MIDI file with music21 and summarise its musical content.

    Args:
        midi_file: Path of the MIDI file, passed to ``converter.parse``.

    Returns:
        On success, a dict with:
            'file': the path that was passed in,
            'key': ``str()`` of the result of ``analyze('key')``,
            'tempo': ``.number`` of the first metronome-mark boundary, or
                None when the score has none,
            'instruments': ``str()`` of each non-empty part's instrument,
            'notes': pitch of every Note, as strings,
            'chords': each Chord's normal order joined with '.'.
        On failure (while parsing or analysing), a dict with 'error' (the
        exception message) and 'file'.
    """
    # The whole extraction is guarded, not only the parse: an analysis error
    # on one file should be recorded for that file instead of aborting the
    # loop that calls this function.
    try:
        midi_data = converter.parse(midi_file)

        # Extract key
        key_signature = midi_data.analyze('key')

        # Extract tempo
        try:
            midi_tempo = midi_data.metronomeMarkBoundaries()[0][2].number
        except IndexError:
            midi_tempo = None

        # Extract instruments
        instruments = []
        parts = instrument.partitionByInstrument(midi_data)
        if parts:  # partitionByInstrument may return None
            for part in parts:
                if part:
                    instruments.append(str(part.getInstrument()))

        # Extract notes and chords
        notes = []
        chords = []
        for element in midi_data.recurse():
            if isinstance(element, note.Note):
                notes.append(str(element.pitch))
            elif isinstance(element, chord.Chord):
                chords.append('.'.join(str(n) for n in element.normalOrder))
    except Exception as e:
        return {'error': str(e), 'file': midi_file}

    return {
        # Keep the source path so each row can be traced back to its file.
        'file': midi_file,
        'key': str(key_signature),
        'tempo': midi_tempo,
        'instruments': instruments,
        'notes': notes,
        'chords': chords
    }

# Load the feature cache if it exists. It is kept as a mapping of
# MIDI path -> extracted features, the same layout the other resumable
# processing cell writes to this file, so results survive a kernel restart.
if os.path.exists(processed_files_path):
    with open(processed_files_path, 'r') as cache_file:
        processed_files = json.load(cache_file)
else:
    processed_files = {}

# A cache in any other shape (e.g. a bare list of paths) carries no features
# and cannot be resumed from, so those files are simply processed again.
if not isinstance(processed_files, dict):
    processed_files = {}

# Process MIDI files that are not cached yet
for file in tqdm(midi_file_paths, desc='Processing MIDI files'):
    if file not in processed_files:
        processed_files[file] = extract_midi_features(file)
        # Save after every file so an interrupted run can pick up where it
        # stopped. The handle gets its own name so the loop variable `file`
        # is not shadowed.
        with open(processed_files_path, 'w') as cache_file:
            json.dump(processed_files, cache_file)

# Rebuild the feature list from the cache, in midi_file_paths order, instead
# of appending to a list left over from another cell (which required that
# cell to have run and duplicated its rows).
midi_features = [processed_files[path] for path in midi_file_paths]

# Convert to DataFrame
midi_df = pd.DataFrame(midi_features)


In [None]:
# Function to extract nested data
def extract_nested_data(df, column):
    """Expand one column of nested values into a DataFrame of its own.

    Args:
        df: Source DataFrame.
        column: Name of the column whose cells are converted with
            ``pd.Series`` (e.g. dicts become one column per key).

    Returns:
        The result of applying ``pd.Series`` to every cell of ``df[column]``,
        indexed like ``df``.
    """
    nested_values = df[column]
    expanded = nested_values.apply(pd.Series)
    return expanded

# Expand the two nested columns into frames of their own.
df_midi_features = extract_nested_data(df, 'midi_features')
df_lyrics = extract_nested_data(df, 'lyrics')

# Attach the expanded columns to the main DataFrame, one join at a time.
# NOTE(review): this rebinds `df`, so re-running the cell joins onto the
# already-joined frame — confirm that is acceptable before re-executing.
df = df.join(df_midi_features)
df = df.join(df_lyrics)


In [None]:
# Preview the first rows of df after the nested columns were joined in.
df.head()

In [None]:
from textblob import TextBlob

def analyze_sentiment(lyrics):
    """Return the TextBlob polarity of one lyric section.

    Args:
        lyrics: The section text as a string, or a list/tuple of line
            strings (joined with single spaces; non-string items are
            ignored). Any other value, including None and NaN, is treated
            as "no lyrics".

    Returns:
        ``TextBlob(...).sentiment.polarity`` for the text, or None when
        there is no text to score.
    """
    # The other sentiment cell in this notebook treats lyric sections as
    # lists of lines; accept that shape too, so such cells are scored
    # instead of all coming back as None.
    if isinstance(lyrics, (list, tuple)):
        lyrics = ' '.join(line for line in lyrics if isinstance(line, str))

    # Type check first so non-string cells never have their truth value taken.
    if isinstance(lyrics, str) and lyrics:
        return TextBlob(lyrics).sentiment.polarity
    return None

# Lyric sections to score; add other parts as needed.
lyric_sections = [
    'Verse 1',
    'Pre-Chorus 1',
    'Chorus 1',
    'Verse 2',
    'Pre-Chorus 2',
    'Bridge',
    'Pre-Chorus 3',
    'Chorus 2',
    'Verse 3',
    'Verse 4',
]

# Store each section's polarity in a matching "<section>_sentiment" column.
for part in lyric_sections:
    section_text = df[part]
    df[f'{part}_sentiment'] = section_text.apply(analyze_sentiment)


In [None]:
# Preview the first 10 rows of df, including the new *_sentiment columns.
df.head(10)

In [None]:
# explode() puts every element of each 'instruments' cell on its own row;
# value_counts() then tallies how often each instrument appears.
instrument_names = df['instruments'].explode()
instrument_counts = instrument_names.value_counts()

# Bar chart of the tallies.
instrument_counts.plot.bar()
plt.title('Frequency of Instruments')
plt.show()
