In [None]:
# Disabled sampling step, kept for reference only: the whole cell is a single
# string literal, so none of the code inside it is executed.
# NOTE(review): `selected_columns` is not defined in any visible cell — confirm
# where it came from before re-enabling this.
'''
# Select the first 100 rows
selected_rows = selected_columns.head(100)

# Specify the path for the new CSV file
new_file_path = 'trackfile.csv'

# Save the selected rows to a new CSV file
selected_rows.to_csv(new_file_path, index=False)

print("Selected data saved to", new_file_path)

# Read the CSV file into a DataFrame
data = pd.read_csv('trackfile.csv')

df = data.loc[:, ['track_id','title','genre_top','tag','listens']]
df
'''

In [None]:
# Run once if librosa is not installed in the kernel's environment: %pip install librosa

In [11]:
import pandas as pd
import os
import librosa
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from pymongo import MongoClient

In [30]:
# Path of the track-metadata CSV that load_and_clean_data() reads.
# (An earlier draft read the smaller sample file 'trackfile.csv' instead.)
csv_path = 'tracks.csv'

In [31]:
def load_and_clean_data(csv_path):
    """Read the metadata CSV and keep only rows with a numeric ``track_id``.

    Args:
        csv_path: Path of the CSV file; it must contain a ``track_id`` column.

    Returns:
        A DataFrame with the same columns as the file, restricted to the rows
        whose ``track_id`` could be parsed as a number, with that column cast
        to ``int``. The original row index is kept (it is not reset).
    """
    raw = pd.read_csv(csv_path, low_memory=False)

    # Anything that is not a number (header remnants, blanks, text) becomes NaN.
    numeric_ids = pd.to_numeric(raw['track_id'], errors='coerce')
    has_id = numeric_ids.notna()

    # Replace the column in place (same position) with the integer IDs.
    return raw.loc[has_id].assign(track_id=numeric_ids[has_id].astype(int))
# Load the metadata once; the following cells narrow this DataFrame down.
df = load_and_clean_data(csv_path)

In [32]:
# List the column names.
# NOTE(review): the output shows 104 columns, many of them named 'Unnamed: N' —
# presumably the CSV has more fields than named headers (e.g. a multi-row
# header); confirm before relying on column positions.
df.columns

Index(['track_id', 'comments', 'date_created', 'date_released', 'engineer',
       'favorites', 'id', 'information', 'listens', 'producer',
       ...
       'Unnamed: 94', 'Unnamed: 95', 'Unnamed: 96', 'Unnamed: 97',
       'Unnamed: 98', 'Unnamed: 99', 'Unnamed: 100', 'Unnamed: 101',
       'Unnamed: 102', 'Unnamed: 103'],
      dtype='object', length=104)

In [33]:
# Preview the first rows of the cleaned metadata.
df.head()

Unnamed: 0,track_id,comments,date_created,date_released,engineer,favorites,id,information,listens,producer,...,Unnamed: 94,Unnamed: 95,Unnamed: 96,Unnamed: 97,Unnamed: 98,Unnamed: 99,Unnamed: 100,Unnamed: 101,Unnamed: 102,Unnamed: 103
0,2,0,11/26/2008 1:44,1/5/2009 0:00,,4,1,<p></p>,6073,,...,,,,,,,,,,
1,3,0,11/26/2008 1:44,1/5/2009 0:00,,4,1,<p></p>,6073,,...,,,,,,,,,,
2,5,0,11/26/2008 1:44,1/5/2009 0:00,,4,1,<p></p>,6073,,...,,,,,,,,,,
3,10,0,11/26/2008 1:45,2/6/2008 0:00,,4,6,,47632,,...,,,,,,,,,,
4,20,0,11/26/2008 1:45,1/6/2009 0:00,,2,4,"<p> ""spiritual songs"" from Nicky Cook</p>",2710,,...,,,,,,,,,,


In [34]:
# Keep the first row of every distinct (title, genre_top, tag, listens)
# combination; 'track_id' is deliberately not part of the comparison.
# NOTE(review): rows that agree on all four columns are dropped even when their
# track_id differs. The preview after this step no longer contains track_id 3
# and 5, and the processing log later reports "No metadata found" for them —
# confirm that discarding those tracks is intended.
df = df.drop_duplicates(subset=['title', 'genre_top', 'tag', 'listens'])

In [35]:
# Remove NaN rows from string-type columns
#string_columns = ['title', 'genre_top', 'tag']
#df[string_columns] = df[string_columns].dropna()

In [36]:
# Remove NaN rows from numeric columns
#numeric_columns = ['track_id', 'listens']
#df[numeric_columns] = df[numeric_columns].dropna()

In [37]:
# Keep only the metadata columns that process_files() later stores with each track.
df = df.loc[:, ['track_id','title','genre_top','tag','listens','date_created','type','name']]

# Preview the reduced frame
df.head()

Unnamed: 0,track_id,title,genre_top,tag,listens,date_created,type,name
0,2,AWOL - A Way Of Life,Hip-Hop,['awol'],6073,11/26/2008 1:44,Album,AWOL
3,10,Constant Hitmaker,Pop,"['philly', 'kurt vile']",47632,11/26/2008 1:45,Album,Kurt Vile
4,20,Niris,,"['instrumentals', 'experimental pop', 'post pu...",2710,11/26/2008 1:45,Album,Nicky Cook
10,135,mp3,Rock,['abominog'],3331,11/26/2008 1:49,Single Tracks,Abominog
12,137,Live at LACE,Experimental,['airway'],1681,11/26/2008 1:49,Live Performance,Airway


In [38]:
# Directory that process_files() walks recursively looking for '.mp3' files.
# The location is machine-specific, so it can be overridden with the MP3_DIR
# environment variable; the default is the path used when this notebook was
# last run.
# (An earlier Windows location was
#  'C:/Users/ZARI/Downloads/Assignments/Big Data Assignments/Big data Project/data'.)
mp3_dir = os.environ.get('MP3_DIR', '/home/zari/Big Data Project/data')

# Same column selection as the previous cell; repeated so this cell also works
# when it is run on a frame that still has the extra columns.
df = df[['track_id', 'title', 'genre_top', 'tag', 'listens','date_created','type','name']]

In [42]:
def extract_features(file_path):
    """Summarise one audio file as averaged librosa descriptors.

    Args:
        file_path: Path of the audio file to load with ``librosa.load``.

    Returns:
        A dict with three entries:
        ``"mfcc"`` — list holding the mean of the MFCC matrix along axis 1
        (presumably one value per coefficient, averaged over frames);
        ``"spectral_centroid"`` — mean of the spectral-centroid array;
        ``"zero_crossing_rate"`` — mean of the zero-crossing-rate array.
        The two scalar entries are NumPy scalars as returned by ``np.mean``.
    """
    # sr=None is passed through to librosa.load (presumably: keep the file's
    # own sampling rate instead of resampling — see the librosa docs).
    signal, rate = librosa.load(file_path, sr=None)

    mfcc_frames = librosa.feature.mfcc(y=signal, sr=rate)
    centroid_frames = librosa.feature.spectral_centroid(y=signal, sr=rate)
    zcr_frames = librosa.feature.zero_crossing_rate(signal)

    summary = {}
    summary["mfcc"] = np.mean(mfcc_frames, axis=1).tolist()
    summary["spectral_centroid"] = np.mean(centroid_frames)
    summary["zero_crossing_rate"] = np.mean(zcr_frames)
    return summary

def _track_id_from_filename(filename):
    """Return the integer track ID encoded in a file name, or None.

    The ID is the text before the first '.' of the name. ``int`` already
    accepts leading zeros, so they are not stripped by hand (doing so turned
    '000000.mp3' into an empty string, which ``int`` rejects).
    """
    stem = filename.split('.')[0]
    try:
        return int(stem)
    except ValueError:
        return None


def _to_builtin(value):
    """Convert a NumPy scalar to the matching built-in Python value.

    Values read out of a DataFrame can be NumPy scalars (e.g. numpy.int64),
    which PyMongo's BSON encoder does not accept; other values pass through
    unchanged.
    """
    if isinstance(value, np.generic):
        return value.item()
    return value


def process_files(directory, df, mongo_uri='mongodb://localhost:27017/',
                  db_name='audio_features_db1', collection_name='features1'):
    """Extract audio features for every MP3 under ``directory`` and store them in MongoDB.

    For each '.mp3' file found by walking ``directory`` recursively, the track
    ID is parsed from the file name, the first row of ``df`` with that
    ``track_id`` supplies the metadata, ``extract_features`` supplies the audio
    features, and one document per file is inserted into the collection.
    Progress is reported with ``print``.

    Args:
        directory: Root folder to search for '.mp3' files.
        df: Metadata frame with the columns 'track_id', 'title', 'genre_top',
            'date_created', 'type', 'name', 'tag' and 'listens'.
        mongo_uri: Connection string of the MongoDB server.
        db_name: Name of the database to write to.
        collection_name: Name of the collection to write to.

    Notes:
        - Files whose name does not start with a numeric ID, and files without
          a metadata row, are skipped with a message instead of aborting.
        - Every call uses ``insert_one``, so running this twice on the same
          files stores each track twice.
    """
    metadata_fields = ('title', 'genre_top', 'date_created', 'type', 'name', 'tag', 'listens')

    client = MongoClient(mongo_uri)
    try:
        collection = client[db_name][collection_name]

        for root, _dirs, files in os.walk(directory):
            for filename in files:
                if not filename.endswith('.mp3'):
                    continue

                track_id = _track_id_from_filename(filename)
                if track_id is None:
                    print(f"Skipping {filename}: file name does not start with a numeric track ID")
                    continue

                matches = df[df['track_id'] == track_id]
                if matches.empty:
                    print(f"No metadata found for track ID {track_id}")
                    continue

                # Metadata is checked first so that audio decoding is only paid
                # for tracks that will actually be stored.
                row = matches.iloc[0]
                features = extract_features(os.path.join(root, filename))

                document = {'track_id': track_id}
                for field in metadata_fields:
                    document[field] = _to_builtin(row[field])
                document['features'] = features

                collection.insert_one(document)
                print(f"Inserted features for track {track_id}")
    finally:
        # Release the connection even if a file fails part-way through.
        client.close()

# Walk mp3_dir, extract features for every MP3 that has a metadata row in df,
# and store one MongoDB document per processed file.
process_files(mp3_dir, df)
print(" ")
print("All features extracted and stored in MongoDB.")

Inserted features for track 2012
Inserted features for track 2125
Inserted features for track 2097
Inserted features for track 2096
Inserted features for track 2099
No metadata found for track ID 136
Inserted features for track 140
Inserted features for track 2
No metadata found for track ID 3
No metadata found for track ID 5
Inserted features for track 10
Inserted features for track 182
Inserted features for track 139
Inserted features for track 181
No metadata found for track ID 141
Inserted features for track 148
No metadata found for track ID 134
No metadata found for track ID 1017
No metadata found for track ID 1015
Inserted features for track 1040
No metadata found for track ID 1023
No metadata found for track ID 1028
No metadata found for track ID 1018
Inserted features for track 1039
No metadata found for track ID 1016
Inserted features for track 1041
Inserted features for track 1014
Inserted features for track 1022
No metadata found for track ID 1029
 
All features extracted a