In [None]:
import os
import librosa
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler

from joblib import dump, load


import warnings
import sys
import contextlib
from ast import literal_eval

from imblearn.over_sampling import SMOTE




In [None]:
def list_files_with_paths(base_path):
    """Index every ``.mp3`` file found one level below ``base_path``.

    Each immediate sub-directory of ``base_path`` is scanned (not
    recursively) and every file whose name ends in ``.mp3`` is recorded.
    Files lying directly in ``base_path`` are ignored.

    Returns:
        dict mapping a bare file name to the list of full paths at which
        that name was found.
    """
    paths_by_name = {}
    for entry in os.listdir(base_path):
        folder = os.path.join(base_path, entry)
        if not os.path.isdir(folder):
            continue
        for track in os.listdir(folder):
            if track.endswith('.mp3'):
                paths_by_name.setdefault(track, []).append(os.path.join(folder, track))
    return paths_by_name

# Root folders of the two music collections.
fifa_base_path = '/Users/vidipkhattar/Developer/YT_FIFA_soundtrack_fetch/fifa_music'
non_fifa_base_path = '/Users/vidipkhattar/Developer/YT_FIFA_soundtrack_fetch/non_fifa_music'

# Index the .mp3 files of each collection by bare file name.
fifa_files_with_paths = list_files_with_paths(fifa_base_path)
non_fifa_files_with_paths = list_files_with_paths(non_fifa_base_path)

# File names that occur under both roots.
common_files = set(fifa_files_with_paths) & set(non_fifa_files_with_paths)

if not common_files:
    print("No common files found.")
else:
    print(f"Common files found: {len(common_files)}")
    for file in common_files:
        print(f"\nFile: {file}")
        # Report where the shared name lives in each collection.
        for heading, lookup in (("FIFA paths:", fifa_files_with_paths),
                                ("Non-FIFA paths:", non_fifa_files_with_paths)):
            print(heading)
            for path in lookup[file]:
                print(f" - {path}")


In [None]:
# Silence every Python warning from this point on.
warnings.filterwarnings("ignore")


@contextlib.contextmanager
def suppress_stderr():
    """Context manager that points ``sys.stderr`` at ``os.devnull``.

    While the managed block runs, anything written through ``sys.stderr``
    is discarded. The previous ``sys.stderr`` object is put back on exit,
    including when the block raises.
    """
    with open(os.devnull, 'w') as sink, contextlib.redirect_stderr(sink):
        yield

In [None]:
def preprocess_audio(file_path, target_sr=22050):
    """Load an audio file with librosa, resampled to ``target_sr``.

    ``sys.stderr`` is redirected to the null device while the file is
    being loaded.

    Args:
        file_path: path of the audio file to load.
        target_sr: sampling rate passed to ``librosa.load`` as ``sr``.

    Returns:
        The ``(samples, sampling_rate)`` pair returned by ``librosa.load``.
    """
    with suppress_stderr():
        waveform, sample_rate = librosa.load(file_path, sr=target_sr)
    return waveform, sample_rate

In [None]:
"""

def extract_features(file_path):
    features = []
    # Extract MFCCs
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
    mfccs_mean = np.mean(mfccs.T, axis=0)
    features.extend(mfccs_mean)
    # Extract Chroma
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    chroma_mean = np.mean(chroma.T, axis=0)
    features.extend(chroma_mean)
    # Extract Spectral Contrast
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)
    spectral_contrast_mean = np.mean(spectral_contrast.T, axis=0)
    features.extend(spectral_contrast_mean)
    return np.array(features)
    
"""


def extract_features(file_path):
    """Compute the audio features used by the classifier for one file.

    Args:
        file_path: path of the audio file; it is loaded via
            ``preprocess_audio``.

    Returns:
        dict with these keys, in this order:
            ``mfcc_mean``, ``chroma_mean``, ``spectral_contrast_mean``:
                mean over axis 1 of the corresponding feature matrix.
            ``mfcc``, ``chroma``, ``spectral_contrast``:
                the full feature matrices as returned by librosa.
            ``tempo``:
                first element of librosa's tempo estimate.
    """
    y, sr = preprocess_audio(file_path)

    # Each spectral feature is computed once and reused for both the raw
    # matrix and its mean; these calls dominate the per-file cost.
    mfcc = librosa.feature.mfcc(y=y, sr=sr)
    chroma = librosa.feature.chroma_stft(y=y, sr=sr)
    spectral_contrast = librosa.feature.spectral_contrast(y=y, sr=sr)

    # librosa 0.10 exposes the tempo estimator as librosa.feature.tempo and
    # deprecates librosa.beat.tempo; prefer the new location when present
    # and fall back to the old one on earlier releases.
    estimate_tempo = getattr(librosa.feature, 'tempo', None) or librosa.beat.tempo

    features = {
        'mfcc_mean': np.mean(mfcc, axis=1),
        'chroma_mean': np.mean(chroma, axis=1),
        'spectral_contrast_mean': np.mean(spectral_contrast, axis=1),
        'mfcc': mfcc,
        'chroma': chroma,
        'spectral_contrast': spectral_contrast,
        'tempo': estimate_tempo(y=y, sr=sr)[0]
    }
    return features


In [None]:
# Paths to your audio folders
fifa_base_path = '/Users/vidipkhattar/Developer/YT_FIFA_soundtrack_fetch/fifa_music/'
non_fifa_base_path = '/Users/vidipkhattar/Developer/YT_FIFA_soundtrack_fetch/non_fifa_music'

def collect_features(base_path, label):
    """Extract features for every ``.mp3`` one folder level below ``base_path``.

    Every immediate sub-directory of ``base_path`` is scanned. Each ``.mp3``
    in it is passed to ``extract_features``; files that fail are reported
    and skipped so one bad file does not abort the whole run.

    Args:
        base_path: root folder containing the sub-folders to scan.
        label: value written to the ``label`` column of every row.

    Returns:
        pandas.DataFrame with one row per successfully processed file (the
        keys returned by ``extract_features`` as columns) plus a ``label``
        column.
    """
    sub_folders = [
        name for name in os.listdir(base_path)
        if os.path.isdir(os.path.join(base_path, name))
    ]

    # Count only the files that will actually be processed, so the progress
    # message is not inflated by non-audio files.
    total_files = sum(
        1
        for year_folder in sub_folders
        for file_name in os.listdir(os.path.join(base_path, year_folder))
        if file_name.endswith('.mp3')
    )

    data = []
    file_count = 0
    for year_folder in sub_folders:
        year_path = os.path.join(base_path, year_folder)
        for file_name in os.listdir(year_path):
            file_path = os.path.join(year_path, file_name)
            print(year_folder +"/"+file_name)
            if not file_name.endswith('.mp3'):
                continue
            try:
                features = extract_features(file_path)
            except Exception as exc:
                # Best-effort: report the failure instead of hiding it, and
                # let KeyboardInterrupt / SystemExit propagate.
                print(f"Skipping {file_path}: {exc!r}")
                continue
            data.append(features)
            file_count += 1
            print(f"Processed {file_count}/{total_files} files")

    df = pd.DataFrame(data)
    df['label'] = label
    return df

# Extract features for the FIFA tracks (label 1) and cache them as CSV.
print(fifa_base_path)
fifa_data = collect_features(base_path=fifa_base_path, label=1)
print(fifa_data)
fifa_data.to_csv('fifa_data.csv')

# Same for the non-FIFA tracks (label 0).
non_fifa_data = collect_features(base_path=non_fifa_base_path, label=0)
non_fifa_data.to_csv('non_fifa_data.csv')







In [None]:
# Reload the two cached feature tables from disk.
fifa_df, non_fifa_df = (
    pd.read_csv(csv_name) for csv_name in ('fifa_data.csv', 'non_fifa_data.csv')
)

# Stack both tables into one frame with a fresh 0..n-1 index.
df = pd.concat([fifa_df, non_fifa_df], ignore_index=True)

def string_to_array(string):
    """Parse a whitespace-separated, bracketed list of numbers into a 1-D array.

    Leading and trailing ``[`` / ``]`` characters are stripped, the rest is
    split on whitespace and every token is converted with ``float``.
    """
    tokens = string.strip('[]').split()
    return np.array([float(token) for token in tokens])

# Columns that come back from the CSV as stringified arrays.
columns_to_convert = ['mfcc_mean', 'chroma_mean', 'spectral_contrast_mean']

# Parse each stringified array back into a numpy array.
for col in columns_to_convert:
    print(col)
    df[col] = df[col].apply(string_to_array)

# Sanity check: report the element type now stored in each converted column.
for col in columns_to_convert:
    print(f"{col} type after conversion: {type(df[col].iloc[0])}")

print(df.head())

# The full feature matrices are not used as model inputs.
columns_to_drop = ['mfcc', 'chroma', 'spectral_contrast']
df = df.drop(columns=columns_to_drop)

# Expand every array column into one scalar column per element, named
# "<original column>_<position>".
flattened_dfs = []
for col in columns_to_convert:
    expanded = df[col].apply(pd.Series)
    expanded.columns = [f'{col}_{j}' for j in range(expanded.shape[1])]
    flattened_dfs.append(expanded)

# Remaining columns first, then the expanded blocks in conversion order.
remaining_columns = df.drop(columns_to_convert, axis=1)
flattened_df = pd.concat([remaining_columns, *flattened_dfs], axis=1)

# Check the resulting DataFrame
print(flattened_df.dtypes)

# Features are everything except the saved CSV index and the label.
X = flattened_df.drop(['Unnamed: 0', 'label'], axis=1).values
y = flattened_df['label'].values

print(X)
print(y)


In [None]:
song_path = '/Users/vidipkhattar/Developer/FifaSoundtrack-classifier'

# File names of the successfully processed songs, in the same order as the
# rows of the DataFrame returned by collect_features_2.
names = []

def collect_features_2(base_path):
    """Extract features for every ``.mp3`` lying directly in ``base_path``.

    Each file is passed to ``extract_features``. Files that fail are
    reported with their name and the error, then skipped. The name of every
    successfully processed file is appended to the module-level ``names``
    list.

    Args:
        base_path: folder to scan (not recursive).

    Returns:
        pandas.DataFrame with one row per successfully processed file.
    """
    data = []
    for file_name in os.listdir(base_path):
        if not file_name.endswith('.mp3'):
            continue
        file_path = os.path.join(base_path, file_name)
        try:
            features = extract_features(file_path)
        except Exception as exc:
            # Best-effort: say which file failed and why, then move on.
            print(f"Failed to process {file_name}: {exc!r}")
            continue
        # Append to both containers together so rows and names stay aligned.
        data.append(features)
        names.append(file_name)
        print(f"Processed {file_name}")

    df = pd.DataFrame(data)
    return df

def string_to_array(string):
    """Parse a whitespace-separated, bracketed list of numbers into a 1-D array.

    Leading and trailing ``[`` / ``]`` characters are stripped, the rest is
    split on whitespace and every token is converted with ``float``.
    """
    tokens = string.strip('[]').split()
    return np.array([float(token) for token in tokens])

# Extract features for every .mp3 lying directly in song_path.
df = collect_features_2(song_path)

columns_to_convert = ['mfcc_mean', 'chroma_mean', 'spectral_contrast_mean']
columns_to_drop = ['mfcc', 'chroma', 'spectral_contrast']

# The full feature matrices are not used as model inputs.
df = df.drop(columns=columns_to_drop)

print(df)
print(df.dtypes)

# Expand every array column into one scalar column per element, named
# "<original column>_<position>".
flattened_dfs = []
for col in columns_to_convert:
    expanded = df[col].apply(pd.Series)
    expanded.columns = [f'{col}_{j}' for j in range(expanded.shape[1])]
    flattened_dfs.append(expanded)

# Remaining columns first, then the expanded blocks in conversion order.
remaining_columns = df.drop(columns_to_convert, axis=1)
flattened_df = pd.concat([remaining_columns, *flattened_dfs], axis=1)




In [None]:
# Hold out 20% of the labelled songs for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model_path = 'random_forest_model.joblib'
scaler_path = 'scaler.joblib'

# Standardize the features; the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Apply SMOTE to the training data only, so the test split keeps its
# original class distribution.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train_scaled, y_train)

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_smote, y_train_smote)

# Persist the fitted model together with the scaler it was trained with.
dump(model, model_path)
dump(scaler, scaler_path)

# Evaluate on the held-out split. These two values were previously printed
# without ever being computed, which raised NameError.
y_test_pred = model.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_test_pred)
report = classification_report(y_test, y_test_pred)

print(f'Accuracy: {accuracy}')
print(f'Classification Report:\n{report}')


In [None]:
# Scale the new songs with the statistics learned from the training data.
# The scaler must NOT be refitted here: refitting standardises the new songs
# against themselves, so the model sees inputs on a different scale than it
# was trained on (a single song would collapse to all zeros).
# The scaler was fitted on a plain array, so pass a plain array here as well.
X_scaled = scaler.transform(flattened_df.to_numpy())

y_pred = model.predict(X_scaled)
print(y_pred)

# Get the predicted probabilities
predicted_probabilities = model.predict_proba(X_scaled)

# predict_proba columns follow model.classes_; look up the column of the
# FIFA class (label 1) instead of assuming its position.
fifa_class_index = list(model.classes_).index(1)
predicted_probability_class_1 = predicted_probabilities[:, fifa_class_index]

# Output the results; names[i] is the file that produced row i.
for i, probability in enumerate(predicted_probability_class_1):
    print(" ")
    print(names[i])
    print(f"Predicted Probability for song {i+1} being a FIFA song: {probability * 100:.2f}%")

# You can also use a threshold to convert probabilities into binary predictions
threshold = 0.5
predicted_class = (predicted_probability_class_1 > threshold).astype(int)

# Output the predicted classes
print(f"Predicted Classes: {predicted_class}")