# GTZAN - Deep Learning

`Music Genre Classification Problem`. Experts have been trying for a long time to understand sound & what differentiates one from another. How to visualize sound. What makes one tone different from another.

We are going to analyze the features extracted from the GTZAN dataset and build different type of ensemble models to see how better we can differentiate one genre from another.

Our Datasets contains 10 genres:-
- Blues
- Classical
- Country
- Disco
- Hiphop
- Jazz
- Metal
- Pop
- Reggae
- Rock


# Reading & Understanding Data
## Importing Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_style('whitegrid')
%matplotlib inline
import warnings
warnings.filterwarnings('ignore')
import sklearn.metrics as skm
import sklearn.model_selection as skms
import sklearn.preprocessing as skp
import random
import librosa, IPython
import librosa.display as lplt
seed = 12
np.random.seed(seed)

### Loading Dataset

In [3]:
df = pd.read_csv('/kaggle/input/gtzan-dataset-music-genre-classification/Data/features_3_sec.csv')
df.head()

Unnamed: 0,filename,length,chroma_stft_mean,chroma_stft_var,rms_mean,rms_var,spectral_centroid_mean,spectral_centroid_var,spectral_bandwidth_mean,spectral_bandwidth_var,...,mfcc16_var,mfcc17_mean,mfcc17_var,mfcc18_mean,mfcc18_var,mfcc19_mean,mfcc19_var,mfcc20_mean,mfcc20_var,label
0,blues.00000.0.wav,66149,0.335406,0.091048,0.130405,0.003521,1773.065032,167541.630869,1972.744388,117335.771563,...,39.687145,-3.24128,36.488243,0.722209,38.099152,-5.050335,33.618073,-0.243027,43.771767,blues
1,blues.00000.1.wav,66149,0.343065,0.086147,0.112699,0.00145,1816.693777,90525.690866,2010.051501,65671.875673,...,64.748276,-6.055294,40.677654,0.159015,51.264091,-2.837699,97.03083,5.784063,59.943081,blues
2,blues.00000.2.wav,66149,0.346815,0.092243,0.132003,0.00462,1788.539719,111407.437613,2084.565132,75124.921716,...,67.336563,-1.76861,28.348579,2.378768,45.717648,-1.938424,53.050835,2.517375,33.105122,blues
3,blues.00000.3.wav,66149,0.363639,0.086856,0.132565,0.002448,1655.289045,111952.284517,1960.039988,82913.639269,...,47.739452,-3.841155,28.337118,1.218588,34.770935,-3.580352,50.836224,3.630866,32.023678,blues
4,blues.00000.4.wav,66149,0.335579,0.088129,0.143289,0.001701,1630.656199,79667.267654,1948.503884,60204.020268,...,30.336359,0.664582,45.880913,1.689446,51.363583,-3.392489,26.738789,0.536961,29.146694,blues


In [14]:
import pandas as pd

# Charger les données
df = pd.read_csv('/kaggle/input/gtzan-dataset-music-genre-classification/Data/features_3_sec.csv')

# Afficher les 5 premières lignes
print(df.head())

# Compter le nombre de chansons
num_songs = len(df)
print(f"Le jeu de données contient {num_songs} chansons.")

            filename  length  chroma_stft_mean  chroma_stft_var  rms_mean  \
0  blues.00000.0.wav   66149          0.335406         0.091048  0.130405   
1  blues.00000.1.wav   66149          0.343065         0.086147  0.112699   
2  blues.00000.2.wav   66149          0.346815         0.092243  0.132003   
3  blues.00000.3.wav   66149          0.363639         0.086856  0.132565   
4  blues.00000.4.wav   66149          0.335579         0.088129  0.143289   

    rms_var  spectral_centroid_mean  spectral_centroid_var  \
0  0.003521             1773.065032          167541.630869   
1  0.001450             1816.693777           90525.690866   
2  0.004620             1788.539719          111407.437613   
3  0.002448             1655.289045          111952.284517   
4  0.001701             1630.656199           79667.267654   

   spectral_bandwidth_mean  spectral_bandwidth_var  ...  mfcc16_var  \
0              1972.744388           117335.771563  ...   39.687145   
1              2010.05

In [13]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from collections import defaultdict

# Charger les données
df = pd.read_csv('/kaggle/input/gtzan-dataset-music-genre-classification/Data/features_3_sec.csv')

# Sélectionner les caractéristiques pertinentes
features = ['chroma_stft_mean', 'rms_mean', 'spectral_centroid_mean', 'spectral_bandwidth_mean']
X = df[features].values

# Effectuer le clustering
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(X)

# Regrouper les chansons par cluster
cluster_data = defaultdict(list)
for i, label in enumerate(labels):
    cluster_data[label].append(df['filename'].iloc[i])

# Générer les utilisateurs virtuels
def generate_virtual_users(num_users, seed=42):
    np.random.seed(seed)
    virtual_users = []
    for _ in range(num_users):
        user = {}
        user['user_id'] = f'user_{len(virtual_users)}'

        # Sélectionner des clusters préférés aléatoirement
        preferred_clusters = np.random.choice(list(cluster_data.keys()), size=np.random.randint(1, 4), replace=False)
        user['preferred_clusters'] = preferred_clusters

        # Sélectionner des chansons aléatoires dans les clusters préférés
        user_songs = []
        for cluster in preferred_clusters:
            user_songs.extend(np.random.choice(cluster_data[cluster], size=np.random.randint(5, 20), replace=False))
        user['listened_songs'] = user_songs

        # Attribuer des notes aléatoires aux chansons
        user['song_ratings'] = {song: np.random.randint(1, 6) for song in user_songs}

        virtual_users.append(user)
    return virtual_users

# Générer 10 utilisateurs virtuels
virtual_users = generate_virtual_users(10)

# Créer un tableau des utilisateurs et des chansons écoutées
all_songs = set()
for user in virtual_users:
    all_songs.update(user['listened_songs'])

user_song_table = pd.DataFrame(columns=['user_id'] + list(all_songs))

for user in virtual_users:
    user_row = {'user_id': user['user_id']}
    for song in all_songs:
        user_row[song] = user['song_ratings'].get(song, 0)
    user_song_table = user_song_table.append(user_row, ignore_index=True)

print(user_song_table)

  user_id reggae.00084.7.wav country.00039.1.wav classical.00090.0.wav  \
0  user_0                  0                   0                     0   
1  user_1                  0                   0                     0   
2  user_2                  0                   0                     0   
3  user_3                  0                   3                     0   
4  user_4                  3                   0                     0   
5  user_5                  0                   0                     0   
6  user_6                  0                   0                     2   
7  user_7                  0                   0                     0   
8  user_8                  0                   0                     0   
9  user_9                  0                   0                     0   

  rock.00070.1.wav hiphop.00064.6.wav disco.00014.1.wav disco.00058.1.wav  \
0                0                  0                 0                 0   
1                0             

In [21]:
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from collections import defaultdict

# Charger les données
df = pd.read_csv('/kaggle/input/gtzan-dataset-music-genre-classification/Data/features_3_sec.csv')

# Sélectionner les caractéristiques pertinentes
features = ['chroma_stft_mean', 'rms_mean', 'spectral_centroid_mean', 'spectral_bandwidth_mean']
X = df[features].values

# Effectuer le clustering
n_clusters = 10
kmeans = KMeans(n_clusters=n_clusters, random_state=42)
labels = kmeans.fit_predict(X)

# Regrouper les chansons par cluster
cluster_data = defaultdict(list)
for i, label in enumerate(labels):
    cluster_data[label].append(df['filename'].iloc[i])

# Générer les utilisateurs virtuels
def generate_virtual_users(num_users, seed=42):
    np.random.seed(seed)
    virtual_users = []
    for _ in range(num_users):
        user = {}
        user['user_id'] = f'user_{len(virtual_users)}'
        # Sélectionner des clusters préférés aléatoirement
        preferred_clusters = np.random.choice(list(cluster_data.keys()), size=np.random.randint(1, 4), replace=False)
        user['preferred_clusters'] = preferred_clusters
        # Sélectionner des chansons aléatoires dans les clusters préférés
        user_songs = []
        for cluster in preferred_clusters:
            user_songs.extend(np.random.choice(cluster_data[cluster], size=np.random.randint(5, 20), replace=False))
        user['listened_songs'] = user_songs
        # Attribuer des notes aléatoires aux chansons
        user['song_ratings'] = {song: np.random.randint(1, 6) for song in user_songs}
        virtual_users.append(user)
    return virtual_users

# Générer 10 utilisateurs virtuels
virtual_users = generate_virtual_users(250)

# Créer un tableau des utilisateurs et des chansons écoutées
all_songs = df['filename'].unique()
user_song_table = pd.DataFrame(columns=['user_id'] + list(all_songs))
for user in virtual_users:
    user_row = {'user_id': user['user_id']}
    for song in all_songs:
        user_row[song] = user['song_ratings'].get(song, 0)
    user_song_table = user_song_table.append(user_row, ignore_index=True)

# Compter le nombre de colonnes
num_columns = len(user_song_table.columns)
print(f"Le tableau 'user_song_table' a {num_columns} colonnes.")

Le tableau 'user_song_table' a 9991 colonnes.


In [22]:
user_song_table.to_csv('/kaggle/working/user_song_table.csv', index=False)

In [23]:
rating_counts = user_song_table.iloc[:, 1:].values.flatten()
rating_counts = pd.Series(rating_counts).value_counts()
print("Nombre de fois que chaque note est utilisée :")
print(rating_counts)

Nombre de fois que chaque note est utilisée :
0    2491743
3       1184
5       1180
4       1156
1       1122
2       1115
dtype: int64


In [24]:
import pandas as pd
import numpy as np
import random

# Liste des pays élargie avec des proportions approximatives
countries = {
    'USA': 20,
    'Canada': 10,
    'France': 15,
    'Germany': 10,
    'UK': 10,
    'Australia': 5,
    'India': 20,
    'Brazil': 10,
    'Japan': 10,
    'South Korea': 5,
    'Morocco': 15,  # Ajout du Maroc
}

# Générer une liste pondérée des pays
country_list = [country for country, weight in countries.items() for _ in range(weight)]

# Sexes (avec une distribution réaliste)
genders = ['Male', 'Female']
gender_weights = [0.45, 0.45]  # Pondération réaliste

# Noms élargis
first_names = [
    'Alex', 'Taylor', 'Jordan', 'Morgan', 'Casey', 'Chris', 'Sam', 'Jamie', 'Drew', 'Riley',  # Générique
    'Ali', 'Youssef', 'Amal', 'Amina', 'Fatima', 'Mohammed', 'Ahmed', 'Leila', 'Khadija', 'Hassan',  # Maroc
    'Jean', 'Sophie', 'Marie', 'Thomas', 'Lucas', 'Emma', 'Manon', 'Pierre', 'Julien', 'Charlotte'  # France
]
last_names = [
    'Smith', 'Johnson', 'Brown', 'Williams', 'Jones', 'Garcia', 'Davis', 'Martinez', 'Clark', 'Lopez',  # Générique
    'El Haddad', 'Benchekroun', 'Bennani', 'El Mansouri', 'Fassi', 'Chakib', 'Zemmouri', 'Raoui',  # Maroc
    'Dupont', 'Durand', 'Morel', 'Laurent', 'Simon', 'Roux', 'Fournier', 'Girard', 'Mercier'  # France
]

# Générer un mot de passe aléatoire
def generate_password(length=8):
    chars = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()"
    return ''.join(random.choices(chars, k=length))

# Génération des utilisateurs
def generate_users(num_users=250, seed=42):
    np.random.seed(seed)
    random.seed(seed)
    users = []
    for user_id in range(num_users):
        country = random.choice(country_list)  # Sélectionner un pays proportionnellement
        age = np.random.randint(18, 35) if country == 'Morocco' else np.random.randint(18, 70)
        user = {
            "user_id": f"user_{user_id}",
            "name": f"{random.choice(first_names)} {random.choice(last_names)}",
            "password": generate_password(),
            "sex": random.choices(genders, weights=gender_weights, k=1)[0],
            "age": age,
            "country": country,
        }
        users.append(user)
    return pd.DataFrame(users)

# Générer les utilisateurs
num_users = 250  # Augmentons à 500 pour une meilleure représentation
user_data = generate_users(num_users=num_users)

# Sauvegarder dans un fichier CSV
output_path = 'users.csv'
user_data.to_csv(output_path, index=False)

print(f"{num_users} utilisateurs générés et sauvegardés dans '{output_path}'.")


250 utilisateurs générés et sauvegardés dans 'users.csv'.


In [25]:
user_data.to_csv('/kaggle/working/user_datatt.csv', index=False)

In [26]:
user_data

Unnamed: 0,user_id,name,password,sex,age,country
0,user_0,Alex Roux,tq1W#gEc,Male,56,Canada
1,user_1,Hassan Smith,OZYEGu!2,Male,32,Morocco
2,user_2,Ali Clark,l&ygg9R6,Female,60,Japan
3,user_3,Charlotte Bennani,fvT@AnfV,Female,25,France
4,user_4,Pierre Martinez,!BG8lzWY,Female,38,Canada
...,...,...,...,...,...,...
245,user_245,Fatima Benchekroun,QJ3o#2ra,Male,37,UK
246,user_246,Leila Benchekroun,TXay2B1w,Male,53,UK
247,user_247,Amina Chakib,*(kudq8D,Female,36,UK
248,user_248,Thomas Brown,PKTCQeVj,Male,20,Morocco
