In [3]:
import pandas as pd
import sklearn.metrics
from langdetect import DetectorFactory, detect
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

In [4]:
# Load the training data from the CSV file into a DataFrame
data = pd.read_csv(filepath_or_buffer='itamplify_train.csv')

In [5]:
# Detect the language of every track name and store it in a new column.
# langdetect is randomized by default, so the same title can receive a
# different label from one run to the next; pinning the seed before the first
# detection keeps this feature reproducible.
DetectorFactory.seed = 0


def _detect_language(text):
    """Return the language code detected for ``text``, or "Unknown" on failure."""
    try:
        return detect(text)
    except Exception:
        # Best-effort fallback for when language detection fails.
        # NOTE(review): presumably missing or non-text titles - confirm.
        # `Exception` (unlike a bare `except`) still lets KeyboardInterrupt
        # stop this slow row-by-row pass.
        return "Unknown"


# Iterate over the column's values instead of looking each row up by label:
# a failed label lookup (non-default index) raised KeyError inside the old
# try block and was silently recorded as "Unknown".
language = [_detect_language(title) for title in data['track_name']]

data['language'] = language

In [6]:
# Language codes that must each end up as exactly one indicator column,
# in this order
languages = [
    'en', 'pt', 'it', 'hr', 'vi', 'id', 'hu', 'ro', 'es', 'et', 'nl',
    'sw', 'no', 'fr', 'tl', 'cs', 'so', 'tr', 'ca', 'da', 'sk', 'de',
    'fi', 'sl', 'pl', 'sv', 'cy', 'lt', 'af', 'Unknown', 'lv', 'ja',
    'sq', 'ru', 'ko', 'uk', 'mk', 'zh-cn', 'zh-tw',
]

# One-hot encode the detected language
language_dummies = pd.get_dummies(data['language'])

# Add an all-zero column for every listed code that never occurs in the data
absent_codes = [code for code in languages if code not in language_dummies.columns]
language_dummies = language_dummies.assign(**{code: 0 for code in absent_codes})

# Keep only the listed codes, arranged in the order given by `languages`
language_dummies = language_dummies.loc[:, languages]

# Append the indicator columns to the DataFrame
data = pd.concat([data, language_dummies], axis=1)

In [7]:
# Drop these columns by name; 'language' has already been expanded into
# indicator columns, so the raw label is no longer needed.
# NOTE(review): the other columns look like identifiers/free text - confirm.
columns_to_drop = ['X', 'track_id', 'artists', 'album_name', 'track_name', 'language']
data = data.drop(columns=columns_to_drop)

In [8]:
# First split: hold out 20% of the rows as the test set, stratified by genre
train, test = train_test_split(
    data, test_size=0.2, random_state=42, stratify=data['track_genre']
)

# Second split: carve 20% of the remaining rows off as a validation set
# NOTE(review): `val` is not used in the cells visible here - confirm it is
# consumed further down.
train, val = train_test_split(
    train, test_size=0.2, random_state=42, stratify=train['track_genre']
)

# Separate the target column from the feature columns
y_train, y_test = train['track_genre'], test['track_genre']
X_train, X_test = (part.drop(columns=['track_genre']) for part in (train, test))



In [9]:
# Random forest: fit on the training split, then predict the test split
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(X_train, y_train)
predictions = rfc.predict(X_test)

# Accuracy takes no averaging argument; precision, recall and F1 are all
# computed with the 'weighted' average
accuracy = accuracy_score(y_test, predictions)
precision, recall, f1 = (
    scorer(y_test, predictions, average='weighted')
    for scorer in (precision_score, recall_score, f1_score)
)

# Print one "<label> <value>" line per summary metric
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1 Score:", f1),
):
    print(metric_label, metric_value)

# Per-class breakdown
print("\nClassification Report:")
print(classification_report(y_test, predictions))

Accuracy: 0.6076178960096735
Precision: 0.5985187637637999
Recall: 0.6076178960096735
F1 Score: 0.5899203404250069

Classification Report:
              precision    recall  f1-score   support

    afrobeat       0.60      0.26      0.37        68
 alternative       0.46      0.31      0.37        72
       blues       0.38      0.21      0.27       120
   classical       0.77      0.66      0.71        50
     country       0.56      0.24      0.34        58
       disco       0.55      0.60      0.58       139
        folk       0.45      0.52      0.48       153
        funk       0.74      0.69      0.71       126
 heavy-metal       0.62      0.82      0.71        84
     hip-hop       0.56      0.83      0.67       115
        jazz       0.54      0.28      0.37        54
  psych-rock       0.53      0.65      0.58       172
   reggaeton       0.59      0.51      0.55        74
       samba       0.80      0.90      0.85       195
      techno       0.72      0.79      0.75       