In [15]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor


In [3]:
# Read only the first 20,000 rows of the labelled-lyrics CSV into a DataFrame.
df = pd.read_csv(
    'labeled_lyrics_cleaned.csv',
    nrows=20000,
)

In [4]:
# Preview the first ten rows of the loaded frame.
df.head(n=10)

Unnamed: 0.1,Unnamed: 0,artist,seq,song,label
0,0,Elijah Blake,"No, no\r\nI ain't ever trapped out the bando\r...",Everyday,0.626
1,1,Elijah Blake,"The drinks go down and smoke goes up, I feel m...",Live Till We Die,0.63
2,2,Elijah Blake,She don't live on planet Earth no more\r\nShe ...,The Otherside,0.24
3,3,Elijah Blake,"Trippin' off that Grigio, mobbin', lights low\...",Pinot,0.536
4,4,Elijah Blake,"I see a midnight panther, so gallant and so br...",Shadows & Diamonds,0.371
5,5,Elijah Blake,I just want to ready your mind\r\n'Cause I'll ...,Uno,0.321
6,6,Elijah Harris,To believe\r\nOr not to believe\r\nThat is the...,Girlfriend (Main),0.601
7,7,Elis,Dieses ist lange her.\r\nDa ich deine schmalen...,Abendlied,0.333
8,8,Elis,A child is born\r\nOut of the womb of a mother...,Child,0.506
9,9,Elis,Out of the darkness you came \r\nYou looked so...,Come to Me,0.179


In [5]:
# Features are the lyrics text ('seq'); the regression target is valence ('label').
X, y = df['seq'], df['label']

In [6]:
# Hold out 20% of the rows for testing (80:20 train/test split); the fixed
# random_state keeps the split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
# Convert the lyrics text into a numerical TF-IDF representation, keeping at
# most 10,000 terms. The vectorizer is fitted on the training split only; the
# test split is transformed with that already-fitted vectorizer.
tfidf = TfidfVectorizer(max_features=10000)
X_train_tfidf = tfidf.fit_transform(raw_documents=X_train)
X_test_tfidf = tfidf.transform(raw_documents=X_test)

In [11]:
# Create a seeded random-forest regressor and train it on the TF-IDF features
# of the training split. The fit call is left as the cell's last expression so
# the fitted estimator is displayed.
rf = RandomForestRegressor(random_state=42)
rf.fit(X=X_train_tfidf, y=y_train)

RandomForestRegressor(random_state=42)

In [12]:
# Predict the target for the TF-IDF features of the held-out test split.
y_pred = rf.predict(X=X_test_tfidf)

In [14]:
# Score the test-set predictions with mean squared error and R^2, then report both.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
for metric_label, metric_value in (('Mean Squared Error:', mse), ('R2 Score:', r2)):
    print(metric_label, metric_value)

Mean Squared Error: 0.048265756679172316
R2 Score: 0.20883504044222767


In [16]:
import pickle

# Persist the trained random-forest regressor so it can be reused later.
with open('rf_model.pkl', 'wb') as f:
    pickle.dump(rf, f)

# The forest was trained on TF-IDF features, so new lyrics can only be scored
# after being transformed by the same fitted vectorizer. Save it next to the
# model; without it, rf_model.pkl cannot be used for inference outside this
# kernel session.
# NOTE: pickle files can execute arbitrary code when loaded — only unpickle
# these artifacts from a trusted source.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)