In [None]:
import pandas as pd
import numpy as np
from transformers import DistilBertTokenizer, DistilBertModel
import torch
import nltk
from nltk.corpus import stopwords
import string

from sklearn.metrics import mean_squared_error

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# 1. Load the Year Data

In [None]:
# Load data
df_year = pd.read_csv('/content/drive/MyDrive/Advanced_Data_Analytics_for_Business/FinalProject/year_sampled_english_songs.csv')

**year_sampled_english_songs.csv:**

- year_df=df[df["year"]>=1960]
- english_df = year_df[year_df['language'] == 'en']
- english_df=english_df[english_df['tag'] != 'misc']
- views >= 1000
- word counts >= 100
- max_rows_per_year = 500
- drop year 2023 because just 23 obs

# 2. Preprocessing

In [None]:
# Sample 300 rows per year
df_year = df_year.groupby('year').apply(lambda x: x.sample(n=300, random_state=42)).reset_index(drop=True)

In [None]:
df_year.head()

Unnamed: 0,title,tag,artist,year,views,features,lyrics,id,language_cld3,language_ft,language
0,John Riley,rock,Joan Baez,1960,1663,{},[Verse]\nFair young maid all in a garden\nStra...,219488,en,en,en
1,Burning Bridges,pop,Jack Scott,1960,1103,{},Found some letters you wrote me this morning\n...,979387,en,en,en
2,Think,rock,James Brown & The Famous Flames,1960,4020,{},[Chorus]\nThink--about the good things\nThink-...,406730,en,en,en
3,Rake And Ramblin’ Boy,rock,Joan Baez,1960,1277,{},"[Chorus]\nWell, I'm a rake and ramblin' boy\nT...",219491,en,en,en
4,Ride on Josephine,rock,Bo Diddley,1960,1318,{},"[Chorus]\nRide on Josephine, ride on\nRide on ...",455392,en,en,en


In [None]:
df_year.value_counts('year')

year
1960    300
2007    300
1994    300
1995    300
1996    300
       ... 
1986    300
1987    300
1988    300
1989    300
2022    300
Length: 63, dtype: int64

In [None]:
# Function for cleaning text data
def clean_text(text):
    # Remove English stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word.lower() not in stop_words])

    # Remove specified special characters
    special_chars = ['\n', '[', ']']
    for char in special_chars:
        text = text.replace(char, '')

    # Remove other punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    return text

# 3. DistilBERT Embeddings

In [None]:
def get_bert_embeddings(text, max_len=300):
    # Tokenize the text
    tokens = tokenizer.tokenize(tokenizer.decode(tokenizer.encode(text, add_special_tokens=True)))

    # Truncate or split the tokens to fit the specified maximum sequence length
    if len(tokens) > max_len:
        # Truncate the tokens
        tokens = tokens[:max_len]
    else:
        # No need to truncate, but if still longer than max_len, split into chunks
        tokens = [tokens[i:i + max_len] for i in range(0, len(tokens), max_len)]

    # Convert tokens to input IDs
    input_ids = [tokenizer.convert_tokens_to_ids(chunk) for chunk in tokens]

    # Flatten the list if it's nested
    input_ids = [item for sublist in input_ids for item in sublist] if isinstance(input_ids[0], list) else input_ids

    # Create attention mask
    attention_mask = [1] * len(input_ids)

    # Convert to PyTorch tensors
    input_ids = torch.tensor([input_ids])
    attention_mask = torch.tensor([attention_mask])

    # Get BERT embeddings
    with torch.no_grad():
        outputs = model(input_ids, attention_mask=attention_mask)

    embeddings = outputs.last_hidden_state.mean(dim=1).squeeze().detach().numpy()
    return embeddings


In [None]:
# Load pre-trained BERT tokenizer and model
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')
model = DistilBertModel.from_pretrained('distilbert-base-uncased')

# Clean and preprocess the lyrics
df_year['cleaned_lyrics'] = df_year['lyrics'].apply(clean_text)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

In [None]:
# Tokenize and get BERT embeddings
df_year['bert_embeddings'] = df_year['cleaned_lyrics'].apply(lambda x: get_bert_embeddings(x, max_len=300))

Token indices sequence length is longer than the specified maximum sequence length for this model (527 > 512). Running this sequence through the model will result in indexing errors


In [None]:
import pickle

# Save BERT embeddings to a file
with open('/content/drive/MyDrive/Advanced_Data_Analytics_for_Business/FinalProject/Models/bert_embeddings_300eachyear.pkl', 'wb') as file:
    pickle.dump(df_year['bert_embeddings'].tolist(), file)

In [None]:
#import pickle

# Load BERT embeddings from the saved file
#with open('/content/drive/MyDrive/Advanced_Data_Analytics_for_Business/FinalProject/Models/bert_embeddings_300eachyear.pkl', 'rb') as file:
#    bert_embeddings_loaded = pickle.load(file)

In [None]:
# Add loaded embeddings to DataFrame
df_year['bert_embeddings'] = bert_embeddings_loaded

# Display the resulting DataFrame
print(df_year.head())

                   title   tag                           artist  year  views  \
0             John Riley  rock                        Joan Baez  1960   1663   
1        Burning Bridges   pop                       Jack Scott  1960   1103   
2                  Think  rock  James Brown & The Famous Flames  1960   4020   
3  Rake And Ramblin’ Boy  rock                        Joan Baez  1960   1277   
4      Ride on Josephine  rock                       Bo Diddley  1960   1318   

  features                                             lyrics      id  \
0       {}  [Verse]\nFair young maid all in a garden\nStra...  219488   
1       {}  Found some letters you wrote me this morning\n...  979387   
2       {}  [Chorus]\nThink--about the good things\nThink-...  406730   
3       {}  [Chorus]\nWell, I'm a rake and ramblin' boy\nT...  219491   
4       {}  [Chorus]\nRide on Josephine, ride on\nRide on ...  455392   

  language_cld3 language_ft language  \
0            en          en       en   


# 4. Train Test Split

In [None]:
from sklearn.model_selection import train_test_split

# Split the dataset into features (X) and labels (y)
X = df_year['bert_embeddings'].tolist()
y = df_year['year']

# Perform a stratified 80/20 split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Display the size of the training and testing sets
print(f"Training set size: {len(X_train)} samples")
print(f"Testing set size: {len(X_test)} samples")

Training set size: 15120 samples
Testing set size: 3780 samples


# 5. Model Training

## 5.1 Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)

# Build and train the Random Forest model
rf_model = RandomForestRegressor(n_estimators=5, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on the test set
y_pred_rf = rf_model.predict(X_test)

# Calculate RMSE
rmse_rf = np.sqrt(mean_squared_error(y_test, y_pred_rf))
print(f"Root Mean Squared Error (RMSE): {rmse_rf}")

Root Mean Squared Error (RMSE): 16.450771656081695


## 5.2 Support Vector Machines

In [None]:
from sklearn.svm import SVR

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)

# Build and train the Support Vector Machine model (SVR)
svm_model = SVR(kernel='linear')

# Train the SVM model
svm_model.fit(X_train, y_train)

# Predict on the test set
y_pred_svm = svm_model.predict(X_test)

# Calculate RMSE
rmse_svm = np.sqrt(mean_squared_error(y_test, y_pred_svm))
print(f"Root Mean Squared Error (RMSE): {rmse_svm}")

Root Mean Squared Error (RMSE): 14.177332777919734


In [None]:
from joblib import dump

# Save the trained SVM model
model_path = "/content/drive/MyDrive/Advanced_Data_Analytics_for_Business/FinalProject/Models/svm_model_final.pth"
dump(svm_model, model_path)

['/content/drive/MyDrive/Advanced_Data_Analytics_for_Business/FinalProject/Models/svm_model_final.pth']

## 5.3 Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)

# Build and train the Linear Regression model
linear_model = LinearRegression()

# Train the linear regression model
linear_model.fit(X_train, y_train)

# Predict on the test set
y_pred_ln = linear_model.predict(X_test)

# Calculate RMSE
rmse_ln = np.sqrt(mean_squared_error(y_test, y_pred_ln))
print(f"Root Mean Squared Error (RMSE): {rmse_ln}")

Root Mean Squared Error (RMSE): 14.282376165200215


## 5.4 XGBoost

In [None]:
import xgboost as xgb

# Build and train the XGBoost model
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=50, random_state=42)
xgb_model.fit(X_train, y_train)

# Predict on the test set
y_pred_xgb = xgb_model.predict(X_test)

# Calculate RMSE
rmse_xgb = np.sqrt(mean_squared_error(y_test, y_pred_xgb))
print(f"XGBoost Root Mean Squared Error (RMSE): {rmse_xgb}")

XGBoost Root Mean Squared Error (RMSE): 15.261744059890663


## 5.5 K-Nearest Neighbors

In [None]:
from sklearn.neighbors import KNeighborsRegressor

# Build and train the KNN model
knn_model = KNeighborsRegressor(n_neighbors=5)
knn_model.fit(X_train, y_train)

# Predict on the test set
y_pred_knn = knn_model.predict(X_test)

# Calculate RMSE
rmse_knn = np.sqrt(mean_squared_error(y_test, y_pred_knn))
print(f"K-Nearest Neighbors Root Mean Squared Error (RMSE): {rmse_knn}")

K-Nearest Neighbors Root Mean Squared Error (RMSE): 16.825912870193516


## 5.6 Neural Network

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Standardize the input features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Build and train a neural network with L2 regularization and early stopping
model = Sequential()
model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001), input_shape=(X_train_scaled.shape[1],)))
model.add(Dropout(0.1))
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dense(1, activation='linear', kernel_regularizer=l2(0.001)))
custom_optimizer = Adam(learning_rate=0.001)  # Adjust the learning rate as needed
model.compile(optimizer=custom_optimizer, loss='mean_squared_error')

# Define early stopping criteria
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train the model with early stopping
model.fit(X_train_scaled, y_train, epochs=30, batch_size=16, validation_split=0.1, verbose=2, callbacks=[early_stopping])

# Predict on the test set
y_pred_nn = model.predict(X_test_scaled).flatten()

# Calculate RMSE
rmse_nn = np.sqrt(mean_squared_error(y_test, y_pred_nn))
print(f"Neural Network Root Mean Squared Error (RMSE): {rmse_nn}")


Epoch 1/30
851/851 - 7s - loss: 341492.5625 - val_loss: 9506.8125 - 7s/epoch - 8ms/step
Epoch 2/30
851/851 - 5s - loss: 14902.2197 - val_loss: 6986.2856 - 5s/epoch - 5ms/step
Epoch 3/30
851/851 - 3s - loss: 13127.7793 - val_loss: 5465.7690 - 3s/epoch - 3ms/step
Epoch 4/30
851/851 - 3s - loss: 12139.8193 - val_loss: 5359.8999 - 3s/epoch - 3ms/step
Epoch 5/30
851/851 - 2s - loss: 11416.8428 - val_loss: 5404.3076 - 2s/epoch - 3ms/step
Epoch 6/30
851/851 - 3s - loss: 11151.1699 - val_loss: 5359.3652 - 3s/epoch - 4ms/step
Epoch 7/30
851/851 - 3s - loss: 10827.7422 - val_loss: 6427.1421 - 3s/epoch - 4ms/step
Epoch 8/30
851/851 - 3s - loss: 10822.3799 - val_loss: 5804.3491 - 3s/epoch - 3ms/step
Epoch 9/30
851/851 - 2s - loss: 10578.2812 - val_loss: 5094.9160 - 2s/epoch - 3ms/step
Epoch 10/30
851/851 - 2s - loss: 10090.8408 - val_loss: 4576.9111 - 2s/epoch - 3ms/step
Epoch 11/30
851/851 - 3s - loss: 10196.3916 - val_loss: 3835.6887 - 3s/epoch - 4ms/step
Epoch 12/30
851/851 - 3s - loss: 9932.85

# 6. Hyperparameter Tuning for SVM

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR

X_train = np.vstack(X_train)
X_test = np.vstack(X_test)

# Define the parameter grid for SVM
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10],
    'gamma': ['scale', 'auto'],
    'epsilon': [0.1, 0.2, 0.5]
}

# Build the Support Vector Machine model (SVR)
svm_model = SVR()

# Perform Grid Search
grid_search = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring='neg_mean_squared_error', cv=3, n_jobs=-1, verbose=2)
grid_search.fit(X_train, y_train)

# Get the best parameters from the grid search
best_params = grid_search.best_params_
print("Best Parameters:", best_params)

# Use the best model to predict on the test set
best_svm_model = grid_search.best_estimator_
y_pred = best_svm_model.predict(X_test)

# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print(f"Root Mean Squared Error (RMSE): {rmse}")

# 7. Evaluation and Prediction based on best Model SVM

In [None]:
from sklearn.metrics import r2_score

# Evaluate the model on the test set
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Root Mean Squared Error (RMSE): {rmse}")
print(f"R-squared (R2): {r2}")

Mean Squared Error (MSE): 200.9967646958773
Root Mean Squared Error (RMSE): 14.177332777919734
R-squared (R2): 0.39214688096004846


In [None]:
prediction_difference = y_pred_svm - y_test

# Create a DataFrame to display the true values, predicted values, and differences
prediction_comparison = pd.DataFrame({
    'True Values': y_test,
    'Predicted Values': y_pred_svm,
    'Difference': prediction_difference
})

# Display the DataFrame
print(prediction_comparison)

# Calculate the mean of the differences
mean_difference = prediction_difference.mean()

# Display the mean difference
print(f"Mean Difference between Predicted and True Values: {mean_difference}")


       True Values  Predicted Values  Difference
5915          1979       1984.136956    5.136956
731           1962       1981.478921   19.478921
10631         1995       1993.002619   -1.997381
15421         2011       2001.163535   -9.836465
3581          1971       1965.563814   -5.436186
...            ...               ...         ...
13457         2004       2005.032943    1.032943
18518         2021       2008.001472  -12.998528
12605         2002       1995.393442   -6.606558
16377         2014       2014.652561    0.652561
15321         2011       2007.658370   -3.341630

[3780 rows x 3 columns]
Mean Difference between Predicted and True Values: -0.8270843249480565


In [None]:
# Round the predicted values to the nearest integer
rounded_predictions = np.round(y_pred_svm)

# Create a DataFrame to compare the rounded predictions with the true values
rounded_prediction_comparison = pd.DataFrame({
    'True Values': y_test,
    'Rounded Predicted Values': rounded_predictions,
    'Difference': rounded_predictions - y_test
})

# Display the DataFrame
print(rounded_prediction_comparison)

       True Values  Rounded Predicted Values  Difference
5915          1979                    1984.0         5.0
731           1962                    1981.0        19.0
10631         1995                    1993.0        -2.0
15421         2011                    2001.0       -10.0
3581          1971                    1966.0        -5.0
...            ...                       ...         ...
13457         2004                    2005.0         1.0
18518         2021                    2008.0       -13.0
12605         2002                    1995.0        -7.0
16377         2014                    2015.0         1.0
15321         2011                    2008.0        -3.0

[3780 rows x 3 columns]
