In [79]:
# Setup:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
import pandas as pd
from sklearn.preprocessing import LabelBinarizer
from torch.utils.data import Dataset
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
from torchvision import datasets, transforms
from torch.utils.data import DataLoader, TensorDataset

In [80]:
test_set = pd.read_csv('/Users/sanzhar123/Downloads/test.csv')
training_set = pd.read_csv('/Users/sanzhar123/Downloads/train.csv')

In [81]:
means = None
stdevs = None

# Define preprocessing transform
def conversion_transform(df, is_test = False):
    # drop text columns
    new_df = df.drop(columns=['track_href', 'uri', 'type', 'analysis_url'])

    if is_test == True:
        new_df = new_df.drop(columns=['ID'])

    # Handle dates 
    date_column = new_df["track_album_release_date"].astype(str)

    parsed = pd.to_datetime(date_column, format='%Y-%m-%d', errors='coerce')
    new_df["year"] = parsed.dt.year
    new_df["month"] = parsed.dt.month
    new_df["day"] = parsed.dt.day

    # For year-only dates, extract year and default month/day to 1
    year_only_mask = new_df["year"].isna()
    new_df.loc[year_only_mask, "year"] = pd.to_numeric(date_column[year_only_mask], errors='coerce')
    new_df.loc[year_only_mask, "month"] = 1
    new_df.loc[year_only_mask, "day"] = 1

    new_df = new_df.drop(columns='track_album_release_date')

    if is_test == False:
        # Binarization of the output
        column_names = list(new_df.columns.values)
        column_names.append(column_names.pop(column_names.index('Popularity_Type')))
        new_df = new_df[column_names]

        lb = LabelBinarizer()
        new_df['Popularity_Type'] = lb.fit_transform(new_df['Popularity_Type'])

    # replace all the nan values with zero
    new_df = new_df.fillna(0)

    return new_df

def normalization_transform(df, is_test = False):
    # normalize data
    new_df = df.copy()
    if is_test == False:
        new_df.iloc[:, :-1] -= means
        new_df.iloc[:, :-1] /= stdevs
    else: 
        new_df = new_df[feature_columns]
        new_df -= means
        new_df /= stdevs
    return new_df

def preprocessing(df, is_test= False):
    new_df = conversion_transform(df, is_test)
    new_df = normalization_transform(new_df, is_test)
    return new_df

# compute normalization mean and stdev
filtered_training = conversion_transform(training_set)
means = np.mean(filtered_training.iloc[:, :-1], axis=0)
stdevs = np.std(filtered_training.iloc[:, :-1], axis=0)
feature_columns = list(filtered_training.columns[:-1])

filtered_training = preprocessing(training_set)

# Save IDs
test_IDs = test_set['ID'].values

In [82]:
print((stdevs == 0).any())

False


In [83]:
# Music Dataloader
class MusicDataset(Dataset):
    """Music dataset."""

    def __init__(self, dataset, is_test, transform=None):
        """
        Arguments:
            dataset: Pandas dataframe
            transform: Transformation to data
        """

        data = dataset
        if transform:
            data = transform(dataset, is_test)

        # If it's the test set, there are no labels 
        if is_test:
            self.X = torch.tensor( data.values, dtype=torch.float32)
            self.y = None

        else:
        # example: last column is label, rest are features
            self.X = torch.tensor(
                data.iloc[:, :-1].values,
                dtype=torch.float32
            )
            self.y = torch.tensor(
                data.iloc[:, -1].values,
                dtype=torch.long
            )

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

In [85]:
num_features = filtered_training.shape[1]-1
model1 = nn.Sequential(
    nn.Linear(num_features, 256),
    nn.GELU(),
    nn.Dropout(0.25),

    nn.Linear(256, 128),
    nn.GELU(),
    nn.Dropout(0.25),

    nn.Linear(128, 32),
    nn.GELU(),
    nn.Dropout(0.25),

    nn.Linear(32,1),
    nn.Sigmoid()
)

In [86]:
optimizer = torch.optim.Adam(model1.parameters(), lr=5*1e-4)
loss_fn = nn.BCELoss()

In [87]:
# load data
training_data_loader = DataLoader(MusicDataset(training_set, is_test=False, transform=preprocessing), batch_size=32, shuffle=True)

In [88]:
def evaluate(model, dataset):
    # Switch to the evaluation mode: 
    model.eval()

    # Compute the predictions and compare to the true labels
    pred = torch.round(model(dataset.X))
    correct = (pred.flatten() == dataset.y).float()
    model.train() 
    
    return correct.mean().item() 

In [89]:
model1.train()
dataset = MusicDataset(training_set, is_test= False, transform=preprocessing)
for epoch in range(70):
    for batch_idx, (data, target) in enumerate(training_data_loader):
        # Erase accumulated gradients
        optimizer.zero_grad()

        # Forward pass
        output = model1(data)

        # Calculate loss
        loss = loss_fn(output.squeeze(), target.float())

        # Backward pass
        loss.backward()
        
        # Weight update
        optimizer.step()

    # Track loss each epoch
    print('Train Epoch: %d  Loss: %.4f' % (epoch + 1,  loss.item()))
    print('Accuracy on the training set is', evaluate(model1, dataset))

Train Epoch: 1  Loss: 0.4550
Accuracy on the training set is 0.6896997690200806
Train Epoch: 2  Loss: 0.4879
Accuracy on the training set is 0.7160972952842712
Train Epoch: 3  Loss: 0.5079
Accuracy on the training set is 0.7360248565673828
Train Epoch: 4  Loss: 0.5181
Accuracy on the training set is 0.7437888383865356
Train Epoch: 5  Loss: 0.4369
Accuracy on the training set is 0.7479296326637268
Train Epoch: 6  Loss: 0.4899
Accuracy on the training set is 0.7440476417541504
Train Epoch: 7  Loss: 0.6677
Accuracy on the training set is 0.7406832575798035
Train Epoch: 8  Loss: 0.4133
Accuracy on the training set is 0.751035213470459
Train Epoch: 9  Loss: 0.4074
Accuracy on the training set is 0.7531055808067322
Train Epoch: 10  Loss: 0.6059
Accuracy on the training set is 0.7549172043800354
Train Epoch: 11  Loss: 0.5098
Accuracy on the training set is 0.7567287683486938
Train Epoch: 12  Loss: 0.5273
Accuracy on the training set is 0.7580227851867676
Train Epoch: 13  Loss: 0.6146
Accuracy

In [90]:
# Now let's produce the output on the test_set 
torch_test = MusicDataset(test_set, is_test=True, transform=preprocessing)

model1.eval()
with torch.no_grad():
    predictions = 1 - model1(torch_test.X).squeeze().numpy() # apparently the predictions are flipped lol

submission = pd.DataFrame({
    'ID': test_IDs,
    'Popularity_Type': predictions
})
submission.to_csv('submission.csv', index=False)