In [2]:
import json
import jieba
import numpy as np
import pandas as pd
from collections import Counter
from sklearn.model_selection import train_test_split

In [3]:
rst = []
with open("./../data/music.json") as f:
    for i in f.readlines():
        i = json.loads(i)
        rst.append(i)
rst = pd.DataFrame(rst)[["singer", "geci", "album"]]


def preprocess_geci(x):
    lines = []
    for line in x:
        line = line.replace(",", "") \
                .replace(".", "") \
                .replace("，", "") \
                .replace("。", "") \
                .replace("《", "") \
                .replace("》", "") 

        lines.append(line)
    return "，".join(lines)

def is_character(x):
    res = True
    for w in x:
        if not '\u4e00' <= w <= '\u9fff':
            res = False
    return res

        
rst["geci"] = rst["geci"].apply(preprocess_geci)
rst["album"] = rst["album"].apply(lambda x:x.replace("《", "").replace("》", ""))
rst["mark"] = rst["singer"].apply(is_character) | rst["album"].apply(is_character)
rst = rst.query("mark == True").reset_index(drop=True)

corpus = "".join(rst["geci"])
keys = list(Counter(corpus).keys()) + [' ']

num_to_char = {k:v for k,v in enumerate(keys)}
char_to_num = {v:k for k,v in enumerate(keys)}

In [187]:
from sklearn.utils import shuffle
import torch
from torch import utils
from torch import nn
from torch.nn.functional import one_hot

class musicDataSet(utils.data.Dataset):
    
    def __init__(self, dataframe, seq_len=3, time_len=100, char_to_num=char_to_num):
        self.dataframe = dataframe.reset_index(drop=True)
        self.seq_len = seq_len
        self.time_len = time_len
        self.char_to_num = char_to_num
        
    def __len__(self):
        return self.dataframe.shape[0]
    
    @staticmethod
    def __convert_onehot__(list_corpus):
        array_corpus = np.array(list_corpus)
        rst_array = np.zeros((array_corpus.size, max(char_to_num.values()) + 1))
        rst_array[np.arange(array_corpus.size), array_corpus] = 1
        return rst_array.flatten()
        
    
    def __getitem__(self, idx):
        corpus_item = self.dataframe.loc[idx, 'geci']
        max_len = len(corpus_item)
        X, y = [], []
        for i in range(self.time_len):
            if i >= max_len - self.seq_len:
                tmp_list = []
                for num in range(self.seq_len):
                    tmp_list.append(char_to_num[" "])
                    
                # X.append(self.__convert_onehot__(tmp_list))
                y.append(self.__convert_onehot__(char_to_num[" "]))
                X.append(tmp_list)
                # y.append(char_to_num[" "])

            else:
                tmp_list = []
                for num in range(self.seq_len):
                    tmp_list.append(char_to_num[corpus_item[i+num]])
                    
                # X.append(self.__convert_onehot__(tmp_list))
                y.append(self.__convert_onehot__(char_to_num[corpus_item[i+self.seq_len]]))
                X.append(tmp_list)
                # y.append(char_to_num[corpus_item[i+self.seq_len]])

        X, y = torch.from_numpy(np.array(X)), torch.from_numpy(np.array(y))
        
        # return X, y
        return X, y
    

train_data, test_data = train_test_split(rst, train_size=0.9, shuffle=True, random_state=2022)
train_dataset = musicDataSet(train_data, seq_len=3, time_len=100)
test_dataset = musicDataSet(test_data, seq_len=3, time_len=100)

train_dataloader = utils.data.DataLoader(dataset=train_dataset, batch_size=32, shuffle=True)
test_dataloader = utils.data.DataLoader(dataset=test_dataset, batch_size=32, shuffle=False)


In [188]:
class lyricModel(nn.Module):
    
    def __init__(self, n_vocab, embedding_dim, seq_len, hidden_size=128, num_layers=2):
        super(lyricModel, self).__init__()
        
        self.embed = nn.Embedding(
                num_embeddings=n_vocab, 
                embedding_dim=embedding_dim
            )
        
        self.backbone = nn.GRU(
                input_size=embedding_dim*seq_len, 
                hidden_size=hidden_size,
                num_layers=num_layers,
                dropout=0.3,
                batch_first=True,
            )
        
        self.tail = nn.Linear(hidden_size, n_vocab)
        
    def forward(self, x):
        dim1, dim2, _ = x.shape
        x = self.embed(x).reshape(dim1, dim2, -1)
        x, h = self.backbone(x)
        x = self.tail(x)
        return x
    
    
n_vocab = len(char_to_num.keys()) + 1
embedding_dim = 128
lyric_model = lyricModel(n_vocab=n_vocab, embedding_dim=embedding_dim, seq_len=3)

In [189]:
for X, y in train_dataloader:
    X, y = X.permute(1, 0, 2), y.permute(1, 0, 2)
    pred = lyric_model(X)
    print(pred.shape, y.shape)
    break



torch.Size([100, 32, 9273]) torch.Size([100, 32, 9273])
