In [30]:
#!pip install textblob
#!pip install langdetect
#import nltk
#nltk.download('wordnet')

In [8]:
#imports
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
import re
import random

from scipy import sparse
from scipy.sparse import csr_matrix, vstack
from textblob import TextBlob
from langdetect import detect_langs
import pickle
from datetime import datetime

from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from gensim.models import Word2Vec
import multiprocessing
import gensim
from gensim.utils import simple_preprocess

In [9]:
songdata = pd.read_csv('./songdata.csv')

In [10]:
df = pd.read_pickle("./genres.pkl")
df = pd.merge(songdata, df, on=['artist','artist'])
df = df[df.iloc[:,5:].any(axis = 1)]

In [11]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face \nAnd...","[europop, swedish pop]",0,0,1,0,0,0,0
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please \nTouch me gentl...","[europop, swedish pop]",0,0,1,0,0,0,0
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go \nWhy I had t...,"[europop, swedish pop]",0,0,1,0,0,0,0
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0


In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51489 entries, 0 to 57595
Data columns (total 12 columns):
artist               51489 non-null object
song                 51489 non-null object
link                 51489 non-null object
text                 51489 non-null object
genres               51489 non-null object
rock                 51489 non-null int32
singer-songwriter    51489 non-null int32
pop                  51489 non-null int32
metal                51489 non-null int32
folk                 51489 non-null int32
country              51489 non-null int32
hip hop / rap        51489 non-null int32
dtypes: int32(7), object(5)
memory usage: 3.7+ MB


We have a multilabel classification problem according to https://scikit-learn.org/stable/modules/multiclass.html

Multilabel- 

"A multiclass multioutput target where each output is binary. This may be represented as a 2d (dense) array or sparse matrix of integers, such that each column is a separate binary target, where positive labels are indicated with 1 and negative labels are usually -1 or 0. Sparse multilabel targets are not supported everywhere that dense multilabel targets are supported."

"Valid representation of multilabel y is either dense (or sparse) binary matrix of shape (n_samples, n_classes). Each column represents a class. The 1’s in each row denote the positive classes a sample has been labelled with."

In [None]:
#here is y in array form if you want it.
#this would have happened in a train_test_split anyway
y = np.array(df.iloc[:,5:])

Some text is in round brackets while some is in square brackets. I wanted to examine what that text looked like.

In [13]:
#examining text in round brackets
round_brackets = sum(list(df['text'].map(lambda s: re.findall(r'\((.*?)\)',s))), [])
#Number of round brackets:
print((len(round_brackets)))


54962


In [14]:
#Viewing some text in round brackets
#These just look like normal lyrics
random.seed(0)
random.choices(round_brackets, k=15)

['I knew',
 'tell it',
 'gunshots',
 "feel like I'm back",
 'Judy',
 'we were in the park',
 "You don't own",
 'repeat to fade',
 "it's alright",
 'Originally recorded by Diamond Head',
 'or should I have my green eyes?',
 'Dark night',
 'vocals',
 'I got 1,000 hugs and kisses 4 U when U come back home, baby',
 "But that's alright"]

In [15]:
#examining text in square brackets
square_brackets = sum(list(df['text'].map(lambda s: re.findall(r'\[(.*?)\]',s))), [])
#how many instances of square brackets?
print((len(square_brackets)))



26001


In [16]:
#re.sub(pattern, repl, string, count=0, flags=0)
# remove round brackets but not text within
df['text'] = df['text'].map(lambda s: re.sub(r'\(|\)', '', s))

# remove square brackest and text within
df['text'] = df['text'].map(lambda s: re.sub(r'\[(.*?)\] ', '', s))

In [17]:
# remove line breaks
df['text'] = df['text'].map(lambda s: re.sub(r' \n|\n', '', s))

### Removing Non-English Songs

Note: This takes a while to run. 

In [18]:
#This find the probability that the word is english
def get_eng_prob(text):
    detections = detect_langs(text)
    for detection in detections:
        if detection.lang == 'en':
            return detection.prob
    return 0

#finding the probability that the text is english
df['en_prob'] = df['text'].map(get_eng_prob)

print('Number of songs in english: {}'.format(sum(df['en_prob'] >= 0.5)))
print('Number of songs that are not english: {}'.format(sum(df['en_prob'] < 0.5)))

Number of songs in english: 51119
Number of songs that are not english: 370


In [19]:
#only selecting songs that have a probability of 0.5 or higher of being in english
df = df.loc[df['en_prob'] >= 0.5]

In [20]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,en_prob
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,0.999997
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,0.999995
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,0.999997
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,0.999995
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,0.999997


In [21]:
#Repeating above code to make sure that the non-english songs were dropped
print('Number of songs in english: {}'.format(sum(df['en_prob'] >= 0.5)))
print('Number of songs that are not english: {}'.format(sum(df['en_prob'] < 0.5)))

Number of songs in english: 51119
Number of songs that are not english: 0


In [22]:
df = df.drop(['en_prob'], axis=1)

### Tokenization

Using nltk.tokenize to seperate the text into a list of words. All punctuation is removed. 

In [23]:
#w+ for whitespace
#tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

#creating a new column called tokens
#df['tokens'] = df['text'].map(tokenizer.tokenize)

We originally cleaned the data using the tokenization text above. After later running the word2vec, we didn't like that the words were not lowercased, and thought they could be cleaned up better. We are proceeding with the code below instead. According to documentation, this uses tokenize() internally.

https://radimrehurek.com/gensim/utils.html

In [24]:
stemmer = PorterStemmer()
def lemmatize_stemming(text):
    """
       Also Borrowed from our preprocessing module.
    """
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))

def preprocess(text):
    """
        Edited function from our Preprocessing pipeline
        Doesn't remove stopwords as it turns out some stopwords like 'NO' is actually very importan
    
    """
    result = []
    #stopwords = list(gensim.parsing.preprocessing.STOPWORDS)
    #stopwords.pop(stopwords.index('no'))
    for token in gensim.utils.simple_preprocess(text):
        #if token not in set(stopwords):
        result.append(lemmatize_stemming(token))
    return ' '.join(result)

### Can take a few minutes

In [29]:
df['text_preprocessed'] = df['text'].map(preprocess)

In [31]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,text_preprocessed
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,look at her face it wonder face and it mean so...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,take it easi with me pleas touch me gentli lik...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,ll never know whi have to go whi have to put u...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...


In [32]:
#Checking what the tokens column look like before removing stopwords
df['text_preprocessed'].head()

0    look at her face it wonder face and it mean so...
1    take it easi with me pleas touch me gentli lik...
2    ll never know whi have to go whi have to put u...
3    make somebodi happi be question of give and ta...
4    make somebodi happi be question of give and ta...
Name: text_preprocessed, dtype: object

In [33]:
#safety
df2 = df

In [34]:
#w+ for whitespace
tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

#creating a new column called tokens
df['tokens'] = df['text_preprocessed'].map(tokenizer.tokenize)

In [35]:
df

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,text_preprocessed,tokens
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,look at her face it wonder face and it mean so...,"[look, at, her, face, it, wonder, face, and, i..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,take it easi with me pleas touch me gentli lik...,"[take, it, easi, with, me, pleas, touch, me, g..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,ll never know whi have to go whi have to put u...,"[ll, never, know, whi, have, to, go, whi, have..."
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, be, question, of, give..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, be, question, of, give..."
5,ABBA,Burning My Bridges,/a/abba/burning+my+bridges_20003011.html,"Well, you hoot and you holler and you make me ...","[europop, swedish pop]",0,0,1,0,0,0,0,well you hoot and you holler and you make me m...,"[well, you, hoot, and, you, holler, and, you, ..."
6,ABBA,Cassandra,/a/abba/cassandra_20002811.html,Down in the street they're all singing and sho...,"[europop, swedish pop]",0,0,1,0,0,0,0,down in the street they re all sing and shout ...,"[down, in, the, street, they, re, all, sing, a..."
7,ABBA,Chiquitita,/a/abba/chiquitita_20002978.html,"Chiquitita, tell me what's wrong You're enchai...","[europop, swedish pop]",0,0,1,0,0,0,0,chiquitita tell me what wrong you re enchain b...,"[chiquitita, tell, me, what, wrong, you, re, e..."
8,ABBA,Crazy World,/a/abba/crazy+world_20003013.html,"I was out with the morning sun Couldn't sleep,...","[europop, swedish pop]",0,0,1,0,0,0,0,be out with the morn sun couldn sleep so think...,"[be, out, with, the, morn, sun, couldn, sleep,..."
9,ABBA,Crying Over You,/a/abba/crying+over+you_20177611.html,I'm waitin' for you baby I'm sitting all alone...,"[europop, swedish pop]",0,0,1,0,0,0,0,waitin for you babi sit all alon feel so cold ...,"[waitin, for, you, babi, sit, all, alon, feel,..."


### Removing stop words
Reference for more info: https://www.geeksforgeeks.org/removing-stop-words-nltk-python/

In [36]:
#setting stop words

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english')) 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\peter\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [37]:
#removing stop words
df['tokens'] = df['tokens'].apply(lambda x: [item for item in x if item not in stop_words])


In [38]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,text_preprocessed,tokens
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,look at her face it wonder face and it mean so...,"[look, face, wonder, face, mean, someth, speci..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,take it easi with me pleas touch me gentli lik...,"[take, easi, pleas, touch, gentli, like, summe..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,ll never know whi have to go whi have to put u...,"[never, know, whi, go, whi, put, lousi, rotten..."
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ..."


### Stemming the data - stemming done above in pre-processing

We give the option to either lemmatization or stemming. We ran both and compared our outputs.

In [39]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,text_preprocessed,tokens
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,look at her face it wonder face and it mean so...,"[look, face, wonder, face, mean, someth, speci..."
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,take it easi with me pleas touch me gentli lik...,"[take, easi, pleas, touch, gentli, like, summe..."
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,ll never know whi have to go whi have to put u...,"[never, know, whi, go, whi, put, lousi, rotten..."
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ..."
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ..."


In [40]:
#checking out the data
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 51119 entries, 0 to 57595
Data columns (total 14 columns):
artist               51119 non-null object
song                 51119 non-null object
link                 51119 non-null object
text                 51119 non-null object
genres               51119 non-null object
rock                 51119 non-null int32
singer-songwriter    51119 non-null int32
pop                  51119 non-null int32
metal                51119 non-null int32
folk                 51119 non-null int32
country              51119 non-null int32
hip hop / rap        51119 non-null int32
text_preprocessed    51119 non-null object
tokens               51119 non-null object
dtypes: int32(7), object(7)
memory usage: 4.5+ MB


In [44]:
#turning the stems back into untokenized structrure so that we can run count vector on it
df['stem_str'] = df['tokens'].map(lambda lst: ' '.join(lst))

In [45]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,text_preprocessed,tokens,stem_str
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,look at her face it wonder face and it mean so...,"[look, face, wonder, face, mean, someth, speci...",look face wonder face mean someth special look...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,take it easi with me pleas touch me gentli lik...,"[take, easi, pleas, touch, gentli, like, summe...",take easi pleas touch gentli like summer even ...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,ll never know whi have to go whi have to put u...,"[never, know, whi, go, whi, put, lousi, rotten...",never know whi go whi put lousi rotten show bo...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ...",make somebodi happi question give take learn s...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ...",make somebodi happi question give take learn s...


### Text Analysis

In [46]:
#Documentation: 
#https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfTransformer.html


# initialise count vectorizer
#Convert a collection of text documents to a matrix of token counts
#This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.
#cv = CountVectorizer()

# generate word counts
#stem_count_vector = cv.fit_transform(df['stem_str'])


In [47]:
#stem_count_vector

In [48]:
EMB_DIM=300
wv_model=Word2Vec(df["tokens"],
             size=EMB_DIM,
             window=5,
             min_count=5, 
             negative=15, 
             iter=10,
             workers=multiprocessing.cpu_count(), 
             #defines the algorithm to use as the model. sg = skip-gram
             #default is CBOW (Continuous bag of words)
             sg = 1   )

In [49]:
# load the model from disk
import pickle
  
# Save the trained model as a pickle string. 
pickle.dump(wv_model, open( "songs_wv_model", "wb" ))

In [50]:
word_vectors=wv_model.wv

### Exploring different types of word similarity

In [51]:
#Exploring similar words
#calling object similar_by_word
word_vectors.similar_by_word('masterplan')

[('deriv', 0.5724190473556519),
 ('inconceiv', 0.5611752271652222),
 ('distantli', 0.5597276091575623),
 ('overreact', 0.5587066411972046),
 ('civilis', 0.5554229021072388),
 ('handbook', 0.5457670092582703),
 ('exhum', 0.5451184511184692),
 ('outlet', 0.5434283018112183),
 ('questionin', 0.5426793098449707),
 ('werent', 0.5421222448348999)]

In [52]:
#vectors.most_similar(positive=['insertwordhere'],negative=['insertwordhere'])

In [53]:
#showing vector of individual words
wv_model["love"]


Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).



array([ 0.01534662, -0.40659776, -0.2139412 ,  0.08655163,  0.15152055,
        0.17011087,  0.17489465,  0.07906845,  0.015125  , -0.02462435,
        0.03014242, -0.07251216,  0.29266268,  0.00408616, -0.05040325,
       -0.24627192, -0.03491923,  0.26175445,  0.09603594, -0.11335214,
        0.1103901 ,  0.07086267, -0.11272094,  0.06263356,  0.17665052,
       -0.0502292 , -0.1511212 , -0.07538251, -0.15206172,  0.04717248,
       -0.03552925, -0.01493204,  0.16897474, -0.1031625 , -0.05465525,
       -0.10721119, -0.1749175 ,  0.16972743, -0.07515916,  0.11222158,
       -0.09508653, -0.1692242 ,  0.02952265, -0.01532608,  0.23945543,
       -0.20369461, -0.11239567, -0.03326408, -0.13304804, -0.2262324 ,
        0.03813289,  0.11879548,  0.00164522,  0.17911875, -0.00906679,
       -0.04189851, -0.22351901,  0.04472911, -0.15934671,  0.04184378,
        0.10553988, -0.05699034, -0.0557311 , -0.02735744,  0.01317129,
        0.09275808, -0.07423598,  0.09693585, -0.1702979 , -0.10

In [54]:
wv_model.similarity('love', 'hate')


Call to deprecated `similarity` (Method will be removed in 4.0.0, use self.wv.similarity() instead).



0.32912347

In [55]:
#save til finished then add to git
#df.to_csv('Processed_Songs_Data.csv')

In [57]:
df.columns

Index(['artist', 'song', 'link', 'text', 'genres', 'rock', 'singer-songwriter',
       'pop', 'metal', 'folk', 'country', 'hip hop / rap', 'text_preprocessed',
       'tokens', 'stem_str'],
      dtype='object')

In [56]:
df.head()

Unnamed: 0,artist,song,link,text,genres,rock,singer-songwriter,pop,metal,folk,country,hip hop / rap,text_preprocessed,tokens,stem_str
0,ABBA,Ahe's My Kind Of Girl,/a/abba/ahes+my+kind+of+girl_20598417.html,"Look at her face, it's a wonderful face And it...","[europop, swedish pop]",0,0,1,0,0,0,0,look at her face it wonder face and it mean so...,"[look, face, wonder, face, mean, someth, speci...",look face wonder face mean someth special look...
1,ABBA,"Andante, Andante",/a/abba/andante+andante_20002708.html,"Take it easy with me, please Touch me gently l...","[europop, swedish pop]",0,0,1,0,0,0,0,take it easi with me pleas touch me gentli lik...,"[take, easi, pleas, touch, gentli, like, summe...",take easi pleas touch gentli like summer even ...
2,ABBA,As Good As New,/a/abba/as+good+as+new_20003033.html,I'll never know why I had to go Why I had to p...,"[europop, swedish pop]",0,0,1,0,0,0,0,ll never know whi have to go whi have to put u...,"[never, know, whi, go, whi, put, lousi, rotten...",never know whi go whi put lousi rotten show bo...
3,ABBA,Bang,/a/abba/bang_20598415.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ...",make somebodi happi question give take learn s...
4,ABBA,Bang-A-Boomerang,/a/abba/bang+a+boomerang_20002668.html,Making somebody happy is a question of give an...,"[europop, swedish pop]",0,0,1,0,0,0,0,make somebodi happi be question of give and ta...,"[make, somebodi, happi, question, give, take, ...",make somebodi happi question give take learn s...


train test split to four arrays; package with https://docs.scipy.org/doc/numpy/reference/generated/numpy.savez.html