### Setup

In [1]:
! pip install contractions -q

### Load the packages

In [2]:
import re

import contractions
import pandas as pd
from tqdm.notebook import tqdm
import warnings

pd.set_option('display.max_columns', None)
tqdm.pandas()
warnings.filterwarnings('ignore')

In [3]:
data = pd.read_csv('../data/Train.csv')\
        .drop(columns=['Unnamed: 0'])
print(f'Shape of data: {data.shape}')
data.head()

Shape of data: (290183, 5)


Unnamed: 0,text,genre,label,label_model,text_cleaned
0,"It starts with pain, followed by hate\nFueled ...",rock,9,LABEL_9,"It starts with pain, followed by hate\nFueled ..."
1,Freedom!\nAlone again again alone\nPatiently w...,rock,9,LABEL_9,Freedom!\nAlone again again alone\nPatiently w...
2,"Biting the hand that feeds you, lying to the v...",rock,9,LABEL_9,"Biting the hand that feeds you, lying to the v..."
3,You say you know just who I am\nBut you can't ...,rock,9,LABEL_9,You say you know just who I am\nBut you can't ...
4,My heart is beating faster can't control these...,rock,9,LABEL_9,My heart is beating faster can't control these...


### EDA

In [63]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 290183 entries, 0 to 290182
Data columns (total 5 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   text          290148 non-null  object
 1   genre         290183 non-null  object
 2   label         290183 non-null  int64 
 3   label_model   290183 non-null  object
 4   text_cleaned  290147 non-null  object
dtypes: int64(1), object(4)
memory usage: 11.1+ MB


In [64]:
data.isnull().sum()

text            35
genre            0
label            0
label_model      0
text_cleaned    36
dtype: int64

In [65]:
data[['genre', 'label_model']].value_counts()

genre       label_model
rock        LABEL_9        121404
pop         LABEL_7        108714
metal       LABEL_6         20291
jazz        LABEL_5         13545
folk        LABEL_2          8644
indie       LABEL_4          8449
r&b         LABEL_8          2793
hip-hop     LABEL_3          2240
electronic  LABEL_1          2213
country     LABEL_0          1890
Name: count, dtype: int64

In [66]:
print(f'Uncleaned text:\n{"-"*20}\n{data.text[0]}\n')
print(f'Cleaned text:\n{"-"*20}\n{data.text_cleaned[0]}\n{"-"*50}\n')
print(f'Are they similar? - {data.text[0] == data.text_cleaned[0]}')

Uncleaned text:
--------------------
It starts with pain, followed by hate
Fueled by the endless questions no one can answer
A stain covers your heart and tears you apart
Just like a sleeping cancer
I don't believe men are born to be killers
I don't believe the world can be saved
How did you get here and when did it start?
An innocent child with a thorn in his heart
What kind of world do we live in?
Where love is divided by hate
Loosing control of our feelings
We all must be dreaming this life away
In a world so cold
Are you sane, where's the shame?
A moment of time passes by you cannot rewind
Who's to blame and where did it start?
Is there a cure for your sickness
Have you no heart?
I don't believe men are born to be killers
I don't believe the world can't be saved
How did you get here and when did it start?
An innocent child with a thorn in his heart
What kind of world do we live in?
Where love is divided by hate
Selling our soul for no reason
We all must be dreaming this life away
I

In [81]:
data['is_similar'] = data['text'] == data['text_cleaned']
data.is_similar.value_counts(normalize=True)

is_similar
True     0.991664
False    0.008336
Name: proportion, dtype: float64

In [82]:
def text_clean(text):
    text = contractions.fix(text)
    text = re.sub(r'[^\S\n]+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9\n]', ' ', text)
    text = text.lower().strip().strip('\n')
    return text

In [83]:
data_clean = data[data.text.notnull() & data.text_cleaned.notnull()]\
                    .reset_index(drop=True)

data_clean['text_cleaned'] = data_clean['text_cleaned'].progress_apply(lambda text: text_clean(text))
data_clean['num_chars'] = data_clean['text_cleaned'].progress_apply(lambda text: len(text))

data_clean.head()

  0%|          | 0/290147 [00:00<?, ?it/s]

  0%|          | 0/290147 [00:00<?, ?it/s]

Unnamed: 0,text,genre,label,label_model,text_cleaned,is_similar,num_chars
0,"It starts with pain, followed by hate\nFueled ...",rock,9,LABEL_9,it starts with pain followed by hate\nfueled ...,True,1636
1,Freedom!\nAlone again again alone\nPatiently w...,rock,9,LABEL_9,freedom \nalone again again alone\npatiently w...,True,1199
2,"Biting the hand that feeds you, lying to the v...",rock,9,LABEL_9,biting the hand that feeds you lying to the v...,True,768
3,You say you know just who I am\nBut you can't ...,rock,9,LABEL_9,you say you know just who i am\nbut you cannot...,True,780
4,My heart is beating faster can't control these...,rock,9,LABEL_9,my heart is beating faster cannot control thes...,True,1717


In [75]:
data_clean.num_chars.describe()

count    290147.000000
mean       1208.166016
std         777.200981
min           0.000000
25%         722.000000
50%        1036.000000
75%        1468.000000
max       63137.000000
Name: num_chars, dtype: float64

In [55]:
data_clean[data_clean.num_chars == 63137]#.text_cleaned[3414])

Unnamed: 0,text,genre,label,label_model,text_cleaned,is_similar,num_chars
3414,E|--------------------------------------------...,metal,6,LABEL_6,e ...,True,63137


In [77]:
data_clean[data_clean.genre=='metal'].text_cleaned[100]

'a hundred days have made me older \nsince the last time that i saw your pretty face\na thousand lies have made me colder\nand i do not think i can look at this the same\nbut all the miles that separate\nthey disappear now when i am dreaming of your face\ni am here without you baby\nbut you are still on my lonely mind\ni think about you baby\nand i dream about you all the time\ni am here without you baby\nbut you are still with me in my dreams\nand tonight  it is only you and me\nthe miles just keep rolling\nas the people leave their way to say hello\ni have heard this life is overrated\nbut i hope that it\ngets better as we go\ni am here without you baby\nbut you are still on my lonely mind\ni think about you baby\nand i dream about you all the time\ni am here without you baby\nbut you are still with me in my dreams\nand tonight  girl  it is only you and me\neverything i know  and anywhere i go\nit gets hard but it will not take away my love\nand when the last one falls  when it is al