# Preprocessing
This notebook preprocesses both tweets and songs.
Adjectives and nouns are extracted from each text (tweet or song lyrics), in both raw and lemmatized form, and are saved along with the additional information present in the datasets.

In [4]:
import json
import re

import nltk
import numpy as np
import pandas as pd
import patoolib
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from tqdm import tqdm_notebook
from tqdm.auto import tqdm

## Tweets preprocessing

In [2]:
# Merge the four per-emotion rating files into a single CSV.
# Every non-empty line is expected to hold four tab-separated fields:
# id, text, sentiment, intensity.
file_names = [
    'anger-ratings-0to1.train.txt',
    'fear-ratings-0to1.train.txt',
    'joy-ratings-0to1.train.txt',
    'sadness-ratings-0to1.train.txt',
]
path = 'data/'
columns = ["id", "text", "sentiment", "intensity"]

records = []
for file_name in file_names:
    file_name = path + file_name
    with open(file_name, encoding="utf8") as chat:
        # Parse line by line instead of slicing off the last character of the
        # whole file: a file without a trailing newline no longer loses the last
        # digit of its final intensity, and a malformed line is reported where it
        # occurs instead of silently shifting every following row.
        for line_number, line in enumerate(chat, start=1):
            line = line.rstrip('\n')
            if not line:
                continue  # blank lines carry no record
            fields = re.split(r'\t+', line)
            if len(fields) != len(columns):
                raise ValueError(
                    f"{file_name}, line {line_number}: expected {len(columns)} "
                    f"tab-separated fields, found {len(fields)}"
                )
            records.append(fields)

# All fields stay strings, so the CSV contains the values exactly as read.
df = pd.DataFrame(records, columns=columns)
df.to_csv('data/training.csv', index=False)

In [5]:
# Tokenise every tweet, drop English stop words and POS-tag the remainder with
# spaCy. Produces one list of (text, lemma, pos) triples per tweet in `docs`,
# plus the matching id / sentiment / intensity values in parallel lists.

# The stop-word corpus has to be present before stopwords.words() is called;
# the notebook's only other nltk.download call sits in a later cell, so a fresh
# "Restart & Run All" would otherwise fail here.
nltk.download('stopwords', quiet=True)

df = pd.read_csv('data/training.csv')
nlp = spacy.load('en_core_web_sm')

# Loop-invariant helpers: built once instead of once per tweet (tokenizer) or
# once per word (stop-word list). A frozenset also gives constant-time lookups.
tokenizer = RegexpTokenizer(r'\w+')
english_stopwords = frozenset(stopwords.words('english'))

docs, ids, sentiments, intensities = [], [], [], []
for _, row in tqdm(df.iterrows(), total=len(df)):
    words = tokenizer.tokenize(row['text'])
    # Stop-word matching is exact (case-sensitive), as before.
    input_text = " ".join(word for word in words if word not in english_stopwords)
    docs.append([(token.text, token.lemma_, token.pos_) for token in nlp(input_text)])
    # NOTE(review): each id is wrapped in a one-element list, so it is later
    # stored as e.g. "[10000]"; kept unchanged in case downstream code relies on it.
    ids.append([row['id']])
    sentiments.append(row['sentiment'])
    intensities.append(row['intensity'])

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=3613.0), HTML(value='')))




In [6]:
# Keep only nouns and adjectives of every tagged tweet and rebuild two
# space-separated strings per tweet: surface forms (`raw`) and lemmas (`lemma`).
kept_pos = ['NOUN', 'ADJ']
raw, lemma = [], []
for tagged_tokens in docs:
    # Each entry of a document is a (text, lemma, pos) triple.
    selected = [(text, lemma_form) for text, lemma_form, pos in tagged_tokens if pos in kept_pos]
    raw.append(" ".join(text for text, _ in selected))
    lemma.append(" ".join(lemma_form for _, lemma_form in selected))

In [7]:
# Assemble the preprocessed tweets: one row per tweet, columns in output order.
df = pd.DataFrame({
    'id': ids,
    'lemma': lemma,
    'raw': raw,
    'sentiment': sentiments,
    'intensity': intensities,
})

In [9]:
# Replace the emotion names by integer codes; any other label is left untouched.
sentiment_codes = {'anger': 0, 'fear': 1, 'joy': 2, 'sadness': 3}
for label, code in sentiment_codes.items():
    matches_label = df['sentiment'] == label
    df.loc[matches_label, 'sentiment'] = code
df

Unnamed: 0,id,lemma,raw,sentiment,intensity
0,[10000],heck fridge landlord door angry,heck fridge landlord door angry,0,0.938
1,[10001],driver word vehicle disgusted,driver word vehicle disgusted,0,0.896
2,[10002],parcel pick store address fuming poorcustomers...,parcel pick store address fuming poorcustomers...,0,0.896
3,[10003],butt wipe fire alarm asleep angry upset tired ...,butt wipe fire alarm asleep angry upset tired ...,0,0.896
4,[10004],phone talk rude money acc,phone talk rude money acc,0,0.896
...,...,...,...,...,...
3608,[40781],home happy,home happy,3,0.104
3609,[40782],winter duvet,winter duvet,3,0.104
3610,[40783],sky background purple highlight dull color great,sky background purple highlights dull colors g...,3,0.088
3611,[40784],second artist announcement good bluesfest2017 ...,second artist announcement good bluesfest2017 ...,3,0.083


In [10]:
# Persist the preprocessed tweets (without the positional index column).
preprocessed_tweets_csv = 'data/preprocessed_training.csv'
df.to_csv(preprocessed_tweets_csv, index=False)

## Songs preprocessing

In [10]:
# Fetch the NLTK stop-word corpus read by stopwords.words('english').
# NOTE(review): the tweet cell earlier in the notebook calls stopwords.words as
# well, so on a fresh kernel the corpus must already be available before that
# cell runs.
nltk.download('stopwords')

In [3]:
# Unpack the lyrics archive into the data folder; the call's return value is
# left as the last expression so the cell displays it.
# NOTE(review): patool appears to hand the work to an external archiver
# program, so one able to read .rar files presumably has to be installed.
lyrics_archive = "data/lyrics.rar"
patoolib.extract_archive(lyrics_archive, outdir="data/")

patool: Extracting data/lyrics.rar ...
patool: running "C:\Program Files\WinRAR\rar.EXE" x -- C:\Users\simone.quadrelli\Desktop\Setiment-analysis-of-song-lyrics\data\lyrics.rar
patool:     with cwd=data/
patool: ... data/lyrics.rar extracted to `data/'.


'data/'

In [None]:
# Tokenise every song's lyrics, drop English stop words and POS-tag the
# remainder with spaCy. Produces one list of (text, lemma, pos) triples per
# song in `docs`, plus the matching id / song / genre / artist values in
# parallel lists.
df = pd.read_csv('data/lyrics.csv')
df = df.dropna()  # rows with any missing field are discarded

# `df_all` is read by the later cell that builds the output frame but is not
# defined anywhere else in the notebook. Define it here as the cleaned lyrics
# frame with a fresh 0..n-1 index, so a column taken from it lines up row for
# row with the lists collected below (dropna() leaves gaps in the original index).
# NOTE(review): assumes `df_all` was meant to be this lyrics frame - confirm.
df_all = df.reset_index(drop=True)

nlp = spacy.load('en_core_web_sm')

# Loop-invariant helpers: built once instead of once per song (tokenizer) or
# once per word (stop-word list). A frozenset also gives constant-time lookups.
tokenizer = RegexpTokenizer(r'\w+')
english_stopwords = frozenset(stopwords.words('english'))

docs, songs, genres, artists, ids = [], [], [], [], []
for _, row in tqdm(df.iterrows(), total=len(df)):
    words = tokenizer.tokenize(row['lyrics'])
    # Stop-word matching is exact (case-sensitive), as before.
    input_text = " ".join(word for word in words if word not in english_stopwords)
    docs.append([(token.text, token.lemma_, token.pos_) for token in nlp(input_text)])
    # NOTE(review): each id is wrapped in a one-element list, as in the tweet
    # pipeline; kept unchanged in case downstream code relies on it.
    ids.append([row['index']])
    songs.append(row['song'])
    genres.append(row['genre'])
    # `artists` was declared but never filled; collect it with the other fields.
    # NOTE(review): assumes lyrics.csv has an 'artist' column - confirm.
    artists.append(row['artist'])

In [7]:
# Keep only nouns and adjectives of every tagged song and rebuild two
# space-separated strings per song: surface forms (`raw`) and lemmas (`lemma`).
kept_pos = ['NOUN', 'ADJ']
raw, lemma = [], []
for tagged_tokens in docs:
    # Each entry of a document is a (text, lemma, pos) triple.
    selected = [(text, lemma_form) for text, lemma_form, pos in tagged_tokens if pos in kept_pos]
    raw.append(" ".join(text for text, _ in selected))
    lemma.append(" ".join(lemma_form for _, lemma_form in selected))

In [8]:
# Assemble the preprocessed songs: one row per song, columns in output order.
df = pd.DataFrame({
    'id': ids,
    'lemma': lemma,
    'raw': raw,
    'song': songs,
    'genre': genres,
})
# Assigning a Series aligns on the index, not on position.
# NOTE(review): `df_all` must already exist in the kernel and share this
# frame's 0..n-1 index for the authors to match their songs - confirm.
df['author'] = df_all['artist']

In [5]:
# Persist the preprocessed songs (without the positional index column).
preprocessed_songs_csv = 'data/preprocessed_songs.csv'
df.to_csv(preprocessed_songs_csv, index=False)