In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import re

In [2]:
def clean_string_formatting(string):
    return string.replace('\n','').replace('\r',' ').strip()

In [3]:
country = pd.read_csv('cleaned_section_headers/country_lyrics_cleaned.csv')
hiphop = pd.read_csv('cleaned_section_headers/hiphop_lyrics_cleaned.csv')
pop = pd.read_csv('cleaned_section_headers/pop_lyrics_cleaned.csv')
rock = pd.read_csv('cleaned_section_headers/rock_lyrics_cleaned.csv')

In [4]:
#combines lyrics of all genres into one list to be input for tf-idf vectorizer
def get_all_lyrics():
    genre_arr = [country, hiphop, pop, rock]
    all_lyric_list = []
    for genre in genre_arr:
        cleaned_lyrics = genre['Lyrics'].map(clean_string_formatting)
        lyric_list = cleaned_lyrics.values.tolist()
        all_lyric_list += lyric_list
    return all_lyric_list
all_lyrics = get_all_lyrics()

In [9]:
#function to vectorize input using tf-idf with svd
def vectorize_lyrics(lyrics, svd_components):
    tfidf_vectorizer = TfidfVectorizer()
    full_tfidf = tfidf_vectorizer.fit_transform(lyrics)
    svd = TruncatedSVD(n_components=svd_components)
    dimension_reduced = svd.fit_transform(full_tfidf)
    return dimension_reduced

In [12]:
tfidf = vectorize_lyrics(all_lyrics, 500)
print(tfidf)

[[ 0.40564561  0.1194689  -0.0343111  ... -0.06337569  0.0313569
  -0.01910922]
 [ 0.19576104 -0.01975288  0.02499373 ...  0.0021029  -0.00601638
  -0.00181569]
 [ 0.35592756  0.03389266 -0.02922241 ... -0.03230092  0.01611221
  -0.02159172]
 ...
 [ 0.48782406  0.20400778 -0.04017088 ...  0.0311991  -0.00160807
  -0.02118098]
 [ 0.24567049 -0.0562395  -0.01684197 ... -0.0077448  -0.02155489
   0.0213618 ]
 [ 0.25877634  0.01369936  0.01670623 ... -0.01135136 -0.02079777
   0.01661971]]
