In [174]:
import os #needed for the API for example (to access environment variables)
from dotenv import load_dotenv #needed to load environmet variables to this project
import tensorflow as tf #for the RNN (recurring neural network)
import keras #for the RNN (recurring neural network)
from keras import layers #for the RNN (recurring neural network)
import numpy as np
import os #to interact with the folders of this project
import time #this will allow us to create a time buffer between each lyrics extraction 
import lyricsgenius as lg #(to work with the genius.com API)
import requests #to request data from Genius API (sending http requests)
import json #to read the extracted info from the API (encoding and decoding the info we got)
import pandas as pd
from requests.exceptions import SSLError #to handle exceptions when fetching data from Genius
import re
import logging #for the logbook I created to record the errors while fetching lyrics from the API genius
import nltk
from nltk.corpus import stopwords #from the nat.lang. tool kit, to choose what words not to consider in the analysis
from collections import Counter # when analysisng the words per song to be able to count inside the .txt the most common words
import string #this library helps us cleaning the lyrics to account only for characters based words and keep "400$" for example out

In [7]:
pd.set_option('display.max_columns', None)

### Load the songs_metadata_df:

In [205]:
songs_metadata_df = pd.read_csv("data/songs_metadata.csv")

## A formula to discard all the songs that we did not manage to extract lyrics to:

Reminding that: a big portion of these were not extracted (on purpose) for being live versions or remixes which do not add value to our analysis

In [206]:
# the songs are in our data folder:
data_folder = 'data'

def file_exists(row):
    track_id = row['genius_track_id']
    file_name = f'__{track_id}_{row["title"]}_{row["artist"]}.txt'
    file_path = os.path.join(data_folder, file_name)
    return os.path.exists(file_path)

songs_metadata_df['file_exists'] = songs_metadata_df.apply(file_exists, axis=1)
songs_metadata_df = songs_metadata_df[ (songs_metadata_df['file_exists'] == True)]
songs_metadata_df.reset_index(drop=True, inplace=True)

songs_metadata_df.head()

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists
0,1.5,i am﹥i was,2018,['Offset'],"['Nils', 'Wheezy']","['Nils', 'Wheezy', 'Offset', '21 Savage']",4155501,477314,21 Savage,True
1,H2O,Free Guwop EP,2015,[],"['Sonny Digital', 'Zaytoven']","['Zaytoven', 'Sonny Digital', '21 Savage']",2251254,134590,21 Savage,True
2,​gun smoke,i am﹥i was,2018,[],"['Freek van Workum', 'ItsNicklus', 'Kid Hazel']","['Freek van Workum', 'ItsNicklus', 'Kid Hazel'...",4155500,477314,21 Savage,True
3,​good day,i am﹥i was,2018,"['Project Pat', 'ScHoolboy Q']","['Cardo', '30 Roc']","['21 Savage', 'ScHoolboy Q', 'Project Pat', 'C...",4155511,477314,21 Savage,True
4,Gang,The Slaughter Tape,2015,[],['F12'],"['F12', '21 Savage']",2442069,133503,21 Savage,True


Exporting the DF

In [207]:
# csv_file_path = "data/songs_metadata_lyrics_existing_songs.csv"
# songs_metadata_df.to_csv(csv_file_path, index=False)

## I am further interested in assessing the vocabulary throughout the different hip hop eras. Let's add a column for this

what are the different hip hop eras?
Even thought there's not a *clear* consent, there seems to be and widely accepted fragmentation:
https://www.reddit.com/r/hiphop101/comments/rydtvv/how_would_you_separatedefine_the_different_eras/

In [208]:
songs_metadata_df['hip_hop_era'] = ""
def what_is_the_hh_era(release_date):
    if release_date < 1980:
        return "The Setup"
    elif release_date < 1985:
        return "Old School Hip Hop"
    elif release_date < 1998:
        return "Golden Age of Hip Hop"
    elif release_date < 2002:
        return "Shiny Suit/Bling Era Hip Hop"
    elif release_date < 2008:
        return "Crunk/Snap Hip Hop"
    elif release_date < 2015:
        return "Blog Era Hip Hop"
    else:
        return "Melodic/Mumble Era Hip Hop"

songs_metadata_df['hip_hop_era'] = songs_metadata_df['release_date'].apply(what_is_the_hh_era)
songs_metadata_df.head()


Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists,hip_hop_era
0,1.5,i am﹥i was,2018,['Offset'],"['Nils', 'Wheezy']","['Nils', 'Wheezy', 'Offset', '21 Savage']",4155501,477314,21 Savage,True,Melodic/Mumble Era Hip Hop
1,H2O,Free Guwop EP,2015,[],"['Sonny Digital', 'Zaytoven']","['Zaytoven', 'Sonny Digital', '21 Savage']",2251254,134590,21 Savage,True,Melodic/Mumble Era Hip Hop
2,​gun smoke,i am﹥i was,2018,[],"['Freek van Workum', 'ItsNicklus', 'Kid Hazel']","['Freek van Workum', 'ItsNicklus', 'Kid Hazel'...",4155500,477314,21 Savage,True,Melodic/Mumble Era Hip Hop
3,​good day,i am﹥i was,2018,"['Project Pat', 'ScHoolboy Q']","['Cardo', '30 Roc']","['21 Savage', 'ScHoolboy Q', 'Project Pat', 'C...",4155511,477314,21 Savage,True,Melodic/Mumble Era Hip Hop
4,Gang,The Slaughter Tape,2015,[],['F12'],"['F12', '21 Savage']",2442069,133503,21 Savage,True,Melodic/Mumble Era Hip Hop


## Let's count the words for every song and add them to our dataframe:

testing example

In [44]:
file_path = "data/__4155501_1.5_21 Savage.txt"

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        word_count = len(content.split())
        print(f"The number of words in the file '{file_path}' is: {word_count}")
except FileNotFoundError:
    print(f"The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

The number of words in the file 'data/__4155501_1.5_21 Savage.txt' is: 478


#### Actual Code

In [209]:
data_folder = "data"

for index, row in songs_metadata_df.iterrows():
    genius_track_id = row['genius_track_id']
    title = row['title']
    artist = row['artist']
    
    # Construct the file name
    file_name = f"__{genius_track_id}_{title}_{artist}.txt"
    file_path = os.path.join(data_folder, file_name)
    
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            word_count = len(content.split())
            
            songs_metadata_df.at[index, 'words_count'] = word_count
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

songs_metadata_df.tail()


Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists,hip_hop_era,words_count
0,1.5,i am﹥i was,2018,['Offset'],"['Nils', 'Wheezy']","['Nils', 'Wheezy', 'Offset', '21 Savage']",4155501,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,478.0
1,H2O,Free Guwop EP,2015,[],"['Sonny Digital', 'Zaytoven']","['Zaytoven', 'Sonny Digital', '21 Savage']",2251254,134590,21 Savage,True,Melodic/Mumble Era Hip Hop,434.0
2,​gun smoke,i am﹥i was,2018,[],"['Freek van Workum', 'ItsNicklus', 'Kid Hazel']","['Freek van Workum', 'ItsNicklus', 'Kid Hazel'...",4155500,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,526.0
3,​good day,i am﹥i was,2018,"['Project Pat', 'ScHoolboy Q']","['Cardo', '30 Roc']","['21 Savage', 'ScHoolboy Q', 'Project Pat', 'C...",4155511,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,628.0
4,Gang,The Slaughter Tape,2015,[],['F12'],"['F12', '21 Savage']",2442069,133503,21 Savage,True,Melodic/Mumble Era Hip Hop,539.0


In [212]:
songs_metadata_df.shape

(10642, 12)

## Let's filter out the rap songs that have an irregular number of words:

In [213]:
songs_metadata_df = songs_metadata_df[ (songs_metadata_df['words_count'] >= 150) & (songs_metadata_df['words_count'] <= 650)]
print(songs_metadata_df.shape)
songs_metadata_df.head()

(6950, 12)


Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists,hip_hop_era,words_count
0,1.5,i am﹥i was,2018,['Offset'],"['Nils', 'Wheezy']","['Nils', 'Wheezy', 'Offset', '21 Savage']",4155501,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,478.0
1,H2O,Free Guwop EP,2015,[],"['Sonny Digital', 'Zaytoven']","['Zaytoven', 'Sonny Digital', '21 Savage']",2251254,134590,21 Savage,True,Melodic/Mumble Era Hip Hop,434.0
2,​gun smoke,i am﹥i was,2018,[],"['Freek van Workum', 'ItsNicklus', 'Kid Hazel']","['Freek van Workum', 'ItsNicklus', 'Kid Hazel'...",4155500,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,526.0
3,​good day,i am﹥i was,2018,"['Project Pat', 'ScHoolboy Q']","['Cardo', '30 Roc']","['21 Savage', 'ScHoolboy Q', 'Project Pat', 'C...",4155511,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,628.0
4,Gang,The Slaughter Tape,2015,[],['F12'],"['F12', '21 Savage']",2442069,133503,21 Savage,True,Melodic/Mumble Era Hip Hop,539.0


In [47]:
print(songs_metadata_df["words_count"].max())
print(songs_metadata_df["words_count"].min())

650.0
150.0


How to decide what songs to leave out?
If we look at research on average number of words per rap song we can see that there's not an agreement on an established number, nor is it scientific. The best approach I found was actually this one from a Redditor:

![From redit: https://www.reddit.com/r/dataisbeautiful/comments/8j1r7b/words_per_song_for_rap_rock_and_country_music_oc/](./images/Words_per_song_RAP_-_reddit.png)

as we can see, the spread of the words count follows an approximate normal distribution with average 450. Let's (visually, as no more details are provided) set up a std dev of 300 account for all the lyrics within the range 150-650 words.
Data source: https://www.reddit.com/r/dataisbeautiful/comments/8j1r7b/words_per_song_for_rap_rock_and_country_music_oc/

## Let's count UNIQUE the words for every song and add them to our dataframe:

In [214]:
data_folder = "data"

songs_metadata_df['count_unique_words'] = 0

"""
we want to count the unique words and add them to a new column but we also want to keep the special characters away.
Somehow in previous tests "(hello" was a word and "hello" was another word.
"""

# Characters to exclude from words
exclude_chars = set('"(),\:?.!-"')  # Once again, we need to sanitize this as we've done with the artists' names

for index, row in songs_metadata_df.iterrows():
    genius_track_id = row['genius_track_id']
    title = row['title']
    artist = row['artist']

    file_name = f"__{genius_track_id}_{title}_{artist}.txt"
    file_path = os.path.join(data_folder, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()
            #spec chars off:
            sanitized_content = ''.join(char.lower() if char.lower() not in exclude_chars else ' ' for char in content)

            unique_word_count = len(set(sanitized_content.split()))

            #updating the col:
            songs_metadata_df.at[index, 'count_unique_words'] = unique_word_count
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

songs_metadata_df.tail(10)


Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists,hip_hop_era,words_count,count_unique_words
10629,Gucci Grocery Bag,BUSINESS IS BUSINESS,2023,[],['Aviator Keyyz'],"['Young Thug', 'Aviator Keyyz']",9263384,859407,Young Thug,True,Melodic/Mumble Era Hip Hop,589.0,179
10631,Haiti Slang,I Came from Nothing 2,2011,[],['DJ Swamp Izzo'],['Young Thug'],656901,88240,Young Thug,True,Blog Era Hip Hop,522.0,242
10632,Halftime,Barter 6,2015,[],['Kip Hilson'],"['Young Thug', 'Kip Hilson']",731048,122065,Young Thug,True,Melodic/Mumble Era Hip Hop,612.0,273
10633,Harambe,JEFFERY,2016,[],['Billboard Hitmakers'],"['Young Thug', 'Johnny Bravo', 'E-Dubb Da Mons...",2844780,162208,Young Thug,True,Melodic/Mumble Era Hip Hop,589.0,207
10634,Hate The Game,Punk,2021,[],"['Louis Bell', '\u200bwatt']","['\u200bwatt', 'Louis Bell', 'Young Thug']",6759270,764670,Young Thug,True,Melodic/Mumble Era Hip Hop,478.0,157
10635,Hellcat Kenny,BUSINESS IS BUSINESS,2023,['Lil Uzi Vert'],"['F1LTHY', 'Jonah Abraham']","['Young Thug', 'Lil Uzi Vert', 'F1LTHY', 'Jona...",9263016,859407,Young Thug,True,Melodic/Mumble Era Hip Hop,512.0,231
10637,"Hey, I",Slime Season 2,2015,[],['London on da Track'],"['London on da Track', 'Young Thug']",1912273,134201,Young Thug,True,Melodic/Mumble Era Hip Hop,461.0,182
10638,High,On the Rvn,2018,['Elton John'],['Stelios Phili'],"['Charlie Handsome', 'Rex Kudo', 'Stelios Phil...",3911420,460279,Young Thug,True,Melodic/Mumble Era Hip Hop,485.0,203
10639,HollyHood,I Came from Nothing 3,2012,"['Aston Martin Phi', 'Skypad War']",['DJ Swamp Izzo'],"['Aston Martin Phi', 'Skypad War', 'Young Thug']",2165949,18957,Young Thug,True,Blog Era Hip Hop,523.0,187
10640,Global Access,BUSINESS IS BUSINESS,2023,['Nate Ruess'],"['Metro Boomin', 'Peter Lee Johnson', 'Wheezy'...","['Young Thug', 'Nate Ruess', 'Metro Boomin', '...",9263046,859407,Young Thug,True,Melodic/Mumble Era Hip Hop,487.0,201


#### I am now interested in knowing how wide is the rapper's vocabulary but I don't want to take into account "stop words". NKPK has got some features that will help us filter these out

 https://en.wikipedia.org/wiki/Stop_word

So basically stop words (or negative dictionaries) are lists of words that, in NLP, get filtered out for not being considered to add value to the analysis. There's not a consensus on the amount or what types of words to remove and these lists vary greatly in size (from 12-200 words per list). 

## Let's sanitize word counts, remove the "stop words" and find the top 5 most repeated words across each song and their respective count:

In [215]:
nltk.download('stopwords')
# Define a list of stop words
stop_words = set(stopwords.words('english'))
#the imported library only gives us so much. After the first couple tries, I saw some other stuff I don't want there
additional_stop_words = ["*","get","ft","wu","niggas","&","na","'em","la","ain't","like","got","I'm", "i'm","gg", "gon'", "i'll", "da", "p", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "qb"]
stop_words.update(additional_stop_words)
print(stop_words)

{'above', 'here', 'j', 'not', 'your', 'their', 'yours', 'during', "didn't", 'between', 'am', 'is', '*', 'you', 'p', "shan't", 'mightn', "you'll", 'were', 'a', 'didn', 'will', 'itself', 'got', "I'm", 'c', 'f', 'have', 'wasn', "aren't", 'm', 'those', "wouldn't", 'they', 'hadn', 't', 'and', 'its', 'of', 'which', 'before', 'k', "weren't", 'or', 'with', 'only', "won't", 'w', 'while', 'all', 'wu', 'other', "i'll", 'i', 'just', 'couldn', 'qb', 'has', "you're", 'off', 'over', 'how', 'so', 'can', 'n', 'myself', 'r', 'our', 'b', 'needn', 'no', 'more', 'the', 'he', 'are', 'get', 'do', 'below', 'why', 'h', 'out', 'nor', 'niggas', 'should', "needn't", "'em", 'own', 'haven', 'this', 'had', 'being', 'into', 'himself', "she's", 'where', 'an', 'like', 'ourselves', 'ma', 'by', 'both', 'hasn', 'further', 'what', 'on', 'doesn', 'ft', 'herself', 'from', 'hers', 'him', 'in', 'q', 'was', 'her', '&', 'v', "ain't", 'mustn', 'about', 'if', 're', 'me', 'such', 'that', "don't", "mustn't", "should've", 'as', 'whom

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mocid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Following code essentially creates the placeholders for the top 5 most frequently used words, computes them and fills in the respective counts while sanitizing words for extra characters

In [219]:
data_folder = "data"
songs_metadata_df['most_repeated_word'] = "" 
songs_metadata_df['most_repeated_word_count'] = 0  
songs_metadata_df['top2_word'] = ""
songs_metadata_df['top2_word_count'] = 0  
songs_metadata_df['top3_word'] = "" 
songs_metadata_df['top3_word_count'] = 0  
songs_metadata_df['top4_word'] = ""  
songs_metadata_df['top4_word_count'] = 0  
songs_metadata_df['top5_word'] = ""
songs_metadata_df['top5_word_count'] = 0 

nltk.download('stopwords')
# Define a list of stop words
stop_words = set(stopwords.words('english'))
#the imported library only gives us so much. After the first couple tries, I saw some other stuff I don't want there
additional_stop_words = ["*","get","ft","wu","niggas","&","na","'em","la","ain't","like","got","I'm", "i'm","gg", "gon'", "i'll", "da", "p", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "qb"]
stop_words.update(additional_stop_words)

# Characters to exclude from words
exclude_chars = set('"(),\:?.!-"')  # Once again, we need to sanitize this as we've done with the artists' names


for index, row in songs_metadata_df.iterrows(): 
    genius_track_id = row['genius_track_id']
    title = row['title']
    artist = row['artist']

    # Constructing the file name
    file_name = f"__{genius_track_id}_{title}_{artist}.txt"
    file_path = os.path.join(data_folder, file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

            #splitting the lyrics into words, excluding certain characters, tokenizing and splitting are here seen as synonyms
            words = ''.join(char.lower() if char.lower() not in exclude_chars else ' ' for char in content).split()

            #removing the stop words and making them all lower case because Love, LOVE and love are the same (ref. Kendrick Lamar DAMN)
            filtered_words = [word for word in words if word not in stop_words]

            # Count the occurrences of each word
            word_counts = Counter(filtered_words)

            # Get the most repeated words and their counts
            most_repeated_words = word_counts.most_common(5)

            # Update the columns
            songs_metadata_df.at[index, 'most_repeated_word'] = most_repeated_words[0][0]
            songs_metadata_df.at[index, 'most_repeated_word_count'] = most_repeated_words[0][1]
            for i in range(1, 5):
                col_name_word = f'top{i + 1}_word'
                col_name_count = f'top{i + 1}_word_count'
                songs_metadata_df.at[index, col_name_word] = most_repeated_words[i][0]
                songs_metadata_df.at[index, col_name_count] = most_repeated_words[i][1]

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

    # Add print statements to debug
    print(f"Processed row {index}: {file_path}")

        
# Display the updated DataFrame for the first 50 rows
songs_metadata_df.tail(50)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mocid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processed row 0: data\__4155501_1.5_21 Savage.txt
Processed row 1: data\__2251254_H2O_21 Savage.txt
Processed row 2: data\__4155500_​gun smoke_21 Savage.txt
Processed row 3: data\__4155511_​good day_21 Savage.txt
Processed row 4: data\__2442069_Gang_21 Savage.txt
Processed row 5: data\__2293672_Fuckin Niggaz Bitches_21 Savage.txt
Processed row 6: data\__2390679_Front Door_21 Savage.txt
Processed row 7: data\__2459294_Free Gucci_21 Savage.txt
Processed row 8: data\__3147473_Famous_21 Savage.txt
Processed row 9: data\__3147489_FaceTime_21 Savage.txt
Processed row 10: data\__2347009_Drip_21 Savage.txt
Processed row 11: data\__2347662_Drinkin and Driving_21 Savage.txt
Processed row 12: data\__2384389_Dirty K_21 Savage.txt
Processed row 13: data\__5552430_Dirty_21 Savage.txt
Processed row 14: data\__2444416_Deserve_21 Savage.txt
Processed row 15: data\__2842219_Heart So Cold_21 Savage.txt
Processed row 16: data\__3147525_Dead People_21 Savage.txt
Processed row 18: data\__2421000_Bitch Nigga

Processed row 373: data\__7007_Call Me_50 Cent.txt
Processed row 377: data\__6968_Hold Me Down_50 Cent.txt
Processed row 378: data\__379185_Hold On_50 Cent.txt
Processed row 379: data\__20778_Hollow Thru Him_50 Cent.txt
Processed row 381: data\__404295_Hustler_50 Cent.txt
Processed row 382: data\__7021_If Dead Men Could Talk_50 Cent.txt
Processed row 385: data\__6967_I Got Swag_50 Cent.txt
Processed row 386: data\__7109_I Gotta Win_50 Cent.txt
Processed row 387: data\__61243_I Just Wanna_50 Cent.txt
Processed row 388: data\__7110_I Line Niggas_50 Cent.txt
Processed row 389: data\__6966_Do You Think About Me_50 Cent.txt
Processed row 390: data\__233_In da Club_50 Cent.txt
Processed row 391: data\__2830_In My Hood_50 Cent.txt
Processed row 394: data\__413133_Everytime I Come Around_50 Cent.txt
Processed row 395: data\__6977_Fire_50 Cent.txt
Processed row 397: data\__441065_Flip on You_50 Cent.txt
Processed row 398: data\__6976_Follow My Lead_50 Cent.txt
Processed row 400: data\__6975_Ful

Processed row 695: data\__2829056_Seismic Waves_Atmosphere.txt
Processed row 696: data\__8446641_Sculpting With Fire_Atmosphere.txt
Processed row 697: data\__2459350_Salma Hayek_Atmosphere.txt
Processed row 700: data\__2651750_Ringo_Atmosphere.txt
Processed row 701: data\__7004866_Skull_Atmosphere.txt
Processed row 702: data\__8572_Scapegoat_Atmosphere.txt
Processed row 703: data\__5989820_Sleep Apnea_Atmosphere.txt
Processed row 705: data\__7004861_Sleepless_Atmosphere.txt
Processed row 706: data\__5678_Smart Went Crazy_Atmosphere.txt
Processed row 707: data\__7004856_Something_Atmosphere.txt
Processed row 708: data\__5095721_Son of Abyss_Atmosphere.txt
Processed row 710: data\__5989813_Space Is Safe_Atmosphere.txt
Processed row 711: data\__3916717_Specificity_Atmosphere.txt
Processed row 712: data\__417317_Star Shaped Heart_Atmosphere.txt
Processed row 713: data\__8475_Reflections_Atmosphere.txt
Processed row 716: data\__7004867_Nekst_Atmosphere.txt
Processed row 717: data\__5095713_

Processed row 939: data\__3079862_Chi Chi_Azealia Banks.txt
Processed row 940: data\__5236210_DILEMMA_Azealia Banks.txt
Processed row 941: data\__3120607_Chaos and Glory_Azealia Banks.txt
Processed row 942: data\__7180874_​pink panties_Baby Keem.txt
Processed row 943: data\__6243598_​patience interlude_Baby Keem.txt
Processed row 944: data\__4703566_ORANGE SODA_Baby Keem.txt
Processed row 945: data\__5627423_Options_Baby Keem.txt
Processed row 946: data\__4060553_Miss Charlotte_Baby Keem.txt
Processed row 947: data\__4621018_NOT MY BRO_Baby Keem.txt
Processed row 948: data\__4621015_MY EX_Baby Keem.txt
Processed row 949: data\__4621011_MOSHPIT_Baby Keem.txt
Processed row 950: data\__4060558_Monsters Dot Com_Baby Keem.txt
Processed row 951: data\__4060560_Register_Baby Keem.txt
Processed row 952: data\__4060556_Opinions_Baby Keem.txt
Processed row 953: data\__4621013_ROCKSTAR P_Baby Keem.txt
Processed row 954: data\__4620986_STATS_Baby Keem.txt
Processed row 955: data\__7175432_​scapego

Processed row 1307: data\__146915_Lost_Chance the Rapper.txt
Processed row 1308: data\__71140_Long Time Part 2_Chance the Rapper.txt
Processed row 1309: data\__2472237_Juke Jam_Chance the Rapper.txt
Processed row 1310: data\__72705_Juke Juke_Chance the Rapper.txt
Processed row 1311: data\__113663_Juice_Chance the Rapper.txt
Processed row 1312: data\__5499590_Instagram Song 6_Chance the Rapper.txt
Processed row 1313: data\__66798_Long Time_Chance the Rapper.txt
Processed row 1314: data\__4730281_The Big Day_Chance the Rapper.txt
Processed row 1315: data\__5499569_Instagram Song 5_Chance the Rapper.txt
Processed row 1318: data\__72700_Windows_Chance the Rapper.txt
Processed row 1320: data\__81730_U Got Me Fucked Up_Chance the Rapper.txt
Processed row 1321: data\__4730369_Town on the Hill_Chance the Rapper.txt
Processed row 1324: data\__4662042_Sun Come Down_Chance the Rapper.txt
Processed row 1325: data\__2472234_Summer Friends_Chance the Rapper.txt
Processed row 1329: data\__2472248_Sam

Processed row 1551: data\__6226881_Show Out_Chief Keef.txt
Processed row 1552: data\__2303323_Shorties_Chief Keef.txt
Processed row 1553: data\__2946143_Short_Chief Keef.txt
Processed row 1554: data\__396091_Shooters_Chief Keef.txt
Processed row 1555: data\__443378_Shifu_Chief Keef.txt
Processed row 1556: data\__2475460_She Like_Chief Keef.txt
Processed row 1557: data\__3598595_She a Freak_Chief Keef.txt
Processed row 1558: data\__738238_Sleepy_Chief Keef.txt
Processed row 1559: data\__2947798_Stand Down_Chief Keef.txt
Processed row 1560: data\__561503_Stupid_Chief Keef.txt
Processed row 1561: data\__2317207_Told Ya_Chief Keef.txt
Processed row 1562: data\__4026771_To Do With Me_Chief Keef.txt
Processed row 1563: data\__2378153_Time Up_Chief Keef.txt
Processed row 1564: data\__3294149_Ticket_Chief Keef.txt
Processed row 1565: data\__699532_Three Zero Zero_Chief Keef.txt
Processed row 1566: data\__2419859_Thotty Party_Chief Keef.txt
Processed row 1567: data\__252925_Thots Gone Krazy_Chi

Processed row 1784: data\__3106990_Going Home_Chief Keef.txt
Processed row 1786: data\__2354065_Get Your Mind Right_Chief Keef.txt
Processed row 1787: data\__2922429_Getting Dough_Chief Keef.txt
Processed row 1788: data\__2270230_Go Harder_Chief Keef.txt
Processed row 1789: data\__104791_Girl You Know_Chief Keef.txt
Processed row 1790: data\__4055104_Glatt_Chief Keef.txt
Processed row 1791: data\__3817574_Get This Money_Chief Keef.txt
Processed row 1792: data\__5004381_Glo Gang Arena_Chief Keef.txt
Processed row 1793: data\__2308878_Gloin_Chief Keef.txt
Processed row 1795: data\__3294150_Glory Bridge_Chief Keef.txt
Processed row 1796: data\__4026770_Glory Girl_Chief Keef.txt
Processed row 1797: data\__3020833_GLO_Chief Keef.txt
Processed row 1798: data\__58040_Outside_Childish Gambino.txt
Processed row 1799: data\__5787_Phat People_Childish Gambino.txt
Processed row 1801: data\__79125_One Up_Childish Gambino.txt
Processed row 1802: data\__530339_Poke_Childish Gambino.txt
Processed row 

Processed row 2073: data\__7406776_Sinister_Cordae.txt
Processed row 2074: data\__2937330_Never_Cordae.txt
Processed row 2079: data\__4144353_Nervous_Cordae.txt
Processed row 2080: data\__4144513_World Look At Me_Cordae.txt
Processed row 2081: data\__4144501_Wonders_Cordae.txt
Processed row 2082: data\__4708343_Wintertime_Cordae.txt
Processed row 2083: data\__5674586_Where Am I Headed_Cordae.txt
Processed row 2084: data\__7551921_Westlake High_Cordae.txt
Processed row 2085: data\__4710613_Way Back Home_Cordae.txt
Processed row 2086: data\__7579593_Want from Me_Cordae.txt
Processed row 2087: data\__3710307_Tough Decisions_Cordae.txt
Processed row 2088: data\__7579596_Today_Cordae.txt
Processed row 2089: data\__4710612_Thousand Words_Cordae.txt
Processed row 2090: data\__2937448_The Jungle_Cordae.txt
Processed row 2091: data\__4708344_Thanksgiving_Cordae.txt
Processed row 2092: data\__4144514_Text Back_Cordae.txt
Processed row 2093: data\__4144506_Sucker For Love_Cordae.txt
Processed row

Processed row 2438: data\__54441_Die Like a Rockstar_Danny Brown.txt
Processed row 2440: data\__4828209_Dirty Laundry_Danny Brown.txt
Processed row 2441: data\__54992_DNA_Danny Brown.txt
Processed row 2442: data\__9630080_Dark Sword Angel_Danny Brown.txt
Processed row 2443: data\__230517_Dope Fiend Rental_Danny Brown.txt
Processed row 2444: data\__68557_Cartier_Danny Brown.txt
Processed row 2446: data\__55043_30_Danny Brown.txt
Processed row 2448: data\__96094_8 Mile_Danny Brown.txt
Processed row 2449: data\__55034_Adderall Admiral_Danny Brown.txt
Processed row 2450: data\__55031_Bruiser Brigade_Danny Brown.txt
Processed row 2451: data\__103623_Bag Back_Danny Brown.txt
Processed row 2452: data\__9633345_Bass Jam_Danny Brown.txt
Processed row 2453: data\__4843772_Belly of The Beast_Danny Brown.txt
Processed row 2454: data\__4635370_Best Life_Danny Brown.txt
Processed row 2455: data\__269739_Black_Danny Brown.txt
Processed row 2456: data\__54699_Blunt After Blunt_Danny Brown.txt
Processe

Processed row 2805: data\__5433444_Deep Pockets_Drake.txt
Processed row 2807: data\__9592328_Daylight_Drake.txt
Processed row 2809: data\__58497_Crew Love_Drake.txt
Processed row 2810: data\__2464123_Controlla_Drake.txt
Processed row 2812: data\__217268_Connect_Drake.txt
Processed row 2813: data\__703732_Company_Drake.txt
Processed row 2815: data\__217280_Come Thru_Drake.txt
Processed row 2816: data\__57363_Come Real_Drake.txt
Processed row 2817: data\__1217_Comeback Season_Drake.txt
Processed row 2821: data\__815_Congratulations_Drake.txt
Processed row 2822: data\__5310327_Chicago Freestyle_Drake.txt
Processed row 2825: data\__9592354_Fear Of Heights_Drake.txt
Processed row 2827: data\__8110883_Falling Back_Drake.txt
Processed row 2828: data\__2890553_Fake Love_Drake.txt
Processed row 2829: data\__2448795_Faithful_Drake.txt
Processed row 2831: data\__3288_Faded_Drake.txt
Processed row 2833: data\__703741_Energy_Drake.txt
Processed row 2834: data\__3797175_Elevate_Drake.txt
Processed r

Processed row 3344: data\__63746_No Matter What_Future.txt
Processed row 3346: data\__2408568_No Charge_Future.txt
Processed row 3347: data\__71606_Parachute_Future.txt
Processed row 3348: data\__66401_Space Cadets_Future.txt
Processed row 3349: data\__2214964_Slave Master_Future.txt
Processed row 3350: data\__172303_We Made Our Own_Future.txt
Processed row 3351: data\__7917600_WE JUS WANNA GET HIGH_Future.txt
Processed row 3355: data\__7691326_VOODOO_Future.txt
Processed row 3357: data\__2999444_Use Me_Future.txt
Processed row 3358: data\__5562544_Up the River_Future.txt
Processed row 3359: data\__1141409_Upper Echelon_Future.txt
Processed row 3361: data\__58542_Unconditional Love_Future.txt
Processed row 3362: data\__5392349_Tycoon_Future.txt
Processed row 3363: data\__71649_Turn On the Lights_Future.txt
Processed row 3365: data\__405189_T-Shirt_Future.txt
Processed row 3366: data\__71642_Truth Gonna Hurt You_Future.txt
Processed row 3367: data\__2859956_Used to This_Future.txt
Proce

Processed row 3706: data\__27_Big Girl_Ghostface Killah.txt
Processed row 3707: data\__21150_Biscuits_Ghostface Killah.txt
Processed row 3710: data\__3355_Faster Blade_Ghostface Killah.txt
Processed row 3711: data\__51553_Ice_Ghostface Killah.txt
Processed row 3712: data\__2709_In tha Park_Ghostface Killah.txt
Processed row 3714: data\__2817977_Intro_Ghostface Killah.txt
Processed row 3715: data\__3901671_Introduction_Ghostface Killah.txt
Processed row 3717: data\__4701_How You Like Me Baby_Ghostface Killah.txt
Processed row 3720: data\__21122_Josephine_Ghostface Killah.txt
Processed row 3722: data\__21046_Killa Lipstick_Ghostface Killah.txt
Processed row 3724: data\__607774_Emergency Procedure_Ghostface Killah.txt
Processed row 3725: data\__21152_Kunta Fly Shit_Ghostface Killah.txt
Processed row 3726: data\__3901680_I Think I Saw a Ghost_Ghostface Killah.txt
Processed row 3727: data\__598318_Homicide_Ghostface Killah.txt
Processed row 3728: data\__21094_Fish_Ghostface Killah.txt
Proce

Processed row 3940: data\__5725531_Nasty_Gucci Mane.txt
Processed row 3941: data\__1316129_My Year_Gucci Mane.txt
Processed row 3943: data\__2462880_My Rims Dancin_Gucci Mane.txt
Processed row 3944: data\__690670_My Lil Gee_Gucci Mane.txt
Processed row 3945: data\__49203_My Kitchen_Gucci Mane.txt
Processed row 3947: data\__21913_My Chain_Gucci Mane.txt
Processed row 3948: data\__703693_My All_Gucci Mane.txt
Processed row 3950: data\__3362000_Mall_Gucci Mane.txt
Processed row 3953: data\__119938_Miracle_Gucci Mane.txt
Processed row 3955: data\__282612_Mention Me_Gucci Mane.txt
Processed row 3957: data\__83831_Missing_Gucci Mane.txt
Processed row 3958: data\__3201622_Members Only_Gucci Mane.txt
Processed row 3959: data\__651966_Medication_Gucci Mane.txt
Processed row 3960: data\__4557313_Mean That Shit_Gucci Mane.txt
Processed row 3963: data\__9345253_Married with Millions_Gucci Mane.txt
Processed row 3964: data\__6058400_Meeting_Gucci Mane.txt
Processed row 3966: data\__652943_Mob Shit_

Processed row 4262: data\__136248_Crazy Things_Gucci Mane.txt
Processed row 4263: data\__4161243_Crowd In Rage_Gucci Mane.txt
Processed row 4264: data\__2427037_Crush_Gucci Mane.txt
Processed row 4265: data\__4557212_Do Too Much_Gucci Mane.txt
Processed row 4268: data\__651957_Dope Money_Gucci Mane.txt
Processed row 4269: data\__212536_Dopeman_Gucci Mane.txt
Processed row 4270: data\__333353_Double_Gucci Mane.txt
Processed row 4271: data\__476889_Dope Lovve_Gucci Mane.txt
Processed row 4273: data\__8987633_Dopeboy Freestyle_Gucci Mane.txt
Processed row 4274: data\__2369429_Dope Love_Gucci Mane.txt
Processed row 4276: data\__695973_Down on That_Gucci Mane.txt
Processed row 4277: data\__513666_Down & Out_Gucci Mane.txt
Processed row 4278: data\__2427041_Dumb Fine_Gucci Mane.txt
Processed row 4279: data\__5112372_Drummer_Gucci Mane.txt
Processed row 4280: data\__716063_Drummaguwopuhhh_Gucci Mane.txt
Processed row 4281: data\__476381_Drugs Like You_Gucci Mane.txt
Processed row 4282: data\_

Processed row 4611: data\__522395_Guinea Pig_Gucci Mane.txt
Processed row 4612: data\__51407_Gucci the Eskimo_Gucci Mane.txt
Processed row 4614: data\__6917694_Gucci Coming 4 You_Gucci Mane.txt
Processed row 4615: data\__51991_Guilty_Gucci Mane.txt
Processed row 4616: data\__50913_Gucci 2 Time_Gucci Mane.txt
Processed row 4618: data\__741587_Go-Go Gadget_Gucci Mane.txt
Processed row 4619: data\__690710_Go For It_Gucci Mane.txt
Processed row 4620: data\__118517_Gods Witness_Gucci Mane.txt
Processed row 4621: data\__7015914_Go_Gucci Mane.txt
Processed row 4623: data\__64024_Girls Kissing Girls_Gucci Mane.txt
Processed row 4624: data\__1558883_Giant_Gucci Mane.txt
Processed row 4625: data\__1113007_Ghetto_Gucci Mane.txt
Processed row 4627: data\__1270128_Gucci Bandana_Gucci Mane.txt
Processed row 4628: data\__2390761_Get My Pistol_Gucci Mane.txt
Processed row 4629: data\__6900981_Gettin Cold_Gucci Mane.txt
Processed row 4630: data\__2379709_Goin_Gucci Mane.txt
Processed row 4631: data\__9

Processed row 5051: data\__2450_Pray_JAY_Z.txt
Processed row 5053: data\__268_Politics as Usual_JAY_Z.txt
Processed row 5054: data\__182393_Picasso Baby_JAY_Z.txt
Processed row 5057: data\__139186_Open Letter_JAY_Z.txt
Processed row 5059: data\__24456_Oh My God_JAY_Z.txt
Processed row 5061: data\__179701_Oceans_JAY_Z.txt
Processed row 5062: data\__24559_NYMP_JAY_Z.txt
Processed row 5063: data\__1274_No Hook_JAY_Z.txt
Processed row 5065: data\__813_Public Service Announcement_JAY_Z.txt
Processed row 5067: data\__4236_So Ambitious_JAY_Z.txt
Processed row 5068: data\__2583_Snoopy Track_JAY_Z.txt
Processed row 5070: data\__24517_Show You How_JAY_Z.txt
Processed row 5071: data\__919_Show Me What You Got_JAY_Z.txt
Processed row 5077: data\__1062_Reminder_JAY_Z.txt
Processed row 5082: data\__24513_Diamond Is Forever_JAY_Z.txt
Processed row 5083: data\__181082_BBC_JAY_Z.txt
Processed row 5086: data\__58499_B2K - Freestyle_JAY_Z.txt
Processed row 5089: data\__24502_As One_JAY_Z.txt
Processed ro

Processed row 5424: data\__61448_Higher Learning_Jeezy.txt
Processed row 5425: data\__6187465_Here We Go_Jeezy.txt
Processed row 5426: data\__48405_Hustlaz Ambition_Jeezy.txt
Processed row 5428: data\__102490_Just Got Word_Jeezy.txt
Processed row 5429: data\__55700_Jizzle_Jeezy.txt
Processed row 5430: data\__77955_Jeezy The Snowman_Jeezy.txt
Processed row 5431: data\__48379_J.E.E.Z.Y._Jeezy.txt
Processed row 5432: data\__2361029_J Bo_Jeezy.txt
Processed row 5433: data\__2894259_It Is What It Is_Jeezy.txt
Processed row 5434: data\__340803_I Remember_Jeezy.txt
Processed row 5435: data\__4841222_Introduction_Jeezy.txt
Processed row 5436: data\__108592_Intro_Jeezy.txt
Processed row 5437: data\__2882811_In the Air_Jeezy.txt
Processed row 5438: data\__307637_Insane_Jeezy.txt
Processed row 5439: data\__811514_In da Wall_Jeezy.txt
Processed row 5440: data\__3351455_In a Major Way_Jeezy.txt
Processed row 5441: data\__9683721_I Might Forgive_Jeezy.txt
Processed row 5442: data\__1899315_I Might_J

Processed row 5842: data\__5955219_24_Kanye West.txt
Processed row 5843: data\__38_Barry Bonds_Kanye West.txt
Processed row 5844: data\__5999243_Believe What I Say_Kanye West.txt
Processed row 5845: data\__688_Champion_Kanye West.txt
Processed row 5847: data\__2351_Celebration_Kanye West.txt
Processed row 5848: data\__7667906_Broken Road_Kanye West.txt
Processed row 5849: data\__3137_Bring Me Down_Kanye West.txt
Processed row 5851: data\__158615_Bound 2_Kanye West.txt
Processed row 5852: data\__25751_Bonnie & Clyde Freestyle_Kanye West.txt
Processed row 5853: data\__167180_Blood On the Leaves_Kanye West.txt
Processed row 5855: data\__2662_Bittersweet Poetry_Kanye West.txt
Processed row 5857: data\__3205404_Friday Morning, May 25th, 2007_Kanye West.txt
Processed row 5859: data\__7716956_Get Lost_Kanye West.txt
Processed row 5861: data\__7151075_Junya pt 2_Kanye West.txt
Processed row 5862: data\__3399_Late_Kanye West.txt
Processed row 5863: data\__7016903_Junya_Kanye West.txt
Processed 

Processed row 6136: data\__6250767_Heaven on Earth_Kid Cudi.txt
Processed row 6137: data\__8392239_In Love_Kid Cudi.txt
Processed row 6138: data\__122383_Immortal_Kid Cudi.txt
Processed row 6139: data\__2867519_ILLusions_Kid Cudi.txt
Processed row 6141: data\__54301_I Do My Thing_Kid Cudi.txt
Processed row 6142: data\__2329_Hyyerr_Kid Cudi.txt
Processed row 6143: data\__59401_Higher Up_Kid Cudi.txt
Processed row 6144: data\__8392238_Ignite The Love_Kid Cudi.txt
Processed row 6147: data\__27577_I Used to Love Him_Lauryn Hill.txt
Processed row 6148: data\__27600_Just Want You Around_Lauryn Hill.txt
Processed row 6150: data\__27591_Mr. Intentional_Lauryn Hill.txt
Processed row 6151: data\__27580_Nothing Even Matters_Lauryn Hill.txt
Processed row 6152: data\__27596_So Much Things to Say_Lauryn Hill.txt
Processed row 6154: data\__27583_Tell Him_Lauryn Hill.txt
Processed row 6155: data\__27588_The Conquering Lion_Lauryn Hill.txt
Processed row 6156: data\__27584_The Miseducation of Lauryn Hil

Processed row 6544: data\__70106_What He Does_Lil Wayne.txt
Processed row 6545: data\__28644_What U Kno_Lil Wayne.txt
Processed row 6547: data\__8913331_Where da Cash At_Lil Wayne.txt
Processed row 6549: data\__28596_Where You At_Lil Wayne.txt
Processed row 6550: data\__8913095_Whip Game_Lil Wayne.txt
Processed row 6553: data\__235578_Whoever You Like_Lil Wayne.txt
Processed row 6558: data\__583448_Secretary_Lil Wayne.txt
Processed row 6559: data\__54738_Two Shots_Lil Wayne.txt
Processed row 6560: data\__6217907_Something Different_Lil Wayne.txt
Processed row 6561: data\__2419_Something You Forgot_Lil Wayne.txt
Processed row 6562: data\__52289_Sorry 4 the Wait_Lil Wayne.txt
Processed row 6564: data\__49159_So Special_Lil Wayne.txt
Processed row 6565: data\__143161_Soul Survivor_Lil Wayne.txt
Processed row 6567: data\__2415_SportsCenter_Lil Wayne.txt
Processed row 6568: data\__366461_So Many Places_Lil Wayne.txt
Processed row 6569: data\__591887_Start a Fire_Lil Wayne.txt
Processed row 

Processed row 6959: data\__6446309_Stunt Double_Lil Yachty.txt
Processed row 6960: data\__2823622_Such Ease_Lil Yachty.txt
Processed row 6961: data\__3102140_Surrender_Lil Yachty.txt
Processed row 6962: data\__3565027_TALK TO ME NICE_Lil Yachty.txt
Processed row 6963: data\__8763580_The Alchemist._Lil Yachty.txt
Processed row 6965: data\__8763574_​the ride-_Lil Yachty.txt
Processed row 6966: data\__8763576_THE zone~_Lil Yachty.txt
Processed row 6968: data\__7125752_Three Six Talk_Lil Yachty.txt
Processed row 6970: data\__3060693_Moments in Time_Lil Yachty.txt
Processed row 6971: data\__3539595_MICKEY_Lil Yachty.txt
Processed row 6972: data\__3565033_DAS CAP_Lil Yachty.txt
Processed row 6974: data\__2823617_DipSet_Lil Yachty.txt
Processed row 6975: data\__3060677_Dirty Mouth_Lil Yachty.txt
Processed row 6976: data\__3060674_DN Freestyle_Lil Yachty.txt
Processed row 6977: data\__3540537_COUNT ME IN_Lil Yachty.txt
Processed row 6978: data\__6554411_Faizon_Lil Yachty.txt
Processed row 6980

Processed row 7338: data\__9439649_Antidote_Logic.txt
Processed row 7339: data\__9532234_A message from my younger self_Logic.txt
Processed row 7341: data\__197351_Alright_Logic.txt
Processed row 7343: data\__69740_Back and Forth_Logic.txt
Processed row 7345: data\__8699445_Insipio_Logic.txt
Processed row 7346: data\__7042850_Inside_Logic.txt
Processed row 7348: data\__8076670_In my lifetime_Logic.txt
Processed row 7349: data\__3050781_Ink Blot_Logic.txt
Processed row 7351: data\__72929_Inception_Logic.txt
Processed row 7352: data\__73851_I Made It_Logic.txt
Processed row 7354: data\__4409626_I Love You Forever_Logic.txt
Processed row 7356: data\__98659_I Just Wanna_Logic.txt
Processed row 7357: data\__8076679_I guess I love it_Logic.txt
Processed row 7358: data\__8076721_Introducing Nezi_Logic.txt
Processed row 7359: data\__141886_I Want It All_Logic.txt
Processed row 7360: data\__75813_Life of a Don_Logic.txt
Processed row 7361: data\__4422298_DeLorean_Logic.txt
Processed row 7362: d

Processed row 7680: data\__5849_All Around The World_Mac Miller.txt
Processed row 7682: data\__3835477_2009_Mac Miller.txt
Processed row 7683: data\__69522_1 Threw 8_Mac Miller.txt
Processed row 7685: data\__56397_Blue Slide Park_Mac Miller.txt
Processed row 7686: data\__5162486_Blue World_Mac Miller.txt
Processed row 7688: data\__59680_Boom Bap Rap_Mac Miller.txt
Processed row 7689: data\__4240_Donald Trump_Mac Miller.txt
Processed row 7690: data\__56406_Diamonds & Gold_Mac Miller.txt
Processed row 7691: data\__359814_Diablo_Mac Miller.txt
Processed row 7692: data\__69127_Desperado_Mac Miller.txt
Processed row 7694: data\__2305087_Cut the Check_Mac Miller.txt
Processed row 7695: data\__3835472_Conversation, Pt. 1_Mac Miller.txt
Processed row 7696: data\__2843734_Congratulations_Mac Miller.txt
Processed row 7697: data\__5162485_Complicated_Mac Miller.txt
Processed row 7698: data\__187433_Doobie Ashtray 2K10_Mac Miller.txt
Processed row 7699: data\__426867_Colors and Shapes_Mac Miller.t

Processed row 7967: data\__8285413_Who Me_Megan Thee Stallion.txt
Processed row 7968: data\__3772224_Make a Bag_Megan Thee Stallion.txt
Processed row 7969: data\__5319977_Savage_Megan Thee Stallion.txt
Processed row 7971: data\__4659982_Right Back_Megan Thee Stallion.txt
Processed row 7972: data\__4544214_Running Up Freestyle_Megan Thee Stallion.txt
Processed row 7975: data\__6104067_Megan Monday Freestyle_Megan Thee Stallion.txt
Processed row 7976: data\__3772181_Mustard & Mayonaise_Megan Thee Stallion.txt
Processed row 7979: data\__3772221_Neva_Megan Thee Stallion.txt
Processed row 7980: data\__8285400_Not Nice_Megan Thee Stallion.txt
Processed row 7981: data\__7329743_Opposite Day_Megan Thee Stallion.txt
Processed row 7982: data\__8285399_Ms. Nasty_Megan Thee Stallion.txt
Processed row 7983: data\__4544193_Realer_Megan Thee Stallion.txt
Processed row 7984: data\__6180692_Outside_Megan Thee Stallion.txt
Processed row 7985: data\__8285398_Red Wine_Megan Thee Stallion.txt
Processed row

Processed row 8492: data\__7519228_Wave Gods_Nas.txt
Processed row 8494: data\__687_Where Are They Now_Nas.txt
Processed row 8496: data\__34772_War_Nas.txt
Processed row 8498: data\__4672342_Vernon Family_Nas.txt
Processed row 8499: data\__4180_Untitled_Nas.txt
Processed row 8501: data\__5864557_Ultra Black_Nas.txt
Processed row 8502: data\__34816_U Gotta Love It_Nas.txt
Processed row 8503: data\__4672351_War Against Love_Nas.txt
Processed row 8504: data\__3780017_White Label_Nas.txt
Processed row 8505: data\__34791_Zone Out_Nas.txt
Processed row 8507: data\__4672356_You Mean the World to Me_Nas.txt
Processed row 8508: data\__34770_You Know My Style_Nas.txt
Processed row 8510: data\__7519227_Wu for the Children_Nas.txt
Processed row 8512: data\__53063_Who Are You_Nas.txt
Processed row 8513: data\__3686_Suicide Bounce_Nas.txt
Processed row 8514: data\__7519224_Ugly_Nas.txt
Processed row 8515: data\__9520258_TSK_Nas.txt
Processed row 8517: data\__6455127_The G.O.D._Nas.txt
Processed row 

Processed row 8904: data\__36283_Bowtie_OutKast.txt
Processed row 8905: data\__36248_BuggFace_OutKast.txt
Processed row 8906: data\__1270_Babylon_OutKast.txt
Processed row 8912: data\__1867_Jazzy Belle_OutKast.txt
Processed row 8913: data\__538016_Interlude_OutKast.txt
Processed row 8914: data\__1811647_In Your Dreams_OutKast.txt
Processed row 8915: data\__36250_Greatest Show on Earth_OutKast.txt
Processed row 8917: data\__36273_Funky Ride_OutKast.txt
Processed row 8920: data\__2461_GhettoMusick_OutKast.txt
Processed row 8922: data\__4179307_Place_Playboi Carti.txt
Processed row 8923: data\__6210730_Over_Playboi Carti.txt
Processed row 8924: data\__3053535_Other Shit_Playboi Carti.txt
Processed row 8925: data\__6303399_On That Time_Playboi Carti.txt
Processed row 8926: data\__3710584_Old Money_Playboi Carti.txt
Processed row 8927: data\__3053536_NO. 9_Playboi Carti.txt
Processed row 8928: data\__3710718_No Time_Playboi Carti.txt
Processed row 8929: data\__6303401_No Sl33p_Playboi Carti

Processed row 9239: data\__218134_40 Acres_Pusha T.txt
Processed row 9240: data\__7627286_Dreamin Of The Past_Pusha T.txt
Processed row 9241: data\__56486_Everything That Glitters_Pusha T.txt
Processed row 9242: data\__7894101_Call My Bluff_Pusha T.txt
Processed row 9243: data\__2369618_F.I.F.A._Pusha T.txt
Processed row 9244: data\__5795_Feeling Myself_Pusha T.txt
Processed row 9245: data\__7902121_Let The Smokers Shine The Coupes_Pusha T.txt
Processed row 9246: data\__222426_Let Me Love You_Pusha T.txt
Processed row 9247: data\__2376848_Keep Dealing_Pusha T.txt
Processed row 9248: data\__7919057_Just So You Remember_Pusha T.txt
Processed row 9249: data\__5788_I Still Wanna_Pusha T.txt
Processed row 9250: data\__7898834_I Pray For You_Pusha T.txt
Processed row 9251: data\__222102_King Push_Pusha T.txt
Processed row 9252: data\__3721023_Infrared_Pusha T.txt
Processed row 9253: data\__3721012_If You Know You Know_Pusha T.txt
Processed row 9254: data\__112956_I Am Forgiven_Pusha T.txt
Pr

Processed row 9500: data\__438_Treat Her Like a Prostitute_Slick Rick.txt
Processed row 9501: data\__42761_Unify_Slick Rick.txt
Processed row 9502: data\__42731_Venus_Slick Rick.txt
Processed row 9503: data\__42753_Why, Why, Why_Slick Rick.txt
Processed row 9504: data\__42744_King Piece in the Chess Game_Slick Rick.txt
Processed row 9506: data\__42724_King_Slick Rick.txt
Processed row 9507: data\__42740_Kill Niggaz_Slick Rick.txt
Processed row 9509: data\__42710_Behind Bars_Slick Rick.txt
Processed row 9510: data\__42723_Bond_Slick Rick.txt
Processed row 9513: data\__42736_Frozen_Slick Rick.txt
Processed row 9514: data\__42721_Get a Job_Slick Rick.txt
Processed row 9517: data\__42702_Indian Girl - An Adult Story_Slick Rick.txt
Processed row 9519: data\__3645663_Praise Him_Snoop Dogg.txt
Processed row 9520: data\__42965_Press Play_Snoop Dogg.txt
Processed row 9521: data\__43054_Promise I_Snoop Dogg.txt
Processed row 9522: data\__3007250_Promise You This_Snoop Dogg.txt
Processed row 9523

Processed row 9791: data\__42949_Bathtub_Snoop Dogg.txt
Processed row 9792: data\__43042_Bang Out_Snoop Dogg.txt
Processed row 9793: data\__3493338_Blessing Me Again_Snoop Dogg.txt
Processed row 9795: data\__1779938_Deez Hollywood Nights_Snoop Dogg.txt
Processed row 9798: data\__43004_Issues_Snoop Dogg.txt
Processed row 9799: data\__43133_Intrology_Snoop Dogg.txt
Processed row 9800: data\__2391638_Intro_Snoop Dogg.txt
Processed row 9801: data\__3529363_In the Name of Jesus_Snoop Dogg.txt
Processed row 9803: data\__43034_I Miss That Bitch_Snoop Dogg.txt
Processed row 9804: data\__5466_Imagine_Snoop Dogg.txt
Processed row 9805: data\__43150_I Love My Momma_Snoop Dogg.txt
Processed row 9806: data\__1779951_I Knew That_Snoop Dogg.txt
Processed row 9807: data\__4779571_I C Your Bullshit_Snoop Dogg.txt
Processed row 9810: data\__447855_House Shoes_Snoop Dogg.txt
Processed row 9811: data\__7688149_House I Built_Snoop Dogg.txt
Processed row 9812: data\__43037_Hourglass_Snoop Dogg.txt
Processed

Processed row 10198: data\__6921200_THE SHINING_Vince Staples.txt
Processed row 10199: data\__7870684_THE BLUES_Vince Staples.txt
Processed row 10200: data\__7870667_THE BEACH_Vince Staples.txt
Processed row 10201: data\__6968027_THE APPLE & THE TREE_Vince Staples.txt
Processed row 10202: data\__52334_Taxi_Vince Staples.txt
Processed row 10203: data\__6968026_TAKING TRIPS_Vince Staples.txt
Processed row 10204: data\__62587_Swiss Army_Vince Staples.txt
Processed row 10205: data\__2032827_Surf_Vince Staples.txt
Processed row 10206: data\__60784_Super_Vince Staples.txt
Processed row 10207: data\__6968025_SUNDOWN TOWN_Vince Staples.txt
Processed row 10208: data\__2032815_Summertime_Vince Staples.txt
Processed row 10209: data\__2032837_Street Punks_Vince Staples.txt
Processed row 10210: data\__62565_SOB_Vince Staples.txt
Processed row 10211: data\__7870678_SLIDE_Vince Staples.txt
Processed row 10212: data\__6968029_MHM_Vince Staples.txt
Processed row 10213: data\__7870680_PAPERCUTS_Vince St

Processed row 10540: data\__654294_Dead Fo Real_Young Thug.txt
Processed row 10542: data\__2437013_Digits_Young Thug.txt
Processed row 10543: data\__786788_Dome_Young Thug.txt
Processed row 10544: data\__455596_Donald Trump_Young Thug.txt
Processed row 10546: data\__3121336_Do U Love Me_Young Thug.txt
Processed row 10548: data\__795378_Dream_Young Thug.txt
Processed row 10549: data\__595614_Drop Bars_Young Thug.txt
Processed row 10551: data\__4786043_Ecstasy_Young Thug.txt
Processed row 10552: data\__2310923_Epic_Young Thug.txt
Processed row 10553: data\__7290587_Faces_Young Thug.txt
Processed row 10554: data\__2418041_Family_Young Thug.txt
Processed row 10558: data\__507566_You the World_Young Thug.txt
Processed row 10559: data\__474243_1017 Lifestyle_Young Thug.txt
Processed row 10560: data\__141521_2 Cups Stuffed_Young Thug.txt
Processed row 10561: data\__529773_730_Young Thug.txt
Processed row 10562: data\__510076_911_Young Thug.txt
Processed row 10563: data\__684910_Above Dem Nigg

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists,hip_hop_era,words_count,count_unique_words,most_repeated_word,most_repeated_word_count,top2_word,top2_word_count,top3_word,top3_word_count,top4_word,top4_word_count,top5_word,top5_word_count
10564,Abracadabra,BUSINESS IS BUSINESS,2023,['Travis Scott'],"['Metro Boomin', 'London on da Track']","['Young Thug', 'Travis Scott', 'Metro Boomin',...",9263160,859407,Young Thug,True,Melodic/Mumble Era Hip Hop,623.0,202,yeah,26,woo,15,ha,14,woah,6,lookin',6
10566,Admit It,I Came from Nothing 3,2012,"['Kosher', 'Skypad War']",['DJ Suede The Remix God'],"['DJ Suede The Remix God', 'Young Thug', 'Skyp...",213292,18957,Young Thug,True,Blog Era Hip Hop,509.0,273,im,8,yo,5,gone,5,baby,4,admit,4
10567,Again,Slime Season,2015,['Gucci Mane'],['London on da Track'],"['London on da Track', 'Young Thug', 'Gucci Ma...",2149787,126701,Young Thug,True,Melodic/Mumble Era Hip Hop,507.0,219,baby,9,know,7,woo,7,nigga,6,guwop,5
10568,Alphabetical Order,1017 Thug 3: The Finale,2014,[],['Metro Boomin'],"['Metro Boomin', 'Young Thug']",510041,111193,Young Thug,True,Blog Era Hip Hop,338.0,161,see,7,wanna,6,please,6,waste,6,time,6
10569,Amazing,Barter 6,2015,['Jacquees'],['Wheezy'],"['Jacquees', 'Allee Willis', 'Maurice White', ...",802399,122065,Young Thug,True,Melodic/Mumble Era Hip Hop,613.0,280,know,9,might,8,bitch,7,amazin',5,wearin',5
10570,Angry Sex,I Came from Nothing 3,2012,[],['Jay Leno Beatz'],"['Jay Leno Beatz', 'Young Thug']",2165968,18957,Young Thug,True,Blog Era Hip Hop,407.0,148,sex,30,angry,30,that's,12,ha,12,th,6
10571,Around My Way,1017 Thug 3: The Finale,2014,['Lil Duke'],"['DJ Spinz', 'C4Bombs']","['C4Bombs', 'DJ Spinz', 'Young Thug', 'Lil Duke']",508886,111193,Young Thug,True,Blog Era Hip Hop,372.0,179,gone,13,way,9,fuck,9,around,8,know,6
10572,Ball,1017 Thug,2013,['OG Boo Dirty'],['Memphis Track Boy'],"['Memphis Track Boy', 'Young Thug', 'OG Boo Di...",460230,32075,Young Thug,True,Blog Era Hip Hop,411.0,182,bitch,15,ball,10,nigga,8,wall,7,that's,5
10573,Beast,Slime Season 2,2015,[],['Wheezy'],"['Wheezy', 'Young Thug']",2344356,134201,Young Thug,True,Melodic/Mumble Era Hip Hop,474.0,205,nigga,19,motherfucking,16,beast,16,know,13,'cause,9
10575,Best Friend,Slime Season,2015,[],"['Yung Shad', 'Ricky Racks']","['Ricky Racks', 'Yung Shad', 'Young Thug']",2165110,126701,Young Thug,True,Melodic/Mumble Era Hip Hop,530.0,222,friend,13,bitch,12,best,12,nigga,10,that's,9


In [222]:
#export df:
csv_file_path = "data/songs_metadata_df_with_counts_and_tops.csv"
songs_metadata_df.to_csv(csv_file_path, index=False, encoding='utf-8-sig') 

## I will need to further create dataframes for each artist to assess their respective totals.

One thing is to assess (unique) word counts per song, another is checking the broader scope. I want to check each rappers' most frequent word, not just their songs'.

#### First compiling every lyrics into one single .txt for each artist:

Testing for one artist only:

In [155]:
artists_songs_metadata = [
    "2Pac", "21 Savage", "50 Cent", "A Tribe Called Quest",
    "A_AP Rocky", "Aesop Rock", "Atmosphere", "Azealia Banks",
    "Baby Keem", "Big Daddy Kane", "Big L", "Big Pun",
    "Big Sean", "Cardi B", "Chance the Rapper", "Chief Keef",
    "Childish Gambino", "Common", "Cordae", "Cypress Hill",
    "Danny Brown", "De La Soul", "DMX", "Doja Cat",
    "Dr_ Dre", "Drake", "Earl Sweatshirt", "Eminem",
    "Eric B_ _ Rakim", "Future", "Ghostface Killah", "Gucci Mane",
    "Hopsin", "Ice Cube", "Ice Spice", "J. Cole",
    "Jack Harlow", "JAY_Z", "Jeezy", "Joey Bada__",
    "Joyner Lucas", "Juice WRLD", "Kanye West", "Kendrick Lamar",
    "Kid Cudi", "KMD", "Lauryn Hill", "Lil Nas X",
    "Lil Uzi Vert", "Lil Wayne", "Lil Yachty", "Lil_ Kim",
    "Logic", "Lupe Fiasco", "Mac Miller", "Masta Ace",
    "Megan Thee Stallion", "MF DOOM", "Migos", "Missy Elliott",
    "Mobb Deep", "Nas", "Nipsey Hussle", "OutKast",
    "Playboi Carti", "Pop Smoke", "Public Enemy", "Pusha T",
    "Queen Latifah", "Run_DMC", "Scarface", "Slick Rick",
    "Snoop Dogg", "Talib Kweli", "The Notorious B.I.G.", "The Pharcyde",
    "Travis Scott", "Vince Staples", "Wu_Tang Clan", "Yasiin Bey",
    "Young Thug"
]

In [54]:
# data_folder = "data"
# #list of all .txt files in the data folder
# all_files = [file for file in os.listdir(data_folder) if file.endswith(".txt")]

# # Filter files where the artist is "50 Cent"
# _50_Cent_files = [file for file in all_files if "__50 Cent" in file]

# # Define the path for the output file
# output_file_path = os.path.join(data_folder, "_50 Cent_style.txt")

# # Function to read content while skipping the first row
# def read_file_with_skipped_first_row(file_path):
#     with open(file_path, 'r', encoding='utf-8') as file:
#         lines = file.readlines()[1:]  # Skip the first row
#         return ''.join(lines)

# # Combine the content of 2Pac files
# combined_content = ""
# for _50_Cent_file in _50_Cent_files:
#     file_path = os.path.join(data_folder, _50_Cent_file)
#     combined_content += read_file_with_skipped_first_row(file_path) + '\n'

# # Write the combined content to the output file with UTF-8 encoding
# with open(output_file_path, 'w', encoding='utf-8') as output_file:
#     output_file.write(combined_content)

# print("Files have been successfully combined for 50 Cent.")


Files have been successfully combined for 50 Cent.


#### This but for everyone: (commented out because I performed some manual cleaning on these files, some of them were corrupted and I don't want to overwrite them)

In [126]:
# #I was getting decoding errors for 5 artists, so I've added a twist to decode in binary

# artists_songs_metadata = [
#     "2Pac", "21 Savage", "50 Cent", "A Tribe Called Quest",
#     "A_AP Rocky", "Aesop Rock", "Atmosphere", "Azealia Banks",
#     "Baby Keem", "Big Daddy Kane", "Big L", "Big Pun",
#     "Big Sean", "Cardi B", "Chance the Rapper", "Chief Keef",
#     "Childish Gambino", "Common", "Cordae", "Cypress Hill",
#     "Danny Brown", "De La Soul", "DMX", "Doja Cat",
#     "Dr_ Dre", "Drake", "Earl Sweatshirt", "Eminem",
#     "Eric B_ _ Rakim", "Future", "Ghostface Killah", "Gucci Mane",
#     "Hopsin", "Ice Cube", "Ice Spice", "J. Cole",
#     "Jack Harlow", "JAY_Z", "Jeezy", "Joey Bada__",
#     "Joyner Lucas", "Juice WRLD", "Kanye West", "Kendrick Lamar",
#     "Kid Cudi", "KMD", "Lauryn Hill", "Lil Nas X",
#     "Lil Uzi Vert", "Lil Wayne", "Lil Yachty", "Lil_ Kim",
#     "Logic", "Lupe Fiasco", "Mac Miller", "Masta Ace",
#     "Megan Thee Stallion", "MF DOOM", "Migos", "Missy Elliott",
#     "Mobb Deep", "Nas", "Nipsey Hussle", "OutKast",
#     "Playboi Carti", "Pop Smoke", "Public Enemy", "Pusha T",
#     "Queen Latifah", "Run_DMC", "Scarface", "Slick Rick",
#     "Snoop Dogg", "Talib Kweli", "The Notorious B.I.G.", "The Pharcyde",
#     "Travis Scott", "Vince Staples", "Wu_Tang Clan", "Yasiin Bey",
#     "Young Thug"
# ]
# # Path to the data folder
# data_folder_path = "data"

# # Iterate through each artist
# for artist in artists_songs_metadata:
#     # Initialize an empty list to store the content of song files
#     song_content = []

#     # Iterate through files in the data folder
#     for filename in os.listdir(data_folder_path):
#         # Check if the file follows the naming convention for the current artist
#         if filename.startswith(f"__") and f"_{artist}" in filename and filename.endswith(".txt"):
#             # Read the content of the file, skipping the first row
#             with open(os.path.join(data_folder_path, filename), "rb") as file:
#                 # Decode the content using UTF-8
#                 content = file.read().decode("utf-8")
#                 song_content.extend(content.splitlines()[1:])

#     # Write the compiled content to the artist's style file
#     style_file_path = os.path.join(data_folder_path, f"{artist}_style.txt")
#     with open(style_file_path, "w", encoding="utf-8") as style_file:
#         style_file.write("\n".join(song_content))
    
#     print(f"Files have been successfully combined for {artist}.")
    

# print("Compilation complete.")


Files have been successfully combined for 2Pac.
Files have been successfully combined for 21 Savage.
Files have been successfully combined for 50 Cent.
Files have been successfully combined for A Tribe Called Quest.
Files have been successfully combined for A_AP Rocky.
Files have been successfully combined for Aesop Rock.
Files have been successfully combined for Atmosphere.
Files have been successfully combined for Azealia Banks.
Files have been successfully combined for Baby Keem.
Files have been successfully combined for Big Daddy Kane.
Files have been successfully combined for Big L.
Files have been successfully combined for Big Pun.
Files have been successfully combined for Big Sean.
Files have been successfully combined for Cardi B.
Files have been successfully combined for Chance the Rapper.
Files have been successfully combined for Chief Keef.
Files have been successfully combined for Childish Gambino.
Files have been successfully combined for Common.
Files have been successful

Additional manual cleaning was performed because lots of files were fetched corrupted.

#### Second, creating the dataframe and populating it accordingly

In [163]:
songs_metadata_df.head()

Unnamed: 0,title,album,release_date,featured_artists,producer_artists,writer_artists,genius_track_id,genius_album_id,artist,file_exists,hip_hop_era,words_count,count_unique_words,most_repeated_word,most_repeated_word_count,top2_word,top2_word_count,top3_word,top3_word_count,top4_word,top4_word_count,top5_word,top5_word_count
0,1.5,i am﹥i was,2018,['Offset'],"['Nils', 'Wheezy']","['Nils', 'Wheezy', 'Offset', '21 Savage']",4155501,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,478.0,207,5,9,1,8,ride,8,die,8,slide,8
1,H2O,Free Guwop EP,2015,[],"['Sonny Digital', 'Zaytoven']","['Zaytoven', 'Sonny Digital', '21 Savage']",2251254,134590,21 Savage,True,Melodic/Mumble Era Hip Hop,434.0,129,water,26,whipping,26,kitchen,25,brick,23,cooked,18
2,​gun smoke,i am﹥i was,2018,[],"['Freek van Workum', 'ItsNicklus', 'Kid Hazel']","['Freek van Workum', 'ItsNicklus', 'Kid Hazel'...",4155500,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,526.0,196,smoke,40,gun,37,yeah,15,god,10,straight,9
3,​good day,i am﹥i was,2018,"['Project Pat', 'ScHoolboy Q']","['Cardo', '30 Roc']","['21 Savage', 'ScHoolboy Q', 'Project Pat', 'C...",4155511,477314,21 Savage,True,Melodic/Mumble Era Hip Hop,628.0,225,day,27,today,24,good,24,murder,8,uh,8
4,Gang,The Slaughter Tape,2015,[],['F12'],"['F12', '21 Savage']",2442069,133503,21 Savage,True,Melodic/Mumble Era Hip Hop,539.0,149,gang,68,fuck,14,pull,7,know,7,bitch,5


In [177]:
print(stop_words)

{'above', 'here', 'j', 'not', 'your', 'their', 'yours', 'during', "didn't", 'between', 'am', 'is', '*', 'you', 'p', "shan't", 'mightn', "you'll", 'were', 'a', 'didn', 'will', 'itself', 'got', "I'm", 'c', 'f', 'have', 'wasn', "aren't", 'm', 'those', "wouldn't", 'they', 'hadn', 't', 'and', 'its', 'of', 'which', 'before', 'k', "weren't", 'or', 'with', 'only', "won't", 'w', 'while', 'all', 'wu', 'other', "i'll", 'i', 'just', 'couldn', 'qb', 'has', "you're", 'off', 'over', 'how', 'so', 'can', 'n', 'myself', 'r', 'our', 'b', 'needn', 'no', 'more', 'the', 'he', 'are', 'get', 'do', 'below', 'why', 'h', 'out', 'nor', 'niggas', 'should', "needn't", "'em", 'own', 'haven', 'this', 'had', 'being', 'into', 'himself', "she's", 'where', 'an', 'like', 'ourselves', 'ma', 'by', 'both', 'hasn', 'further', 'what', 'on', 'doesn', 'ft', 'herself', 'from', 'hers', 'him', 'in', 'q', 'was', 'her', '&', 'v', "ain't", 'mustn', 'about', 'if', 're', 'me', 'such', 'that', "don't", "mustn't", "should've", 'as', 'whom

Testing for one artist only:

In [164]:
#I'm creating a df for the artist:
_2Pac_df = pd.DataFrame(columns=[
    'artist',
    'number_of_albums',
    'number_of_songs',
    'number_of_words',
    'number_of_unique_words'])

#artist
#populating the artist column with their name:
_2Pac_df["artist"]=["2Pac"]
_2Pac_df

Unnamed: 0,artist,number_of_albums,number_of_songs,number_of_words,number_of_unique_words
0,2Pac,,,,


In [165]:
#I'm creating a df for the artist:
_2Pac_df = pd.DataFrame(columns=[
    'artist',
    'number_of_albums',
    'number_of_songs',
    'number_of_words',
    'number_of_unique_words'])

nltk.download('stopwords')
# Define a list of stop words
stop_words = set(stopwords.words('english'))
#the imported library only gives us so much. After the first couple tries, I saw some other stuff I don't want there
additional_stop_words = ["*","get","ft","wu","niggas","&","na","'em","la","ain't","like","got","I'm", "i'm","gg", "gon'", "i'll", "da", "p", "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z", "qb"]
stop_words.update(additional_stop_words)

#artist
#populating the artist column with their name:
_2Pac_df["artist"]=["2Pac"]

#number_of_albums
# populating the albums column with the unique values from the songs_metadata_df where the artist matches
unique_albums = songs_metadata_df[songs_metadata_df['artist'] == '2Pac']['album'].unique()
number_of_albums = len(unique_albums)
_2Pac_df["number_of_albums"]=number_of_albums

#number_of_songs
# populating the songs column with the unique values from the songs_metadata_df where the artist matches
unique_songs = songs_metadata_df[songs_metadata_df['artist'] == '2Pac']['title'].unique()
number_of_songs = len(unique_songs)
_2Pac_df["number_of_songs"]=number_of_songs

#number_of_words
# Characters to exclude from words
exclude_chars = set('"(),\:?.!-"')  # Once again, we need to sanitize this as we've done with the artists' names
file_name = "2Pac_style.txt"
file_path = os.path.join("data", file_name)

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
        # Removing special characters from content
        for char in exclude_chars:
            content = content.replace(char, '')
        
        words = content.split()
        word_count = len(words)
        print(f"The number of words in the file '{file_path}' is: {word_count}")
except FileNotFoundError:
    print(f"The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

_2Pac_df["number_of_words"]=word_count

#number_of_unique_words
# Characters to exclude from words
exclude_chars = set('"(),\:?.!-"')  # Once again, we need to sanitize this as we've done with the artists' names
file_name = "2Pac_style.txt"
file_path = os.path.join("data", file_name)

try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

        # Removing special characters from content
        for char in exclude_chars:
            content = content.replace(char, '')

        words = content.split()
        word_count = len(words)
        unique_word_count = len(set(words))  #basically doing the same but with the help of our dearest set type
        print(f"The number of unique words is: {unique_word_count}")
except FileNotFoundError:
    print(f"The file '{file_path}' was not found.")
except Exception as e:
    print(f"An error occurred: {e}")

_2Pac_df["number_of_unique_words"] = unique_word_count


#most_frequent_word and most_frequent_word_count 
_2Pac_df['top2_word'] = ""
_2Pac_df['top2_word_count'] = 0  
_2Pac_df['top3_word'] = "" 
_2Pac_df['top3_word_count'] = 0  
_2Pac_df['top4_word'] = ""  
_2Pac_df['top4_word_count'] = 0  
_2Pac_df['top5_word'] = ""
_2Pac_df['top5_word_count'] = 0 
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
        
        #I am basically going to reuse code from above
        words = ''.join(char.lower() if char.lower() not in exclude_chars else ' ' for char in content).split()
        filtered_words = [word for word in words if word not in stop_words]
        word_counts = Counter(filtered_words)
        most_repeated_words = word_counts.most_common(5)

        # Update the columns in _2Pac_df
        _2Pac_df.at[0, 'most_repeated_word'] = most_repeated_words[0][0]
        _2Pac_df.at[0, 'most_repeated_word_count'] = most_repeated_words[0][1]
        for i in range(1, 5):
            col_name_word = f'top{i + 1}_word'
            col_name_count = f'top{i + 1}_word_count'
            _2Pac_df.at[0, col_name_word] = most_repeated_words[i][0]
            _2Pac_df.at[0, col_name_count] = most_repeated_words[i][1]

except FileNotFoundError:
    print(f"File not found: {file_path}")
except Exception as e:
    print(f"An error occurred for file {file_path}: {e}")

_2Pac_df




[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mocid\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


The number of words in the file 'data\2Pac_style.txt' is: 180828
The number of unique words is: 11401


Unnamed: 0,artist,number_of_albums,number_of_songs,number_of_words,number_of_unique_words,top2_word,top2_word_count,top3_word,top3_word_count,top4_word,top4_word_count,top5_word,top5_word_count,most_repeated_word,most_repeated_word_count
0,2Pac,15,47,180828,11401,see,882,life,833,know,736,fuck,655,nigga,1296.0


#### Third, creating a placeholder dataframe to receive all of the artists concatenated df and iterating through the list to receive all the artists dataframe:

In [191]:
all_artists_stats = pd.DataFrame(columns=[
    'artist',
    'number_of_albums',
    'number_of_songs',
    'number_of_words',
    'number_of_unique_words','most_repeated_word','most_repeated_word_count','top2_word','top2_word_count',
    'top3_word','top3_word_count','top4_word','top4_word_count','top5_word','top5_word_count'])
all_artists_stats


Unnamed: 0,artist,number_of_albums,number_of_songs,number_of_words,number_of_unique_words,most_repeated_word,most_repeated_word_count,top2_word,top2_word_count,top3_word,top3_word_count,top4_word,top4_word_count,top5_word,top5_word_count


#### Fourth, the process for 2Pac but applied to everyone and iterating through an artist list to append the individual dataframes to the major all_artists_stats dataframe. Two versions are being created:

### One DIRTY version where we don't filter out the words with an english dictionary and everything is allowed:

In [188]:
# Function to check if a word is alphabetic
def is_alpha(word):
    return all(char.isalpha() or char.isspace() for char in word)

# Iterate through each artist
for artist in artists_songs_metadata:
    # Create a DataFrame for the current artist
    artist_df = pd.DataFrame(columns=[
        'artist',
        'number_of_albums',
        'number_of_songs',
        'number_of_words',
        'number_of_unique_words',
        'most_repeated_word',
        'most_repeated_word_count',
        'top2_word',
        'top2_word_count',
        'top3_word',
        'top3_word_count',
        'top4_word',
        'top4_word_count',
        'top5_word',
        'top5_word_count'
    ])

    # Populate the artist column
    artist_df["artist"] = [artist]

    # Populate number_of_albums and number_of_songs
    unique_albums = songs_metadata_df[songs_metadata_df['artist'] == artist]['album'].unique()
    artist_df["number_of_albums"] = len(unique_albums)

    unique_songs = songs_metadata_df[songs_metadata_df['artist'] == artist]['title'].unique()
    artist_df["number_of_songs"] = len(unique_songs)

    # Populate number_of_words and number_of_unique_words
    file_name = f"{artist}_style.txt"
    file_path = os.path.join("data", file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

            # Removing special characters from content
            for char in exclude_chars:
                content = content.replace(char, '')

            words = content.split()
            # Exclude words containing "Embed" and non-alphabetic characters
            words = [word for word in words if "Embed" not in word and is_alpha(word)]
            artist_df["number_of_words"] = len(words)
            artist_df["number_of_unique_words"] = len(set(words))
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

    # Populate most_repeated_word and most_repeated_word_count
    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

            words = ''.join(char.lower() if char.lower() not in exclude_chars else ' ' for char in content).split()
            # Exclude words containing "Embed" and non-alphabetic characters
            filtered_words = [word for word in words if "Embed" not in word and is_alpha(word) and word not in stop_words]
            word_counts = Counter(filtered_words)
            most_repeated_words = word_counts.most_common(5)

            # Update the columns in artist_df
            artist_df.at[0, 'most_repeated_word'] = most_repeated_words[0][0]
            artist_df.at[0, 'most_repeated_word_count'] = most_repeated_words[0][1]
            for i in range(1, 5):
                col_name_word = f'top{i + 1}_word'
                col_name_count = f'top{i + 1}_word_count'
                artist_df.at[0, col_name_word] = most_repeated_words[i][0]
                artist_df.at[0, col_name_count] = most_repeated_words[i][1]

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

    # Concatenate the artist_df to all_artists_stats
    all_artists_stats = pd.concat([all_artists_stats, artist_df], ignore_index=True)

#Exporting this dataframe:
csv_file_path = "data/all_artists_stats_DIRTY.csv"
all_artists_stats.to_csv(csv_file_path, index=False)
    
# Display the final DataFrame
pd.set_option('display.max_rows', None)
all_artists_stats.head(81)


Unnamed: 0,artist,number_of_albums,number_of_songs,number_of_words,number_of_unique_words,most_repeated_word,most_repeated_word_count,top2_word,top2_word_count,top3_word,top3_word_count,top4_word,top4_word_count,top5_word,top5_word_count
0,2Pac,15,47,163358,9324,nigga,1296,see,882,life,833,know,736,fuck,655
1,21 Savage,6,49,34375,3486,nigga,541,yeah,496,bitch,418,shit,270,gang,232
2,50 Cent,24,132,120554,8607,nigga,1467,shit,881,know,795,yeah,681,bitch,546
3,A Tribe Called Quest,7,42,58196,7581,know,339,yo,325,make,286,yeah,241,gotta,213
4,A_AP Rocky,5,23,11949,3117,nigga,66,shit,65,bitch,64,yeah,56,know,52
5,Aesop Rock,12,81,91215,16620,one,363,back,237,might,204,never,184,little,183
6,Atmosphere,26,152,93724,9641,know,502,might,329,love,324,make,277,time,269
7,Azealia Banks,6,51,25597,4434,ya,203,bitch,199,bitches,147,nigga,142,know,135
8,Baby Keem,6,53,19798,2420,yeah,333,bitch,264,ayy,137,might,134,go,108
9,Big Daddy Kane,6,42,48939,6672,cause,226,man,224,know,213,kane,191,one,187


### One CLEAN version where we don't filter out the words with an english dictionary and everything is allowed:

In [192]:
# Iterate through each artist
for artist in artists_songs_metadata:
    # Create a DataFrame for the current artist
    artist_df = pd.DataFrame(columns=[
        'artist',
        'number_of_albums',
        'number_of_songs',
        'number_of_words',
        'number_of_unique_words',
        'most_repeated_word',
        'most_repeated_word_count',
        'top2_word',
        'top2_word_count',
        'top3_word',
        'top3_word_count',
        'top4_word',
        'top4_word_count',
        'top5_word',
        'top5_word_count'
    ])

    # Populate the artist column
    artist_df["artist"] = [artist]

    # Populate number_of_albums and number_of_songs
    unique_albums = songs_metadata_df[songs_metadata_df['artist'] == artist]['album'].unique()
    artist_df["number_of_albums"] = len(unique_albums)

    unique_songs = songs_metadata_df[songs_metadata_df['artist'] == artist]['title'].unique()
    artist_df["number_of_songs"] = len(unique_songs)

    # Populate number_of_words and number_of_unique_words
    file_name = f"{artist}_style.txt"
    file_path = os.path.join("data", file_name)

    try:
        with open(file_path, 'r', encoding='utf-8') as file:
            content = file.read()

            # Remove special characters and split into words
            translator = str.maketrans('', '', ''.join(exclude_chars))
            words = content.translate(translator).split()

            # Exclude words containing "Embed" and non-alphabetic characters
            words = [word for word in words if "Embed" not in word and is_alpha(word) and is_english_word(word)]
            
            # Set the number_of_words and number_of_unique_words
            artist_df.at[0, "number_of_words"] = len(words)
            artist_df.at[0, "number_of_unique_words"] = len(set(words))
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

    # Populate most_repeated_word and most_repeated_word_count
    try:
        # Convert to lowercase and split into words
        words = content.lower().translate(translator).split()
        # Exclude words containing "Embed" and non-alphabetic characters
        filtered_words = [word for word in words if "Embed" not in word and is_alpha(word) and is_english_word(word) and word not in stop_words]
        word_counts = Counter(filtered_words)
        most_repeated_words = word_counts.most_common(5)

        # Update the columns in artist_df
        artist_df.at[0, 'most_repeated_word'] = most_repeated_words[0][0]
        artist_df.at[0, 'most_repeated_word_count'] = most_repeated_words[0][1]
        for i in range(1, 5):
            col_name_word = f'top{i + 1}_word'
            col_name_count = f'top{i + 1}_word_count'
            artist_df.at[0, col_name_word] = most_repeated_words[i][0]
            artist_df.at[0, col_name_count] = most_repeated_words[i][1]

    except FileNotFoundError:
        print(f"File not found: {file_path}")
    except Exception as e:
        print(f"An error occurred for file {file_path}: {e}")

    # Concatenate the artist_df to all_artists_stats
    all_artists_stats = pd.concat([all_artists_stats, artist_df], ignore_index=True)

#exporting a CLEAN version subject to the english dictionary filter:
#Exporting this dataframe:
csv_file_path = "data/all_artists_stats_CLEAN.csv"
all_artists_stats.to_csv(csv_file_path, index=False)
    
# Display the final DataFrame
pd.set_option('display.max_rows', None)
all_artists_stats.head(81)


Unnamed: 0,artist,number_of_albums,number_of_songs,number_of_words,number_of_unique_words,most_repeated_word,most_repeated_word_count,top2_word,top2_word_count,top3_word,top3_word_count,top4_word,top4_word_count,top5_word,top5_word_count
0,2Pac,15,47,145101,5873,see,880,life,832,know,733,time,578,thug,525
1,21 Savage,6,49,29770,2424,yeah,490,bitch,418,gang,224,money,180,know,179
2,50 Cent,24,132,106228,5265,know,795,yeah,654,bitch,544,man,526,see,498
3,A Tribe Called Quest,7,42,51054,4881,know,339,yo,314,make,286,yeah,238,man,209
4,A_AP Rocky,5,23,10479,2276,bitch,64,yeah,55,know,52,red,51,see,40
5,Aesop Rock,12,81,79858,10547,one,355,back,223,might,204,never,184,little,183
6,Atmosphere,26,152,85595,6453,know,501,might,329,love,325,make,275,take,265
7,Azealia Banks,6,51,22260,3023,ya,203,bitch,199,know,131,want,97,might,83
8,Baby Keem,6,53,17668,1769,yeah,332,bitch,264,might,134,go,107,want,98
9,Big Daddy Kane,6,42,44173,4543,cause,226,man,221,know,213,one,182,come,173


In [195]:
all_artists_stats["number_of_songs"].sum()

6948

In [None]:
#Exporting this dataframe:
csv_file_path = "data/all_artists_stats_DIRTY.csv"
all_artists_stats.to_csv(csv_file_path, index=False)