# Import Main Libraries

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Import Data

In [None]:
# location of the raw lyrics data set, relative to the notebook
lyrics_csv = "lyrics.csv"
# read the CSV file into a pandas DataFrame
df = pd.read_csv(lyrics_csv)

In [None]:
# summarize columns, dtypes and non-null counts
df.info()
# report the dimensions of the DataFrame
n_rows, n_cols = df.shape
print("\n# of rows: {}".format(n_rows))
print("# of columns: {}".format(n_cols))
# preview the first few records
df.head()

# Clean Data
### Clean Data in General
There are missing data in the lyrics section with only 266557 non-null object compared to 362237 non-null objects for others.
We delete the index column as the order of the songs are not important.

In [None]:
# drop every row that has a missing value in any column
df = df.dropna()
# drop column 'index'; the axis is named via `columns=` because passing it
# positionally (df.drop('index', 1)) was deprecated in pandas 1.3 and raises
# a TypeError from pandas 2.0 onwards
df = df.drop(columns='index')

In [None]:
# summarize columns, dtypes and non-null counts after the drop
df.info()
# report the dimensions of the cleaned DataFrame
row_total, column_total = df.shape
print("\n# of rows: {}".format(row_total))
print("# of columns: {}".format(column_total))
# preview the first few records
df.head()

### Clean data by column 'songs'

In [None]:
# collect the distinct song titles
songs = df['song'].unique()
# order them in place (ascending)
songs.sort()
# report how many distinct titles there are
song_total = len(songs)
print("# of unique songs: {}\n".format(song_total))
# show the sorted titles
print(songs)

### Clean data by column 'years'

In [None]:
# collect the distinct release years
years = df['year'].unique()
# order them in place (ascending)
years.sort()
# report how many distinct years there are
year_total = len(years)
print("# of unique years: {}\n".format(year_total))
# show the sorted years
print(years)

As we can see from above, there are year 67, 112, 702.
We delete these years as they are not after 1900s.

In [None]:
# keep only the rows whose year is later than 1900
after_1900 = df['year'].gt(1900)
df = df[after_1900]
# collect the distinct years that remain
years = df['year'].unique()
# order them in place (ascending)
years.sort()
# report how many distinct years are left
year_total = len(years)
print("# of unique years: {}\n".format(year_total))
# show the sorted years
print(years)

### Clean data by column 'artist'

In [None]:
# collect the distinct artist names
artists = df['artist'].unique()
# order them in place (ascending)
artists.sort()
# report how many distinct artists there are
artist_total = len(artists)
print("# of unique artists: {}\n".format(artist_total))
# show the sorted artist names
print(artists)

### Clean data by column 'genre'

In [None]:
# collect the distinct genre labels
genres = df['genre'].unique()
# order them in place (ascending)
genres.sort()
# report how many distinct genres there are
genre_total = len(genres)
print("# of unique genre: {}\n".format(genre_total))
# show the sorted genre labels
print(genres)

As we can see above, we have genre 'Not Available' and 'Other'.
To simplify, we delete them out of the data set.

In [None]:
# keep only the rows whose genre is a real label,
# i.e. neither 'Not Available' nor 'Other'
placeholder_genre = df['genre'].isin(['Not Available', 'Other'])
df = df[~placeholder_genre]
# collect the distinct genres that remain
genres = df['genre'].unique()
# order them in place (ascending)
genres.sort()
# report how many distinct genres are left
genre_total = len(genres)
print("# of unique genres: {}\n".format(genre_total))
# show the sorted genre labels
print(genres)

### Clean data by column 'lyrics'

In [None]:
# preview the first five raw lyrics entries
df['lyrics'].head(n=5)

In [None]:
# substitute a single space for every newline character;
# the regex replacement is applied to the whole DataFrame
df = df.replace(to_replace='\n', value=' ', regex=True)

In [None]:
# preview the first five lyrics entries after the newline replacement
df['lyrics'].head(n=5)

In [None]:
# split each song's lyrics on whitespace ...
lyric_tokens = df['lyrics'].str.split()
# ... and store the number of resulting tokens as the song's word count
df['word_count'] = lyric_tokens.str.len()
df.head()

In [None]:
# descriptive statistics of the word counts, one row per genre
df.groupby('genre')['word_count'].describe()

It's odd that there are songs with 1 word from the min section of every genre.
Let's check what they are.

In [None]:
# select the songs whose lyrics consist of exactly one word
one_word_songs = df.loc[df['word_count'] == 1]
# report how many there are, then preview the first few
print("# of songs with 1 word: {}".format(len(one_word_songs)))
one_word_songs.head()

The 1-worded songs are mostly instrumental.
Let's delete them from the data set as they essentially don't have lyrics.

In [None]:
# keep every song whose word count differs from 1
not_one_word = df['word_count'].ne(1)
df = df[not_one_word]
# recompute the per-genre word count statistics
df.groupby('genre')['word_count'].describe()

There are still some songs with 2, 3 words.
Let's check them if they have lyrics.

In [None]:
# select the songs whose lyrics consist of exactly two words
two_word_songs = df.loc[df['word_count'] == 2]
# report how many there are (the label now matches the 2-word filter;
# it previously said "1 word"), then preview the first few
print("# of songs with 2 words: {}".format(len(two_word_songs)))
two_word_songs.head()

In [None]:
# select the songs whose lyrics consist of exactly three words
three_word_songs = df.loc[df['word_count'] == 3]
# report how many there are, then preview the first few
print("# of songs with 3 words: {}".format(len(three_word_songs)))
three_word_songs.head()

In [None]:
# count the songs with fewer than 50 words (the label now matches the
# `< 50` filter; it previously said "3 words")
short_song_count = len(df.loc[df['word_count'] < 50])
print("# of songs with fewer than 50 words: {}".format(short_song_count))

In [None]:
# proportion of songs with fewer than 50 words relative to the entire data set
short_song_count = len(df[df['word_count'] < 50])
song_count = len(df)
print("# of songs < 50 words: {}".format(short_song_count))
print("# of total songs: {}".format(song_count))
print("% of songs < 50 words: {}%".format(short_song_count/song_count*100))

Since we are only eliminating about 3% of the entire data set by deleting songs less than 50 words, we will do this to simplify the data set.

In [None]:
# delete songs with fewer than 50 words; `>=` keeps the songs with exactly
# 50 words, so the rows removed here are exactly the `< 50` rows that were
# counted when deciding on this cut-off
df = df[df['word_count'] >= 50]
# review the per-genre word count statistics
df['word_count'].groupby(df['genre']).describe()

Let's now check the overall distribution of the songs.

In [None]:
# violin plot of the word count distribution over the whole data set
sns.violinplot(data=df, x="word_count")
plt.show()

The word count is extremely skewed to the right.
Let's see what the lyrics are for some high word counts.

In [None]:
# select the songs with more than 8000 words
songs_over_8000 = df.loc[df['word_count'] > 8000]
# report how many there are (the label now matches the `> 8000` filter;
# it previously said "1 word"), then preview the first few
print("# of songs with more than 8000 words: {}".format(len(songs_over_8000)))
songs_over_8000.head()

In [None]:
# select the songs with more than 5000 words
songs_over_5000 = df.loc[df['word_count'] > 5000]
# report how many there are (the label now matches the `> 5000` filter;
# it previously said "1 word"), then preview the first few
print("# of songs with more than 5000 words: {}".format(len(songs_over_5000)))
songs_over_5000.head()

In [None]:
# select the songs with more than 1000 words
songs_over_1000 = df.loc[df['word_count'] > 1000]
# report how many there are (the label now matches the `> 1000` filter;
# it previously said "1 word"), then preview the first few
print("# of songs with more than 1000 words: {}".format(len(songs_over_1000)))
songs_over_1000.head()

In [None]:
# proportion of songs with more than 1000 words relative to the entire data set
# (the labels now say "> 1000" to match the filter; they previously said "< 1000")
long_song_count = len(df[df['word_count'] > 1000])
song_count = len(df)
print("# of songs > 1000 words: {}".format(long_song_count))
print("# of total songs: {}".format(song_count))
print("% of songs > 1000 words: {}%".format(long_song_count/song_count*100))

Since there are only 0.25% of songs with 1000 words, let's eliminate them to simplify data set.

In [None]:
# delete songs with more than 1000 words; `<=` keeps the songs with exactly
# 1000 words, so the rows removed here are exactly the `> 1000` rows that
# were counted when deciding on this cut-off
df = df[df['word_count'] <= 1000]
# review the per-genre word count statistics
df['word_count'].groupby(df['genre']).describe()

In [None]:
# violin plot of the word count distribution after removing the outliers
sns.violinplot(data=df, x="word_count")
plt.show()

In [None]:
# widen the default figure size so that all genres fit side by side
plt.rcParams["figure.figsize"] = (15, 6)
# one box per genre showing the spread of the word counts
sns.boxplot(data=df, x="genre", y="word_count")
plt.show()

In [None]:
# summarize columns, dtypes and non-null counts of the cleaned data
df.info()
# report the dimensions of the cleaned DataFrame
n_rows, n_cols = df.shape
print("# of rows: {}\n".format(n_rows))
print("# of columns: {}\n".format(n_cols))
# preview the first few records
df.head()

In [None]:
# number of non-null entries in every column, per genre
songs_by_genre = df.groupby(by=['genre'])
genre = songs_by_genre.count()
genre

In [None]:
# number of non-null entries in every column, per artist
songs_by_artist = df.groupby(by=['artist'])
artist = songs_by_artist.count()
artist

In [None]:
from sklearn.model_selection import train_test_split
# fixed seed so that the shuffle, and therefore the split, is identical on
# every run of the notebook
SPLIT_SEED = 42
# shuffle and split dataset into training dataset (80%) and testing dataset (20%)
train, test = train_test_split(df, test_size=0.2, random_state=SPLIT_SEED)

In [None]:
# report the dimensions of the training set
train_rows, train_cols = train.shape
print("# of rows: {}\n".format(train_rows))
print("# of columns: {}\n".format(train_cols))
# preview the first few training records
train.head()

In [None]:
# report the dimensions of the testing set
test_rows, test_cols = test.shape
print("# of rows: {}\n".format(test_rows))
print("# of columns: {}\n".format(test_cols))
# preview the first few testing records
test.head()