## Our Voyager Data Clean Up

In [2]:
# import libraries

from IPython.display import SVG
import altair as alt
import glob as glob
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go


# setup plotting for quarto
# select Altair's 'default' renderer for chart output
alt.renderers.enable('default')
import plotly.io as pio
# set Plotly's default renderer; the "+" joins two renderer names
# ('plotly_mimetype' and 'notebook_connected') into one setting
pio.renderers.default = "plotly_mimetype+notebook_connected"

# suppress warnings
# NOTE: 'ignore' with no category silences every warning for the rest of the
# session, including deprecation notices from pandas and the plotting libraries
import warnings
warnings.filterwarnings('ignore')

## Load CSV Sheets

In [3]:
# --- data sources (kept as variables so they can be reused or swapped) ---
# messy class metadata: the file that is cleaned up in this lab
our_messy_metadata_csv = 'https://docs.google.com/spreadsheets/d/e/2PACX-1vQi5q2j94W-HPrVsgtK0rsncYEu3ElLOk2p4MlEDW8vbBUXOEOKNNuN63Mohae062tmARINbpklDe-w/pub?gid=190053203&single=true&output=csv'
# other data to clean and tidy: beatles spotify data
beatles_spotify_csv = 'https://raw.githubusercontent.com/RichardFreedman/Encoding_Music/refs/heads/main/02_Lab_Data/Beatles/M_255_Beatles_Spotify_2025.csv'
# beatles billboard and genre data from the Belgrade University project
beatles_billboard_csv = 'https://raw.githubusercontent.com/inteligentni/Class-05-Feature-engineering/master/The%20Beatles%20songs%20dataset%2C%20v1%2C%20no%20NAs.csv'

# --- read each sheet into its own DataFrame ---
our_messy_metadata = pd.read_csv(our_messy_metadata_csv)
beatles_spotify = pd.read_csv(beatles_spotify_csv)
beatles_billboard = pd.read_csv(beatles_billboard_csv)





### Check Columns

In [8]:
# for example, list the column names of our_messy_metadata
our_messy_metadata.columns

Index(['Timestamp', 'Song Title', 'Your Name', 'Your Graduating Class',
       'Team/Group Name', 'Personal Rank', 'Genre Tags',
       'Most Likely Context(s)', 'Least Likely Contexts'],
      dtype='object')

In [9]:
# preview the first rows (head() shows 5 by default) before cleaning
our_messy_metadata.head()

Unnamed: 0,Timestamp,Song Title,Your Name,Your Graduating Class,Team/Group Name,Personal Rank,Genre Tags,Most Likely Context(s),Least Likely Contexts


In [None]:
# let's drop the Timestamp column
# this is not useful for our analysis
# it is just the time when the data was last edited and is not relevant to our analysis
# errors='ignore' keeps this cell safe to re-run: once 'Timestamp' is gone,
# a plain drop() would raise KeyError on the second execution
our_messy_metadata = our_messy_metadata.drop(columns=['Timestamp'], errors='ignore')

## 2. Clean the Easy Parts

- Your Name
- Team/Group
- Rank
- Class Year


In [None]:
# function to clean up names
def name_lower(name):
    """Return ``name`` lower-cased, with underscores normalised.

    An underscore directly before a comma is removed
    ("smith_, john" -> "smith, john"); every remaining underscore is
    replaced by a single space ("john_smith" -> "john smith").
    """
    lowered = name.lower()
    # handle "_," first so it does not become " ," in the next step
    without_dangling = lowered.replace("_,", ",")
    return without_dangling.replace("_", " ")

# reverse names so we have surname first
def reverse_name_without_comma(name):
    """Rewrite "given ... surname" as "surname, given ...".

    The last whitespace-separated word is treated as the surname and all
    preceding words are kept together, in order, as the given names, so
    "mary ann smith" becomes "smith, mary ann" (not "smith, ann, mary").
    Names with fewer than two words are returned with whitespace
    normalised and no comma added.

    NOTE(review): multi-word surnames ("van gogh") cannot be detected here;
    only the final word is moved to the front.
    """
    words = name.split()
    if len(words) < 2:
        # nothing to reverse: empty string or a single word
        return ' '.join(words)
    surname = words[-1]
    given_names = ' '.join(words[:-1])
    return surname + ', ' + given_names


# clean up team names
def team_clean(team):
    """Map a free-text team entry onto one of three canonical team names.

    Matching is deliberately loose (substring tests on the lower-cased
    text) so that abbreviations and misspellings still resolve:

    * contains "cr" or "orange" -> "orange crush"
    * contains "g"              -> "green machine"
    * contains "p"              -> "purple phlurp"

    Anything else, including non-string values such as NaN from an empty
    cell, is returned unchanged.
    """
    if not isinstance(team, str):
        # e.g. NaN for a blank spreadsheet cell: nothing to normalise
        return team
    lowered = team.lower()
    # "orange" itself contains a "g", so orange crush must be tested BEFORE
    # the loose "g" check or every orange entry is relabelled green machine
    if "cr" in lowered or "orange" in lowered:
        return "orange crush"
    if "g" in lowered:
        return "green machine"
    if "p" in lowered:
        return "purple phlurp"
    return team

# clean up personal rankings
def rank_clean(rank):
    """Reduce a free-text ranking to an integer 1-6, or 'unranked'.

    The value is converted to text first, so integers, floats and NaN
    (which pandas produces for numeric or empty cells, and which this
    function itself produces on a first pass) no longer raise TypeError.
    Digits are tested in ascending order and the first one found anywhere
    in the text wins, e.g. "1st" -> 1, "#6" -> 6, "21" -> 1.

    Returns the int rank, or the string 'unranked' when no digit 1-6 occurs.
    """
    text = str(rank)
    for digit in "123456":
        if digit in text:
            return int(digit)
    return 'unranked'

# clean up class years (which start off as strings, but need to be integers)
def year_clean(year):
    """Map a free-text class year onto 2025, 2026, 2027 or 2028.

    The value is converted to text before matching, so integer or NaN
    inputs (a numeric column, or a second run over already-cleaned data)
    no longer raise TypeError. The digits "5", "6", "7", "8" are tested in
    that order and the first one present decides the year, e.g.
    "'26" -> 2026, "Class of 2027" -> 2027.

    Returns the int year, or the original value unchanged when none of the
    four digits occurs.
    """
    text = str(year)
    digit_to_year = (("5", 2025), ("6", 2026), ("7", 2027), ("8", 2028))
    for digit, class_year in digit_to_year:
        if digit in text:
            return class_year
    return year



### Apply the Cleaning Functions

In [None]:

# names: lower-case / tidy underscores first, then put the surname first;
# names that already contain a comma are left exactly as they are
our_messy_metadata['Your Name'] = (
    our_messy_metadata['Your Name']
    .apply(name_lower)
    .apply(lambda full_name: full_name if ',' in full_name else reverse_name_without_comma(full_name))
)

# remaining "easy" columns: each one is passed through its own cleaning function
for column_name, cleaner in [
    ('Team/Group Name', team_clean),            # canonical team names
    ('Personal Rank', rank_clean),              # integer rank or 'unranked'
    ('Your Graduating Class', year_clean),      # integer class year
]:
    our_messy_metadata[column_name] = our_messy_metadata[column_name].apply(cleaner)

## 3. Clean Genre and Context Columns


In [None]:
# depending on how our teams decided to separate their terms, we may need to clean up the terms
def regular_separator(terms):
    """Return ``terms`` with every "?" and ";" replaced by ", ".

    This turns alternative separators into the comma form so the string
    can afterwards be split on ",".
    """
    for alternative in ("?", ";"):
        terms = terms.replace(alternative, ", ")
    return terms

In [None]:
# Tidy the three free-text list columns in exactly the same way:
#   1. lower case everything
#   2. regularize the separator (see regular_separator)
#   3. split the long string into a list, on ","
#   4. strip the whitespace around each term and drop empty terms
# Step 4 matters: splitting "pop, rock" on "," gives ["pop", " rock"], and the
# leading space would make a later test such as == 'rock' silently miss the tag.
for list_column in ['Genre Tags', 'Most Likely Context(s)', 'Least Likely Contexts']:
    our_messy_metadata[list_column] = (
        our_messy_metadata[list_column]
        .str.lower()
        .apply(regular_separator)
        .str.split(',')
        .apply(
            lambda terms: [term.strip() for term in terms if term.strip()]
            if isinstance(terms, list)
            else terms  # leave anything that is not a list untouched
        )
    )


### 4. Explode the Data

Some of our columns have lists in them, which we need to explode into multiple rows, in keeping with Wickham's tidy data principles.

In [None]:
# explode on Genre Tags: one row per (entry, genre tag) instead of one list per cell,
# then replace the repeated row labels with a fresh 0..n-1 index
our_messy_metadata_exploded = (
    our_messy_metadata
    .explode('Genre Tags')
    .reset_index(drop=True)
)
our_messy_metadata_exploded


### 5. Filter for a Given Genre

Now that we have exploded the data, we can filter for a given genre, such as rock. Before exploding, a simple equality filter would not have worked, because each cell held a whole list of genres rather than a single value.

In [11]:
# filter for rock songs
# compare on the whitespace-stripped tag: tags produced by splitting
# "pop, rock" on "," can carry a leading space (" rock"), and an exact
# == 'rock' test would silently leave those rows out
is_rock = our_messy_metadata_exploded['Genre Tags'].str.strip() == 'rock'
rock_songs = our_messy_metadata_exploded[is_rock].reset_index(drop=True)

rock_songs

Unnamed: 0,Timestamp,Song Title,Your Name,Your Graduating Class,Team/Group Name,Personal Rank,Genre Tags,Most Likely Context(s),Least Likely Contexts
