In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

In [2]:
df = pd.read_csv('../.data/NB_01_charts.csv')
df.head()

Unnamed: 0,date,rank,song,artist,last-week,peak-rank,weeks-on-board
0,2021-11-06,1,Easy On Me,Adele,1.0,1,3
1,2021-11-06,2,Stay,The Kid LAROI & Justin Bieber,2.0,1,16
2,2021-11-06,3,Industry Baby,Lil Nas X & Jack Harlow,3.0,1,14
3,2021-11-06,4,Fancy Like,Walker Hayes,4.0,3,19
4,2021-11-06,5,Bad Habits,Ed Sheeran,5.0,2,18


## Data Overview

| column | additional information |
|--------|------------------------|
| date | year/month/day |
| rank | 1 - 100 of this date | 
| song | Title of song |
| artist | Name of artist |
| last-week | rank from last week |
| peak-rank | highest rank |
| weeks-on-board | weeks in charts |

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 330087 entries, 0 to 330086
Data columns (total 7 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   date            330087 non-null  object 
 1   rank            330087 non-null  int64  
 2   song            330087 non-null  object 
 3   artist          330087 non-null  object 
 4   last-week       297775 non-null  float64
 5   peak-rank       330087 non-null  int64  
 6   weeks-on-board  330087 non-null  int64  
dtypes: float64(1), int64(3), object(3)
memory usage: 17.6+ MB


&rarr; Missing values in column 'last-week'!

In [4]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
rank,330087.0,50.500929,28.866094,1.0,26.0,51.0,76.0,100.0
last-week,297775.0,47.591631,28.05436,1.0,23.0,47.0,72.0,100.0
peak-rank,330087.0,40.970629,29.347481,1.0,13.0,38.0,65.0,100.0
weeks-on-board,330087.0,9.161785,7.618264,1.0,4.0,7.0,13.0,90.0


In [5]:
df['artist'].nunique()

10205

In [6]:
list(df['artist'].values)

['Adele',
 'The Kid LAROI & Justin Bieber',
 'Lil Nas X & Jack Harlow',
 'Walker Hayes',
 'Ed Sheeran',
 'Drake Featuring Future & Young Thug',
 'Ed Sheeran',
 'Olivia Rodrigo',
 'Doja Cat',
 'Dua Lipa',
 'Wizkid Featuring Justin Bieber & Tems',
 'Doja Cat Featuring SZA',
 'Glass Animals',
 'Maneskin',
 'Elton John & Dua Lipa',
 'Doja Cat & The Weeknd',
 'The Weeknd & Ariana Grande',
 'Jason Aldean & Carrie Underwood',
 'Olivia Rodrigo',
 'Coldplay x BTS',
 'Nardo Wick Featuring G Herbo, Lil Durk & 21 Savage',
 'Drake Featuring 21 Savage & Project Pat',
 'THE ANXIETY: WILLOW & Tyler Cole',
 'Lil Nas X',
 'Ryan Hurd With Maren Morris',
 'Drake Featuring Lil Baby',
 'Swedish House Mafia & The Weeknd',
 'Bryson Gray Featuring Tyson James & Chandler Crump',
 'Lil Nas X',
 'Farruko',
 'CKay',
 'The Weeknd',
 'Billie Eilish',
 'Moneybagg Yo',
 'NEIKED X Mae Muller X Polo G',
 'Jordan Davis Featuring Luke Bryan',
 'Justin Bieber',
 'Loza Alexander',
 'Luke Combs',
 'Tai Verdes',
 'Chris Stapl

In [7]:
# Save names of artists in a csv file
with open('artist_name.csv', 'w') as f:
    for artist in df['artist']:
        f.write(artist)
        f.write('\n')

In [8]:
list(df['date'].values)

['2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-06',
 '2021-11-

In [9]:
# Pandas.to_datetime() syntax
pd.to_datetime(df['date'], errors='raise', dayfirst=False, yearfirst=False, 
    utc=None, format=None, exact=True, unit=None, 
    infer_datetime_format=False, origin='unix', cache=True)

  pd.to_datetime(df['date'], errors='raise', dayfirst=False, yearfirst=False,


0        2021-11-06
1        2021-11-06
2        2021-11-06
3        2021-11-06
4        2021-11-06
            ...    
330082   1958-08-04
330083   1958-08-04
330084   1958-08-04
330085   1958-08-04
330086   1958-08-04
Name: date, Length: 330087, dtype: datetime64[ns]

In [10]:
# Use Datetime.strftime() Method to extract year
df['year'] = df['date'].dt.strftime('%Y')
print(df)

AttributeError: Can only use .dt accessor with datetimelike values