# Data Processing of Spotify information after scraping it from the Website

# Load the CSV file

In [1]:
import numpy as np
import pandas as pd

# Read the streaming records into a DataFrame, then preview the first rows.
source_csv = 'SpotifyStreamingRecords.csv'
spotify = pd.read_csv(source_csv)

spotify.head()

Unnamed: 0,Rank,Song,Artist(s),Streams(billions),Release date
0,1,"""Blinding Lights""",The Weeknd,3.896,29 November 2019
1,2,"""Shape of You""",Ed Sheeran,3.687,6 January 2017
2,3,"""Someone You Loved""",Lewis Capaldi,3.061,8 November 2018
3,4,"""Sunflower""",Post Malone and Swae Lee,3.007,18 October 2018
4,5,"""Dance Monkey""",Tones and I,2.937,10 May 2019


# Data Processing

### Checking the data type of each column

In [2]:
# Report the (rows, columns) shape first, then the dtype pandas inferred
# for every column.
frame_shape = spotify.shape
print(f"Shape of the dataset is {frame_shape} \n")
print("Data types are below where 'object' indicates a string type: ")
print(spotify.dtypes)

Shape of the dataset is (100, 5) 

Data types are below where 'object' indicates a string type: 
Rank                   int64
Song                  object
Artist(s)             object
Streams(billions)    float64
Release date          object
dtype: object


### Checking for missing values in each column

In [3]:
# Count the null entries per column; all zeros means nothing is missing.
missing_per_column = spotify.isnull().sum()
print("\nNumber of missing values for each feature:")
print(missing_per_column)


Number of missing values for each feature:
Rank                 0
Song                 0
Artist(s)            0
Streams(billions)    0
Release date         0
dtype: int64


### Removing the quotation marks from the Song title

In [4]:
# Remove only the literal double-quote characters found in the titles.
# A plain (non-regex) replacement is used instead of a catch-all
# "non-alphanumeric" pattern so that apostrophes, parentheses, hyphens and
# accented letters that belong to a song's real name are left intact.
# NOTE(review): assumes the quotes are straight ASCII double quotes, as in the
# preview rows — confirm there are no curly quotes in the full file.
spotify['Song'] = spotify['Song'].str.replace('"', '', regex=False)
spotify.head()

Unnamed: 0,Rank,Song,Artist(s),Streams(billions),Release date
0,1,Blinding Lights,The Weeknd,3.896,29 November 2019
1,2,Shape of You,Ed Sheeran,3.687,6 January 2017
2,3,Someone You Loved,Lewis Capaldi,3.061,8 November 2018
3,4,Sunflower,Post Malone and Swae Lee,3.007,18 October 2018
4,5,Dance Monkey,Tones and I,2.937,10 May 2019


### Convert the Streams(billions) column to float and round it to two decimal places

In [5]:
# Cast the stream counts to float, then keep two decimal places.
streams_col = 'Streams(billions)'
streams_as_float = spotify[streams_col].astype(float)
spotify[streams_col] = streams_as_float.round(2)
spotify.head()

Unnamed: 0,Rank,Song,Artist(s),Streams(billions),Release date
0,1,Blinding Lights,The Weeknd,3.9,29 November 2019
1,2,Shape of You,Ed Sheeran,3.69,6 January 2017
2,3,Someone You Loved,Lewis Capaldi,3.06,8 November 2018
3,4,Sunflower,Post Malone and Swae Lee,3.01,18 October 2018
4,5,Dance Monkey,Tones and I,2.94,10 May 2019


### Rename the columns to remove spaces and parentheses

In [6]:
# Map each original header to a name without spaces or parentheses,
# then apply the mapping to the frame in place.
renamed_headers = {
    "Artist(s)": "Artist_s",
    "Streams(billions)": "Streams_billions",
    "Release date": "Release_date",
}
spotify.rename(columns=renamed_headers, inplace=True)
spotify.head()

Unnamed: 0,Rank,Song,Artist_s,Streams_billions,Release_date
0,1,Blinding Lights,The Weeknd,3.9,29 November 2019
1,2,Shape of You,Ed Sheeran,3.69,6 January 2017
2,3,Someone You Loved,Lewis Capaldi,3.06,8 November 2018
3,4,Sunflower,Post Malone and Swae Lee,3.01,18 October 2018
4,5,Dance Monkey,Tones and I,2.94,10 May 2019


In [7]:
import pandas as pd

# Write the processed frame to a new CSV; index=False leaves the row index
# out of the file.
cleaned_csv_path = 'SpotifyStreamingRecords_1.csv'
spotify.to_csv(cleaned_csv_path, index=False)