# Initial analysis of the data

Dataset: [Spotify and Youtube](https://www.kaggle.com/datasets/salvatorerastelli/spotify-and-youtube?resource=download)

In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Load the source CSV into a DataFrame
df = pd.read_csv(
    filepath_or_buffer='source_dataset/Spotify_Youtube.csv',
)

In [3]:
# Report the frame's dimensions as (row count, column count)
print((len(df.index), len(df.columns)))

(20718, 28)


In [4]:
# Rename the 'Unnamed: 0' column to 'id' and use it as the DataFrame index
df = df.rename(columns={'Unnamed: 0': 'id'}).set_index('id')

In [5]:
# List every column label, preceded by a heading line
print("\nColumn names:", df.columns, sep="\n")


Column names:
Index(['Artist', 'Url_spotify', 'Track', 'Album', 'Album_type', 'Uri',
       'Danceability', 'Energy', 'Key', 'Loudness', 'Speechiness',
       'Acousticness', 'Instrumentalness', 'Liveness', 'Valence', 'Tempo',
       'Duration_ms', 'Url_youtube', 'Title', 'Channel', 'Views', 'Likes',
       'Comments', 'Description', 'Licensed', 'official_video', 'Stream'],
      dtype='object')


In [None]:
# Preview the first 5 rows, preceded by a heading line
print("\nFirst 5 rows:", df.head(n=5), sep="\n")

In [6]:
# Show the dtype of each column, preceded by a heading line
print("\nData types:", df.dtypes, sep="\n")


Data types:
Artist               object
Url_spotify          object
Track                object
Album                object
Album_type           object
Uri                  object
Danceability        float64
Energy              float64
Key                 float64
Loudness            float64
Speechiness         float64
Acousticness        float64
Instrumentalness    float64
Liveness            float64
Valence             float64
Tempo               float64
Duration_ms         float64
Url_youtube          object
Title                object
Channel              object
Views               float64
Likes               float64
Comments            float64
Description          object
Licensed             object
official_video       object
Stream              float64
dtype: object


In [7]:
# Count the distinct values in each column (computed before any rows are dropped)
print("\nNumber of unique values:", df.nunique(), sep="\n")

# Discard every row where 'Likes' or 'Views' is missing.
# NOTE(review): the popularity cell further down repeats this same drop.
df = df.dropna(
    axis='index',
    how='any',
    subset=['Likes', 'Views'],
)


Number of unique values:
Artist               2079
Url_spotify          2079
Track               17841
Album               11937
Album_type              3
Uri                 18862
Danceability          898
Energy               1268
Key                    12
Loudness             9417
Speechiness          1303
Acousticness         3138
Instrumentalness     4012
Liveness             1536
Valence              1293
Tempo               15024
Duration_ms         14690
Url_youtube         18154
Title               18146
Channel              6714
Views               19245
Likes               17939
Comments            10485
Description         17395
Licensed                2
official_video          2
Stream              18461
dtype: int64


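## Create popularity categories

Each row receives a popularity score: a weighted sum of its min-max normalized likes (weight 0.7) and views (weight 0.3). Rows are then split into four classes at the 30th, 70th and 90th percentiles of that score.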

In [None]:

def min_max_normalize(values: pd.Series) -> pd.Series:
    """Linearly rescale ``values`` so its minimum maps to 0 and its maximum to 1.

    A column whose values are all equal has zero spread; it is mapped to all
    zeros instead of the NaN values a 0 / 0 division would produce.

    Parameters:
        values: numeric Series to rescale.

    Returns:
        A Series with the same index as ``values``.
    """
    lowest = values.min()
    spread = values.max() - lowest
    if spread == 0:
        return pd.Series(0.0, index=values.index, name=values.name)
    return (values - lowest) / spread


# Drop rows with missing values in 'Likes' or 'Views': the score is built from both columns
df = df.dropna(subset=['Likes', 'Views'])

# Put likes and views on the same 0-1 scale before combining them
normalized_likes = min_max_normalize(df['Likes'])
normalized_views = min_max_normalize(df['Views'])

# Assign weights to normalized likes and views
likes_weight = 0.7
views_weight = 0.3

# Popularity score is the weighted sum of the two normalized signals
popularity_score = (normalized_likes * likes_weight) + (normalized_views * views_weight)

# A NaN score fails every threshold comparison below and would silently be
# labelled with the default (highest) class, so fail loudly instead
assert popularity_score.notna().all(), "popularity score contains NaN values"

# Percentile cut points: scores at or below each one fall into the matching class
popularity_percentiles = [30, 70, 90]
popularity_thresholds = np.percentile(popularity_score, popularity_percentiles)

# Class labels ordered from lowest to highest; the last one is the catch-all
# for scores above the final threshold
popularity_labels = ['Low Popularity', 'Moderate Popularity', 'High Popularity', 'Very High Popularity']

# np.select picks the first matching condition, so each score gets the lowest
# class whose threshold it does not exceed
popularity = np.select(
    [popularity_score <= threshold for threshold in popularity_thresholds],
    popularity_labels[:-1],
    default=popularity_labels[-1],
)

# Attach the new columns on a fresh frame instead of mutating the existing one in place
df = df.assign(**{'Popularity Score': popularity_score, 'Popularity': popularity})

In [None]:
# Print each column's dtype again under the same heading
print("\nData types:", df.dtypes, sep="\n")

In [None]:
# Count how many rows carry each 'Popularity' label
print(df.loc[:, 'Popularity'].value_counts())

In [None]:
# Save the DataFrame (index included) to a CSV file on disk.
# to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
output_path = Path('datasets/Spotify_YoutubeClean.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(output_path)