In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
import matplotlib.pyplot as plt

# Read the Spotify tracks CSV into the notebook-wide DataFrame `df`.
# The path points at the Kaggle input mount for the "spotify-1million-tracks" dataset.
spotify_csv_path = '/kaggle/input/spotify-1million-tracks/spotify_data.csv'
df = pd.read_csv(spotify_csv_path)


In [None]:
# Preview the first rows of the loaded DataFrame.
df.head()

In [None]:
# Show the DataFrame dimensions as a (rows, columns) tuple.
df.shape

In [None]:
# Count the distinct values in the 'artist_name' column (missing values are not counted).
artist_names = df['artist_name']
total_artists = artist_names.nunique()

print(f'Total number of unique artists: {total_artists}')

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved into the CSV — TODO confirm).
# errors='ignore' keeps this cell re-runnable: without it, running the cell a second
# time raises KeyError because the column has already been dropped from `df`.
column_to_drop = 'Unnamed: 0'
df = df.drop(columns=column_to_drop, errors='ignore')

In [None]:
# Preview the first rows again, after the 'Unnamed: 0' column was dropped.
df.head()

In [None]:
# Parse 'year' as a %Y date and store back just the integer year component.
year_as_dates = pd.to_datetime(df['year'], format='%Y')
df['year'] = year_as_dates.dt.year


def genre_analysis_and_trends():
    """Draw a stacked area chart of the number of songs per genre for each year.

    Reads the notebook-level ``df`` (columns 'year' and 'genre') and shows the
    figure; nothing is returned.
    """
    # Rows are years, columns are genres, values are song counts (0 where absent).
    songs_by_year_and_genre = (
        df.groupby(['year', 'genre'])
        .size()
        .unstack(fill_value=0)
    )

    ax = songs_by_year_and_genre.plot.area(stacked=True, figsize=(12, 6), colormap='Set3')

    ax.set_title('Genre Distribution Over the Years')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Songs')
    # Anchor the legend outside the axes so it does not cover the chart.
    ax.legend(title='Genre', bbox_to_anchor=(1.05, 1), loc='upper left')

    plt.tight_layout()
    plt.show()


# Render the chart.
genre_analysis_and_trends()


In [None]:
import seaborn as sns

# Audio features to compare, and the column holding popularity.
features = ['danceability', 'energy', 'valence']
popularity_column = 'popularity'


def visualize_popularity_vs_features():
    """Show a seaborn pair plot of the selected features together with popularity.

    Reads the notebook-level ``df``, ``features`` and ``popularity_column``.
    Off-diagonal panels are scatter plots; diagonal panels are KDE curves.
    """
    pairplot_columns = [*features, popularity_column]
    sns.pairplot(
        df,
        vars=pairplot_columns,
        kind='scatter',
        diag_kind='kde',
        height=3,
    )

    # y > 1 places the overall title just above the grid of panels.
    plt.suptitle('Popularity vs. Song Features', y=1.02)
    plt.tight_layout()
    plt.show()


# Render the pair plot.
visualize_popularity_vs_features()


In [None]:
def time_signature_trends():
    """Draw one line per time signature showing its song count for each year.

    Reads the notebook-level ``df`` (columns 'year' and 'time_signature') and
    shows the figure; nothing is returned.
    """
    # Rows are years, columns are time signatures, values are song counts.
    songs_by_year_and_signature = (
        df.groupby(['year', 'time_signature'])
        .size()
        .unstack(fill_value=0)
    )

    ax = songs_by_year_and_signature.plot.line(figsize=(12, 6))

    ax.set_title('Time Signature Trends Over the Years')
    ax.set_xlabel('Year')
    ax.set_ylabel('Number of Songs')
    ax.legend(title='Time Signature', loc='upper right')

    plt.tight_layout()
    plt.show()


# Render the chart.
time_signature_trends()


In [None]:
# Columns plotted against each other.
energy_column = 'energy'
danceability_column = 'danceability'

def visualize_energy_vs_danceability(jitter=0.02, seed=42):
    """Scatter-plot energy against danceability with a small random jitter.

    Reads the notebook-level ``df``, ``energy_column`` and ``danceability_column``.

    Parameters
    ----------
    jitter : float, default 0.02
        Half-width of the uniform noise added to both coordinates to spread
        overlapping points.
    seed : int or None, default 42
        Seed for the jitter generator. A fixed seed makes the figure identical
        on every run; pass ``None`` for fresh randomness each call.
    """
    # Use a local, seeded generator instead of the global NumPy RNG so that
    # re-running the notebook reproduces exactly the same picture.
    rng = np.random.default_rng(seed)
    n_points = len(df)
    jittered_energy = df[energy_column] + rng.uniform(-jitter, jitter, n_points)
    jittered_danceability = df[danceability_column] + rng.uniform(-jitter, jitter, n_points)

    # Create a scatter plot to explore the relationship between energy and danceability
    plt.figure(figsize=(8, 6))
    plt.scatter(jittered_energy, jittered_danceability, alpha=0.5)

    plt.title('Energy vs. Danceability')
    plt.xlabel('Energy')
    plt.ylabel('Danceability')

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
visualize_energy_vs_danceability()


In [None]:
# Columns compared in this cell.
acousticness_column = 'acousticness'
popularity_column = 'popularity'

def visualize_acousticness_vs_popularity(n_bins=10):
    """Show popularity against acousticness as a scatter plot and a binned box plot.

    Reads the notebook-level ``df``, ``acousticness_column`` and
    ``popularity_column``.

    Parameters
    ----------
    n_bins : int, default 10
        Number of equal-width acousticness bins used for the box plot.
    """
    # Scatter plot of the raw values
    plt.figure(figsize=(8, 6))
    plt.scatter(df[acousticness_column], df[popularity_column], alpha=0.5)

    plt.title('Acousticness vs. Popularity')
    plt.xlabel('Acousticness')
    plt.ylabel('Popularity')

    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Box plot. seaborn treats the x variable of a box plot as categorical, so
    # passing the raw continuous column would draw one box per distinct value.
    # Binning first yields one readable box per acousticness range.
    acousticness_bins = pd.cut(df[acousticness_column], bins=n_bins)

    plt.figure(figsize=(8, 6))
    sns.boxplot(x=acousticness_bins, y=df[popularity_column], palette='Set3')

    plt.title('Acousticness vs. Popularity')
    plt.xlabel('Acousticness (binned)')
    plt.ylabel('Popularity')
    plt.xticks(rotation=45)  # interval labels are long; rotate for readability

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
visualize_acousticness_vs_popularity()


In [None]:
import seaborn as sns
from scipy.stats import norm

# Column holding the loudness values.
loudness_column = 'loudness'

def visualize_loudness_distribution():
    """Plot the loudness histogram with a KDE and a fitted normal curve.

    Reads the notebook-level ``df`` and ``loudness_column``. The histogram is
    drawn on a density scale so that the fitted normal PDF is directly
    comparable with the bars.
    """
    # NaN values would make norm.fit return NaN parameters, so drop them first.
    loudness = df[loudness_column].dropna()

    plt.figure(figsize=(8, 6))
    # stat='density' normalises the bars to unit area. With the default count
    # scale, the PDF overlay below (values well under 1) would look like a
    # flat line at the bottom of the chart.
    sns.histplot(loudness, kde=True, stat='density', color='skyblue', bins=30, label='Loudness Distribution')

    # Fit a normal distribution to the data
    mu, std = norm.fit(loudness)
    xmin, xmax = plt.xlim()
    x = np.linspace(xmin, xmax, 100)
    p = norm.pdf(x, mu, std)

    # Overlay the normal distribution curve on the histogram.
    # Raw f-string: '\m' and '\s' are not valid escapes in a normal string.
    plt.plot(x, p, 'k', linewidth=2, label=rf'Fit results: $\mu$ = {mu:.2f}, $\sigma$ = {std:.2f}')

    plt.title('Loudness Distribution')
    plt.xlabel('Loudness')
    plt.ylabel('Density')
    plt.legend()

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
visualize_loudness_distribution()

In [None]:
# Column holding the musical key of each track.
key_column = 'key'

def key_analysis():
    """Draw a bar chart of the number of songs in each musical key.

    Reads the notebook-level ``df`` and ``key_column``. Key values 0-11 are
    labelled with pitch-class names (0 = 'C' ... 11 = 'B'); any other value
    is shown as 'Unknown (<value>)'.
    """
    # Count the number of songs in each key, ordered by key value
    key_counts = df[key_column].value_counts().sort_index()

    # Build the tick labels from the key values actually present. Applying a
    # fixed list of 12 names by position would mislabel every bar if a key is
    # missing or an out-of-range value (e.g. a -1 sentinel, if the dataset
    # uses one) is present.
    pitch_classes = ['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']
    tick_labels = [
        pitch_classes[int(key)] if 0 <= key < len(pitch_classes) else f'Unknown ({key})'
        for key in key_counts.index
    ]

    # Create a bar chart to visualize the prevalence of each key
    plt.figure(figsize=(10, 6))
    sns.barplot(x=key_counts.index, y=key_counts.values, palette='Set3')

    plt.title('Key Analysis')
    plt.xlabel('Key')
    plt.ylabel('Number of Songs')
    # Bars sit at positions 0..n-1 in the same sorted order as key_counts.
    plt.xticks(ticks=range(len(tick_labels)), labels=tick_labels)

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
key_analysis()


In [None]:
# Column holding the mode of each track.
mode_column = 'mode'

def mode_analysis_pie_chart():
    """Draw a pie chart of the share of songs in each mode.

    Reads the notebook-level ``df`` and ``mode_column``. Mode 1 is labelled
    'Major' and mode 0 'Minor' (the usual Spotify audio-features convention —
    TODO confirm for this dataset); any other value is labelled with its raw
    value.
    """
    # value_counts() orders by frequency, not by mode value, so the labels
    # must be derived from the index. A fixed ['Major', 'Minor'] list would
    # be swapped whenever minor songs are the majority.
    mode_counts = df[mode_column].value_counts()

    mode_names = {1: 'Major', 0: 'Minor'}
    mode_colors = {'Major': 'skyblue', 'Minor': 'lightcoral'}
    labels = [mode_names.get(value, str(value)) for value in mode_counts.index]
    colors = [mode_colors.get(label, 'lightgray') for label in labels]

    # Create a pie chart to visualize the distribution of major and minor songs
    plt.figure(figsize=(6, 6))
    plt.pie(mode_counts, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)

    plt.title('Mode Analysis (Pie Chart)')
    plt.axis('equal')  # Equal aspect ratio ensures that pie is drawn as a circle

    plt.tight_layout()
    plt.show()

# Example usage:
mode_analysis_pie_chart()


In [None]:
# Column holding the mode of each track.
mode_column = 'mode'

def mode_analysis_bar_chart():
    """Draw a bar chart of the number of songs in each mode.

    Reads the notebook-level ``df`` and ``mode_column``. Mode 1 is labelled
    'Major' and mode 0 'Minor' (the usual Spotify audio-features convention —
    TODO confirm for this dataset); any other value is labelled with its raw
    value. Colours are tied to the label rather than to bar position, matching
    the colours the pie-chart cell passes for 'Major' and 'Minor'.
    """
    # Count the number of songs per mode value
    mode_counts = df[mode_column].value_counts()

    # Translate raw mode codes to readable names so the axis does not show 0/1,
    # and pick each bar's colour from its name instead of its position.
    mode_names = {1: 'Major', 0: 'Minor'}
    mode_colors = {'Major': 'skyblue', 'Minor': 'lightcoral'}
    labels = [mode_names.get(value, str(value)) for value in mode_counts.index]
    colors = [mode_colors.get(label, 'lightgray') for label in labels]

    # Create a bar chart to visualize the distribution of major and minor songs
    plt.figure(figsize=(6, 6))
    sns.barplot(x=labels, y=mode_counts.values, palette=colors)

    plt.title('Mode Analysis (Bar Chart)')
    plt.xlabel('Mode')
    plt.ylabel('Number of Songs')

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
mode_analysis_bar_chart()


In [None]:
# Columns plotted against each other in this cell.
speechiness_column = 'speechiness'
instrumentalness_column = 'instrumentalness'


def visualize_speechiness_vs_instrumentalness():
    """Scatter-plot speechiness (x) against instrumentalness (y).

    Reads the notebook-level ``df`` and the two column-name variables above,
    then shows the figure; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.scatter(
        df[speechiness_column],
        df[instrumentalness_column],
        alpha=0.5,  # semi-transparent markers so dense regions stay visible
    )

    ax.set_title('Speechiness vs. Instrumentalness')
    ax.set_xlabel('Speechiness')
    ax.set_ylabel('Instrumentalness')
    ax.grid(True)

    plt.tight_layout()
    plt.show()


# Render the scatter plot.
visualize_speechiness_vs_instrumentalness()


In [None]:
# Time axis column and the features whose yearly trend is drawn.
year_column = 'year'
features_columns = ['tempo', 'loudness']


def visualize_temporal_trends():
    """Draw one seaborn line per selected feature against the year, on shared axes.

    Reads the notebook-level ``df``, ``year_column`` and ``features_columns``,
    then shows the figure; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(12, 6))

    # Each feature becomes its own labelled line on the same axes.
    for feature_name in features_columns:
        sns.lineplot(data=df, x=year_column, y=feature_name, label=feature_name, ax=ax)

    ax.set_title('Temporal Trends of Song Features')
    ax.set_xlabel('Year')
    ax.set_ylabel('Value')
    ax.legend(title='Feature')
    ax.grid(True)

    plt.tight_layout()
    plt.show()


# Render the chart.
visualize_temporal_trends()


In [None]:
# Column holding the track duration.
duration_column = 'duration_ms'


def visualize_duration_distribution_histogram():
    """Show a 30-bin histogram of the values in the duration column.

    Reads the notebook-level ``df`` and ``duration_column``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.hist(
        df[duration_column],
        bins=30,
        color='skyblue',
        edgecolor='black',
    )

    ax.set_title('Duration Distribution (Histogram)')
    ax.set_xlabel('Duration (ms)')
    ax.set_ylabel('Frequency')
    ax.grid(True)

    plt.tight_layout()
    plt.show()


# Render the histogram.
visualize_duration_distribution_histogram()


In [None]:
# Column holding the track duration.
duration_column = 'duration_ms'


def visualize_duration_distribution_box_plot():
    """Show a single horizontal box plot of the values in the duration column.

    Reads the notebook-level ``df`` and ``duration_column``; nothing is returned.
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(x=df[duration_column], color='lightcoral', ax=ax)

    ax.set_title('Duration Distribution (Box Plot)')
    ax.set_xlabel('Duration (ms)')
    ax.grid(True)

    plt.tight_layout()
    plt.show()


# Render the box plot.
visualize_duration_distribution_box_plot()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


# Columns used as clustering features: 'liveness' and 'popularity'.
# NOTE: this rebinds the notebook-level `features` list that the pair-plot cell
# also reads, so re-running that cell afterwards will use these two columns.
features = ['liveness', 'popularity']

def perform_cluster_analysis(num_clusters):
    """Cluster the tracks with K-means on the selected features and plot the result.

    Reads the notebook-level ``df`` and ``features``. The features are
    standardised before clustering; the scatter plot uses the original
    (unscaled) values of the first two entries of ``features``.

    Side effect: writes the cluster label of every row into ``df['cluster']``,
    overwriting any existing column of that name.

    Parameters
    ----------
    num_clusters : int
        Number of clusters passed to ``KMeans``.
    """
    # Select the specified features from the dataset
    data = df[features]

    # Standardize the features (important for K-means)
    scaler = StandardScaler()
    data_scaled = scaler.fit_transform(data)

    # Pin n_init explicitly: its default differs between scikit-learn releases
    # (and newer ones warn when it is left unset), which would otherwise make
    # the clustering depend on the installed version.
    kmeans = KMeans(n_clusters=num_clusters, n_init=10, random_state=42)
    df['cluster'] = kmeans.fit_predict(data_scaled)

    # Create scatter plots to visualize the clusters
    plt.figure(figsize=(8, 6))
    sns.scatterplot(data=df, x=features[0], y=features[1], hue='cluster', palette='viridis', s=60)

    plt.title('Cluster Analysis (K-means)')
    plt.xlabel(features[0])
    plt.ylabel(features[1])

    plt.grid(True)
    plt.tight_layout()
    plt.show()

# Example usage:
# Set the desired number of clusters (e.g., 4)
perform_cluster_analysis(4)


In [None]:
pip install pandas plotly dash


In [None]:
# Grouping column and the column averaged per group.
year_column = 'year'
duration_column = 'duration_ms'


def visualize_duration_vs_year():
    """Draw a line chart of the mean duration for each year.

    Reads the notebook-level ``df``, ``year_column`` and ``duration_column``,
    then shows the figure; nothing is returned.
    """
    # One mean duration value per year, indexed by year.
    mean_duration_by_year = df.groupby(year_column)[duration_column].mean()

    fig, ax = plt.subplots(figsize=(10, 6))
    sns.lineplot(
        x=mean_duration_by_year.index,
        y=mean_duration_by_year.values,
        color='skyblue',
        ax=ax,
    )

    ax.set_title('Song Duration vs. Year')
    ax.set_xlabel('Year')
    ax.set_ylabel('Average Duration (ms)')
    ax.grid(True)

    plt.tight_layout()
    plt.show()


# Render the chart.
visualize_duration_vs_year()


In [None]:
pip install pandas matplotlib wordcloud


In [None]:
from wordcloud import WordCloud

# Column holding the genre of each track.
genre_column = 'genre'

def generate_genre_wordcloud():
    """Show a word cloud in which each genre is sized by its number of tracks.

    Reads the notebook-level ``df`` and ``genre_column``; missing genres are
    ignored. Nothing is returned.
    """
    # Count tracks per genre and pass the counts straight to WordCloud.
    # Joining all genre names into one text and calling .generate() would let
    # WordCloud re-tokenise it, splitting hyphenated or multi-word genre names
    # into separate words and inventing two-word collocations.
    genre_frequencies = df[genre_column].dropna().astype(str).value_counts().to_dict()

    # Generate a word cloud of genre names
    wordcloud = WordCloud(width=800, height=400, background_color='white').generate_from_frequencies(genre_frequencies)

    # Create a plot to display the word cloud
    plt.figure(figsize=(10, 6))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')

    plt.title('Genre Word Cloud')
    plt.tight_layout()

    plt.show()

# Example usage:
generate_genre_wordcloud()


In [None]:
pip install pandas folium


In [None]:
# Grouping column and the features whose per-genre distribution is drawn.
# NOTE: this rebinds the notebook-level `features_columns` that the temporal-trends
# cell also reads.
genre_column = 'genre'
features_columns = ['danceability', 'energy']

def create_feature_distribution_by_genre():
    """Draw one box-plot panel per selected feature, grouped by genre.

    Reads the notebook-level ``df``, ``genre_column`` and ``features_columns``.
    Panels are stacked vertically and share the genre axis. Does nothing when
    ``features_columns`` is empty.
    """
    if not features_columns:
        return

    # One axes per feature. Drawing every feature on the same axes would
    # overlay the boxes, leaving only the last feature readable.
    fig, axes = plt.subplots(
        nrows=len(features_columns),
        ncols=1,
        figsize=(12, 6 * len(features_columns)),
        sharex=True,
        squeeze=False,  # always a 2-D array, even for a single feature
    )

    for ax, feature in zip(axes[:, 0], features_columns):
        sns.boxplot(data=df, x=genre_column, y=feature, width=0.6, ax=ax)
        ax.set_title(feature)
        ax.set_xlabel('Genre')
        ax.set_ylabel('Feature Value')
        ax.tick_params(axis='x', rotation=45)  # Rotate genre labels for readability
        ax.grid(True)

    fig.suptitle('Feature Distribution by Genre')
    plt.tight_layout()

    plt.show()

# Example usage:
create_feature_distribution_by_genre()


In [None]:
import seaborn as sns

# Columns used for the per-artist analysis.
artist_column = 'artist_name'
popularity_column = 'popularity'
year_column = 'year'

# Artist analysed when no name can be read interactively (see the bottom of this
# cell). 'ArtistName' is a placeholder; replace it with a real artist name.
artist_to_analyze = 'ArtistName'

def artist_insights(artist_name):
    """Plot an artist's popularity over time and print their tracks and genres.

    Reads the notebook-level ``df`` and the column-name variables above. The
    artist is matched by exact equality against the artist column. When no row
    matches, a message is printed and nothing is plotted.

    Parameters
    ----------
    artist_name : str
        Artist name to look up.
    """
    # Filter the dataset for the selected artist
    artist_data = df[df[artist_column] == artist_name]

    # Without this guard an unknown name produces an empty chart and empty
    # listings with no hint about what went wrong.
    if artist_data.empty:
        print(f"No tracks found for artist '{artist_name}'.")
        return

    # Create a line chart to visualize the artist's popularity over time
    plt.figure(figsize=(12, 6))
    sns.lineplot(x=artist_data[year_column], y=artist_data[popularity_column], marker='o', linestyle='-')

    plt.title(f'{artist_name} Popularity Over Time')
    plt.xlabel('Year')
    plt.ylabel('Popularity')
    plt.grid(True)
    plt.tight_layout()
    plt.show()

    # Display the artist's distinct track names
    discography = artist_data['track_name'].unique()
    print(f"{artist_name}'s Discography:")
    for track in discography:
        print(f"- {track}")

    # Print how many of the artist's tracks fall into each genre
    # (only if genre information is available)
    if 'genre' in df.columns:
        genre_data = artist_data['genre'].value_counts()
        print(f"\n{artist_name}'s Genre Evolution:")
        print(genre_data)

# input() is unavailable in non-interactive runs (e.g. a batch "Run All"), where
# it raises EOFError or a NotImplementedError subclass. Fall back to
# `artist_to_analyze` in that case, or when the user just presses Enter.
try:
    user_artist = input('Enter the Name of Artist: ').strip()
except (EOFError, NotImplementedError):
    user_artist = ''
if not user_artist:
    user_artist = artist_to_analyze
artist_insights(user_artist)
