In [4]:
import pandas as pd
import numpy as np

In [5]:
# Load the track table from CSV. on_bad_lines='skip' makes pandas silently drop
# malformed rows instead of raising, so the frame may hold fewer rows than the file.
# NOTE(review): '/content/...' looks like a Colab-local path - confirm before running elsewhere.
df = pd.read_csv('/content/dataset.csv', on_bad_lines='skip')
df

In [6]:
# Print column names, non-null counts and dtypes to see which columns need cleaning.
df.info()

In [None]:
# Remove the two columns that are not used by the analysis below.
# (presumably 'Unnamed: 0' is an index column saved into the CSV - TODO confirm)
# Reassigning instead of mutating in place, together with errors='ignore', keeps
# this cell safe to re-run: a second execution no longer raises KeyError because
# the columns are already gone.
df = df.drop(columns=['Unnamed: 0', 'track_id'], errors='ignore')

In [None]:
# Number of missing values in each column, checked before any rows are dropped.
df.isna().sum()

In [None]:
# Keep only the rows that have a value in every column.
df = df.dropna()

In [None]:
# Number of rows that repeat an earlier row in every column.
df.duplicated().sum()

In [None]:
# Keep the first occurrence of each fully identical row and discard the repeats.
df = df.drop_duplicates()

In [None]:
# Re-count duplicated rows after the de-duplication step; this should now be 0.
df.duplicated().sum()

In [None]:
# Re-count missing values per column after the dropna step; every count should now be 0.
df.isna().sum()

In [None]:
# Distinct genre labels, listed in order of first appearance in the frame.
df['track_genre'].unique()

In [None]:
# How many distinct genres remain after cleaning.
df['track_genre'].nunique()

In [None]:
# Number of tracks per genre, most frequent genre first.
df['track_genre'].value_counts()

# Genre Description

**Acoustic**: Music that primarily uses acoustic instruments (non-electric) and emphasizes natural sound.

**Afrobeat**: A genre that originated in West Africa, characterized by a fusion of traditional African rhythms, jazz, highlife, and funk.

**Alt-Rock (Alternative Rock)**: A broad term for rock music that doesn't fit the conventional norms, often incorporating elements from various genres.

**Alternative**: A catch-all term for music that falls outside mainstream pop and rock, often characterized by unconventional sounds and structures.

**Ambient**: Music that creates an atmospheric and abstract sound, often used for relaxation or background purposes.

**Anime**: Music associated with Japanese animation, spanning various genres from pop to rock to electronic.

**Black Metal**: A subgenre of heavy metal characterized by its dark, raw sound, shrieking vocals, and often lyrical themes related to Satanism or the occult.

**Cantopop**: Popular music from Hong Kong that blends Cantonese lyrics with various musical styles.

**Chicago House**: A subgenre of house music that originated in Chicago, known for its repetitive beats and use of synthesizers.

**Children:** Music specifically created for or enjoyed by children, often featuring simple melodies and educational content.

**Chill:** A genre focused on creating a relaxed and calming atmosphere, often with slow tempos and soothing sounds.

**Classical:** A genre that encompasses a wide range of music from the Western classical tradition, often characterized by orchestral arrangements and complex compositions.

**Club:** Music specifically designed for dance clubs, often associated with electronic dance music (EDM).

**Bluegrass**: A genre of American roots music that originated in the Appalachian region, characterized by the use of acoustic instruments like banjo and fiddle.

**Blues**: A genre that originated in African-American communities, characterized by its expressive lyrics and typically a 12-bar structure.

**Brazil**: Music from Brazil, which encompasses a diverse range of styles including samba, bossa nova, and more.

**Breakbeat**: A genre of electronic music with beats that emphasize drum breaks, often sampled from funk or jazz records.

**British**: A broad term referring to music from the United Kingdom, encompassing various genres.

**Comedy**: Music that is intentionally humorous, often featuring comedic lyrics and elements.

**Country**: A genre of American popular music that originated in the Southern United States, often characterized by storytelling lyrics and the use of acoustic instruments.

**Dance**: A broad category of music designed for dancing, often associated with electronic and pop genres.

**Dancehall**: A genre of Jamaican popular music that evolved from reggae, known for its rhythmic style and deejay (toasting) vocals.

**Death Metal**: A subgenre of heavy metal known for its aggressive and distorted guitar sound, growling vocals, and often dark lyrical themes.

**Deep House**: A subgenre of house music known for its soulful and atmospheric sound, often characterized by slower tempos and a focus on melody.





In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Same genre-count check as earlier in the notebook; no cell in between modifies df,
# so the value is unchanged.
df['track_genre'].nunique()

In [None]:
# Same per-genre track count as earlier in the notebook, repeated before plotting.
df['track_genre'].value_counts()

In [None]:
# Display the cleaned frame (pandas truncates long frames in the rendered output).
df

# Visualization

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Count the tracks in each genre. value_counts() sorts by count in descending
# order, which is the order the bar and pie charts below draw the genres in.
genre_counts = df['track_genre'].value_counts()

In [None]:
# Bar chart of the number of tracks per genre, one bar per genre label.
fig, ax = plt.subplots(figsize=(15, 8))
sns.barplot(x=genre_counts.index, y=genre_counts.values, palette='viridis', ax=ax)
ax.set_title('Distribution of Unique Track Genres')
ax.set_xlabel('Track Genre')
ax.set_ylabel('Count')
ax.tick_params(axis='x', labelrotation=90)  # vertical labels so the genre names do not overlap
plt.show()

In [None]:
# Pie chart of each genre's share of the tracks, labelled with one-decimal percentages.
fig, ax = plt.subplots(figsize=(12, 12))
ax.pie(genre_counts, labels=genre_counts.index, autopct='%1.1f%%', startangle=140)
ax.set_title('Distribution of Unique Track Genres')
plt.show()

In [None]:
# Rank genres by their mean popularity and keep the labels of the 15 highest.
mean_popularity_by_genre = df.groupby('track_genre')['popularity'].mean()
top_genres = mean_popularity_by_genre.sort_values(ascending=False).head(15).index
top_genres

In [None]:
# Boolean mask of the rows whose genre is one of the top genres, then the matching rows.
is_top_genre = df['track_genre'].isin(top_genres)
df_top_genres = df[is_top_genre]

In [None]:
# Horizontal bar plot of the average popularity of each top genre, highest first.
#
# The means are computed once with pandas and the aggregated values are plotted.
# This avoids the `ci=None` argument (deprecated in seaborn 0.12 in favour of
# `errorbar`) and the Python-level `sum(x) / len(x)` estimator that seaborn would
# otherwise evaluate over the raw rows of every genre.
genre_popularity = (
    df_top_genres.groupby('track_genre')['popularity']
    .mean()
    .sort_values(ascending=False)
)

plt.figure(figsize=(15, 5))
sns.barplot(x=genre_popularity.values, y=genre_popularity.index,
            order=genre_popularity.index,  # keep the descending-mean order on the y axis
            palette='viridis')
plt.title('Average Popularity by Track Genre (Top 15)')
plt.xlabel('Average Popularity')
plt.ylabel('Track Genre')
plt.show()

In [7]:
# Two stacked box plots over the top genres: loudness on top, speechiness below.
fig, (ax_loudness, ax_speechiness) = plt.subplots(nrows=2, ncols=1, figsize=(15, 6))
sns.boxplot(x="track_genre", y="loudness", data=df_top_genres, ax=ax_loudness)
sns.boxplot(x="track_genre", y="speechiness", data=df_top_genres, ax=ax_speechiness)
plt.show()

In [None]:
# Count the occurrences of explicit and non-explicit songs
explicit_counts = df['explicit'].value_counts()

# value_counts() orders its result by frequency, not by value, so the label and
# colour of each slice are looked up from the value it represents instead of
# being hard-coded in an assumed order. This also keeps the chart valid when
# only one of the two classes is present.
# (assumes 'explicit' holds booleans or 0/1 integers - TODO confirm)
slice_style = {False: ('Non-Explicit', 'skyblue'), True: ('Explicit', 'salmon')}
slice_labels = [slice_style[bool(flag)][0] for flag in explicit_counts.index]
slice_colors = [slice_style[bool(flag)][1] for flag in explicit_counts.index]

# Create a pie chart
plt.figure(figsize=(8, 8))
plt.pie(explicit_counts, labels=slice_labels, autopct='%1.1f%%', startangle=90, colors=slice_colors)
plt.title('Distribution of Explicit vs Non-Explicit Songs')
plt.show()


In [None]:
# Heatmap of the pairwise correlations between the numeric track attributes.

# Convert the 'explicit' column to 0/1 integers so it takes part in the correlation
df['explicit'] = df['explicit'].astype(int)

# Create a correlation matrix from the numeric/boolean columns only. The frame
# still holds text columns (e.g. 'artists', 'album_name'), and DataFrame.corr()
# raises on non-numeric data in pandas >= 2.0, where older versions silently
# dropped those columns. Selecting by dtype gives the same matrix on both.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr()

# Set up the matplotlib figure
plt.figure(figsize=(12, 10))

# Create a heatmap with the correlation matrix, printing each coefficient to two decimals
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=.5)

# Display the plot
plt.show()

In [None]:
# Display the frame again; once the heatmap cell has run, 'explicit' holds 0/1 integers.
df

In [None]:
# New frame without the 'artists' and 'album_name' columns; df itself is left unchanged.
# NOTE(review): df1 is not referenced again in the cells visible here.
df1 = df.drop(columns=['artists', 'album_name'])
df1

In [None]:
# Columns that describe how a track sounds; these are the inputs to the
# similarity computation further down.
feature_columns = [
    'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
    'instrumentalness', 'liveness', 'valence', 'explicit',
]
features = df[feature_columns]
features

In [None]:
from sklearn.preprocessing import StandardScaler
# Scaler that rescales each feature column to zero mean and unit variance, so that
# columns on different scales do not dominate the similarity.
sc = StandardScaler()

In [None]:
# Learn each column's mean/std from the feature table and apply the scaling in one step.
# By default the result is a plain NumPy array, so rows are addressed by position from here on.
features_standardized = sc.fit_transform(features)
features_standardized

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every track against every other track. With a
# single argument the rows are compared against themselves.
# NOTE(review): the result is a dense n_tracks x n_tracks matrix, so memory grows
# quadratically with the number of rows - confirm it fits for the full dataset.
cosine_sim = cosine_similarity(features_standardized)
cosine_sim

In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

# Load the dataset from the same file, and with the same bad-line handling, as the
# load at the top of the notebook (the previous 'your_dataset.csv' placeholder
# could not be read).
# NOTE: this rebinds `df` to a fresh, uncleaned read, so the dropna /
# drop_duplicates / column drops applied earlier do not carry over into the
# recommender below.
df = pd.read_csv('/content/dataset.csv', on_bad_lines='skip')

# Select relevant features for recommendation
# (unlike the earlier `features` cell, 'explicit' is not part of this list)
features = df[['danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
               'instrumentalness', 'liveness', 'valence']]

# Standardize the features so every column contributes on the same scale
scaler = StandardScaler()
features_standardized = scaler.fit_transform(features)

# Calculate cosine similarity between tracks; row i holds the similarity of track i
# to every track, by row position. The matrix is dense n_tracks x n_tracks, so
# memory grows quadratically with the number of rows.
cosine_sim = cosine_similarity(features_standardized, features_standardized)

# Function to get recommendations based on track index
def get_recommendations(track_index, cosine_sim_matrix, df, top_n=10):
    """Return the tracks most similar to the track at row position ``track_index``.

    Parameters
    ----------
    track_index : int
        Row position (not index label) of the query track. Negative positions
        count from the end.
    cosine_sim_matrix : square 2-D array-like
        ``cosine_sim_matrix[i][j]`` is the similarity between rows ``i`` and ``j``.
    df : pandas.DataFrame
        Frame whose row order matches the matrix; must contain the columns
        'artists', 'album_name', 'track_name', 'popularity', 'duration_ms'
        and 'track_genre'.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.DataFrame
        Up to ``top_n`` rows of ``df`` (the six columns listed above), ordered
        from most to least similar. The query track is never included.

    Raises
    ------
    IndexError
        If ``track_index`` is outside the matrix.
    """
    n_tracks = len(cosine_sim_matrix)
    if not -n_tracks <= track_index < n_tracks:
        raise IndexError(
            f"track_index {track_index} is out of range for {n_tracks} tracks"
        )
    query_position = track_index % n_tracks  # normalise negative positions

    # Drop the query track by position rather than assuming it sorts first: a
    # track with identical features ties with it at the maximum similarity, and
    # slicing off "the first result" could then keep the query and lose the tie.
    candidates = [
        (position, score)
        for position, score in enumerate(cosine_sim_matrix[query_position])
        if position != query_position
    ]
    # The sort is stable, so tied scores keep their original row order.
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    top_positions = [position for position, _ in candidates[:top_n]]

    return df[['artists', 'album_name', 'track_name', 'popularity', 'duration_ms', 'track_genre']].iloc[top_positions]

# Example: look up the tracks most similar to the first row of the frame (position 0)
recommendations = get_recommendations(track_index=0, cosine_sim_matrix=cosine_sim, df=df)
print(recommendations)
