In [79]:
import math
import os
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import skew
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import mean_squared_error,r2_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler

# Classification
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier

In [80]:
import warnings

# Silence every warning category for the remainder of the session.
# This is a blanket filter: deprecation and convergence warnings are hidden too.
warnings.simplefilter("ignore")

In [81]:
# Location of the tracks dataset (Parquet).
# Set the TRACKS_PARQUET_PATH environment variable to point the notebook at a
# different copy of the file without editing this cell; when it is not set,
# the original local path is used, so existing setups keep working.
parquet_file_path = os.environ.get(
    "TRACKS_PARQUET_PATH",
    r"C:\Guvi\Project\Final Project 1\0000 (1).parquet",
)

# Read the Parquet file into a pandas DataFrame
df = pd.read_parquet(parquet_file_path)

# Display the first row as a quick sanity check of the loaded columns
df.head(1)

Unnamed: 0.1,Unnamed: 0,track_id,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,...,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,0,5SuOikwiRyPMVoIQDJUgSV,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,...,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic


In [82]:
# Discard columns that are not used later: 'Unnamed: 0' (presumably a leftover
# index column - confirm) and the 'track_id' identifier.
df = df.drop(columns=['Unnamed: 0', 'track_id'])

In [83]:
# Columns whose distribution skewness is reported, in print order
skew_columns = ['popularity', 'duration_ms', 'danceability', 'energy', 'loudness', 'speechiness',
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']

# Compute every skewness value once, keyed by column name
skewness_by_column = {column: skew(df[column]) for column in skew_columns}

# Keep the individual per-column names and the combined list available,
# exactly as before, for any later cell that refers to them.
(sv_popularity, sv_duration_ms, sv_danceability, sv_energy, sv_loudness, sv_speechiness,
 sv_acousticness, sv_instrumentalness, sv_liveness, sv_valence, sv_tempo) = skewness_by_column.values()

sv = list(skewness_by_column.values())

for column, value in skewness_by_column.items():
    print(f"The skewness of {column} is: {value}")

The skewness of popularity is: 0.04640190527012667
The skewness of duration_ms is: 11.195034174569583
The skewness of danceability is: -0.39949137696049875
The skewness of energy is: -0.5969935617097103
The skewness of loudness is: -2.0065155317630916
The skewness of speechiness is: 4.647454866076131
The skewness of acousticness is: 0.7272852910175864
The skewness of instrumentalness is: 1.734383371462091
The skewness of liveness is: 2.105710409930272
The skewness of valence is: 0.11507652786076844
The skewness of tempo is: 0.23229180402927552


In [84]:
# Preview the first five rows (head() returns 5 rows by default)
df.head()

Unnamed: 0,artists,album_name,track_name,popularity,duration_ms,explicit,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,track_genre
0,Gen Hoshino,Comedy,Comedy,73,230666,False,0.676,0.461,1,-6.746,0,0.143,0.0322,1e-06,0.358,0.715,87.917,4,acoustic
1,Ben Woodward,Ghost (Acoustic),Ghost - Acoustic,55,149610,False,0.42,0.166,1,-17.235,1,0.0763,0.924,6e-06,0.101,0.267,77.489,4,acoustic
2,Ingrid Michaelson;ZAYN,To Begin Again,To Begin Again,57,210826,False,0.438,0.359,0,-9.734,1,0.0557,0.21,0.0,0.117,0.12,76.332,4,acoustic
3,Kina Grannis,Crazy Rich Asians (Original Motion Picture Sou...,Can't Help Falling In Love,71,201933,False,0.266,0.0596,0,-18.515,1,0.0363,0.905,7.1e-05,0.132,0.143,181.74,3,acoustic
4,Chord Overstreet,Hold On,Hold On,82,198853,False,0.618,0.443,2,-9.681,1,0.0526,0.469,0.0,0.0829,0.167,119.949,4,acoustic


In [85]:
# Numerical columns that are standardised and later used as the similarity features.
# The order of this list is also the feature order the scaler is fitted with.
numerical_features = [
    'key', 'mode', 'popularity', 'danceability', 'valence', 'tempo', 'time_signature',
    'duration_ms', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
    'liveness',
]

# Fit the scaler on the current column values, then overwrite those columns with
# the standardised values. The fitted `scaler` is kept for transforming new input.
# NOTE: this overwrites df in place, so re-running the cell without reloading df
# would re-fit the scaler on already standardised values.
scaler = StandardScaler()
scaler.fit(df[numerical_features])
df[numerical_features] = scaler.transform(df[numerical_features])

In [72]:
# Columns to be label encoded
columns_to_encode = ['artists', 'album_name', 'explicit', 'track_name']

# One fitted LabelEncoder per source column, kept so integer codes can be
# mapped back to the original labels with inverse_transform.
label_encoders = {}
for column in columns_to_encode:
    le = LabelEncoder()
    # Write the integer codes to a separate "<column>_encoded" column instead of
    # overwriting the source column:
    #  * recommend_tracks returns 'artists', 'track_name' and 'album_name' for
    #    display, so those columns must keep their readable values;
    #  * the cell stays safe to re-run, because the encoder is always fitted on
    #    the original labels rather than on previously generated codes.
    df[f"{column}_encoded"] = le.fit_transform(df[column])
    label_encoders[column] = le

In [88]:
# Function to recommend tracks based on user preferences
def recommend_tracks(user_preferences, df):
    user_df = pd.DataFrame([user_preferences])
    user_df[numerical_features] = scaler.transform(user_df[numerical_features])
    
    # Calculate cosine similarity between user preferences and tracks
    similarity_matrix = cosine_similarity(user_df[numerical_features], df[numerical_features])
    
    # Get indices of tracks sorted by similarity
    track_indices = similarity_matrix.argsort()[0][::-1]
    
    # Recommend top tracks
    recommendations = df.iloc[track_indices[:5]]
    
    return recommendations[['artists', 'track_name', 'album_name']]

In [93]:
# Example user preferences: one value per feature used for the similarity search.
# NOTE(review): mode=3, instrumentalness=6.4 and loudness=0.992 look out of range
# compared with the rows shown by df.head() (mode is 0 or 1, loudness is negative,
# instrumentalness is far below 1) - confirm these values are intended.
user_preferences = dict(
    danceability=0.6,
    valence=0.8,
    tempo=100,
    energy=0.5,
    loudness=0.992,
    key=5,
    mode=3,
    popularity=34,
    time_signature=4,
    duration_ms=280078,
    speechiness=0.23,
    acousticness=0.98,
    instrumentalness=6.4,
    liveness=0.2,
)

# Look up the tracks closest to these preferences
recommendations = recommend_tracks(user_preferences, df)

# Wrap the result in its own DataFrame and show it as the cell output
df_rec = pd.DataFrame(recommendations)
df_rec

Unnamed: 0,artists,track_name,album_name
111562,Air,Mer du Japon,Pocket Symphony
7205,Nefesh Mountain;Sam Bush;Jerry Douglas,Big Mountain,Songs For The Sparrows
107458,Trevor Something,Enjoy the Silence,Enjoy the Silence
38352,No Vacation,Estrangers,Estrangers
38704,Good Morning,On The Street,On The Street / You
