In [None]:
#Importing necessary libraries
import pandas as pd

In [None]:
#Reading the csv
pitch = pd.read_csv('pitchfork.csv',sep=',')

In [None]:
pitch.head()

Unnamed: 0,artist,album,genre,score,date,author,role,review,bnm,link,label,release_year
0,David Byrne,“…The Best Live Show of All Time” — NME EP,Rock,5.5,January 11 2019,Andy Beta,Contributor,"Viva Brother, Terris, Mansun, the Twang, Joe L...",0,https://pitchfork.com/reviews/albums/david-byr...,Nonesuch,2018.0
1,DJ Healer,Lost Lovesongs / Lostsongs Vol. 2,Electronic,6.2,January 11 2019,Chal Ravens,Contributor,"The Prince of Denmark—that is, the proper prin...",0,https://pitchfork.com/reviews/albums/dj-healer...,Planet Uterus,2019.0
2,Jorge Velez,Roman Birds,Electronic,7.9,January 10 2019,Philip Sherburne,Contributing Editor,"Jorge Velez has long been prolific, but that’s...",0,https://pitchfork.com/reviews/albums/jorge-vel...,Self-released,2019.0
3,Chandra,Transportation EPs,Rock,7.8,January 10 2019,Andy Beta,Contributor,When the Avalanches returned in 2016 after an ...,0,https://pitchfork.com/reviews/albums/chandra-t...,Telephone Explosion,2018.0
4,The Chainsmokers,Sick Boy,Electronic,3.1,January 9 2019,Larry Fitzmaurice,Contributor,We’re going to be stuck with the Chainsmokers ...,0,https://pitchfork.com/reviews/albums/the-chain...,"Disruptor,Columbia",2018.0


In [None]:
pitch.shape

(20873, 12)

In [None]:
pitch=pitch[:5000].copy()

In [None]:
#Preprocessing
def check_df(dataframe, head=5):
    print(" SHAPE ".center(70,'-'))
    print('Rows: {}'.format(dataframe.shape[0]))
    print('Columns: {}'.format(dataframe.shape[1]))
    print(" TYPES ".center(70,'-'))
    print(dataframe.dtypes)
    print(" MISSING VALUES ".center(70,'-'))
    print(dataframe.isnull().sum())
    print(" DUPLICATED VALUES ".center(70,'-'))
    print(dataframe.duplicated().sum())
    print(" DESCRIBE ".center(70,'-'))
    print(dataframe.describe().T)

check_df(pitch)

------------------------------- SHAPE --------------------------------
Rows: 5000
Columns: 12
------------------------------- TYPES --------------------------------
artist           object
album            object
genre            object
score           float64
date             object
author           object
role             object
review           object
bnm               int64
link             object
label            object
release_year    float64
dtype: object
--------------------------- MISSING VALUES ---------------------------
artist            1
album             1
genre           209
score             0
date              0
author            0
role             36
review            0
bnm               0
link              0
label             5
release_year      3
dtype: int64
------------------------- DUPLICATED VALUES --------------------------
0
------------------------------ DESCRIBE ------------------------------
               count         mean       std     min     25%     5

In [None]:
missing_values =pitch.isnull().sum()
missing_values

artist            1
album             1
genre           209
score             0
date              0
author            0
role             36
review            0
bnm               0
link              0
label             5
release_year      3
dtype: int64

In [None]:
pitch.fillna('', inplace=True)
pitch.isnull().sum()

artist          0
album           0
genre           0
score           0
date            0
author          0
role            0
review          0
bnm             0
link            0
label           0
release_year    0
dtype: int64

In [None]:
#Feature engineering
from datetime import datetime

pitch['date'] = pd.to_datetime(pitch['date'])
pitch['review year'] = pitch['date'].dt.year
pitch.drop('date', axis=1, inplace=True)

In [None]:
pitch.head()

Unnamed: 0,artist,album,genre,score,author,role,review,bnm,link,label,release_year,review year
0,David Byrne,“…The Best Live Show of All Time” — NME EP,Rock,5.5,Andy Beta,Contributor,"Viva Brother, Terris, Mansun, the Twang, Joe L...",0,https://pitchfork.com/reviews/albums/david-byr...,Nonesuch,2018.0,2019
1,DJ Healer,Lost Lovesongs / Lostsongs Vol. 2,Electronic,6.2,Chal Ravens,Contributor,"The Prince of Denmark—that is, the proper prin...",0,https://pitchfork.com/reviews/albums/dj-healer...,Planet Uterus,2019.0,2019
2,Jorge Velez,Roman Birds,Electronic,7.9,Philip Sherburne,Contributing Editor,"Jorge Velez has long been prolific, but that’s...",0,https://pitchfork.com/reviews/albums/jorge-vel...,Self-released,2019.0,2019
3,Chandra,Transportation EPs,Rock,7.8,Andy Beta,Contributor,When the Avalanches returned in 2016 after an ...,0,https://pitchfork.com/reviews/albums/chandra-t...,Telephone Explosion,2018.0,2019
4,The Chainsmokers,Sick Boy,Electronic,3.1,Larry Fitzmaurice,Contributor,We’re going to be stuck with the Chainsmokers ...,0,https://pitchfork.com/reviews/albums/the-chain...,"Disruptor,Columbia",2018.0,2019


In [None]:
artist_features = pitch.groupby('artist').agg({'score': 'mean'})
artist_features

Unnamed: 0_level_0,score
artist,Unnamed: 1_level_1
,7.5
Gia Margaret,7.4
JPEGMAFIA,7.7
!!!,7.1
(Sandy) Alex G,8.4
...,...
µ-Ziq,5.8
Âme,4.8
Çaykh,7.3
тпсб,7.8


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(pitch['genre'])
tfidf_matrix.shape

(5000, 10)

In [None]:
from sklearn.metrics.pairwise import linear_kernel
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [None]:
indices = pd.Series(pitch.index, index=pitch['artist'])

In [None]:
def recommend_artists(artist, cosine_sim=cosine_sim):
    idx = indices[artist]
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:11]
    artist_indices = [i[0] for i in sim_scores]
    return pitch['artist'].iloc[artist_indices]

In [None]:
#Using the recommender over an Artist
recommendations = recommend_artists('Jorge Velez')
recommendations

2          Jorge Velez
4     The Chainsmokers
5       Silent Servant
14             Hammock
28        Scott Hirsch
29       The 7th Plain
64             Waajeed
65           The Samps
74           Daft Punk
78               BEAST
Name: artist, dtype: object

should we make all artists as one artist or keep them seperate cause they may represent other albums of theirs

In [None]:
#Checking why the artists are similar in terms of genre
pitch1 = pitch.loc[pitch['artist'] == 'Jorge Velez']
pitch1_genre = pitch1['genre'].values[0]
pitch1_genre

'Electronic'

In [None]:
pitch2 = pitch.loc[pitch['artist'] == 'The Chainsmokers']
pitch2_genre = pitch2['genre'].values[0]
pitch2_genre

'Electronic'