In [447]:
import pandas as pd
import numpy as np
import random as rn
import math




In [448]:
data = pd.read_csv('spotify-yt-fixed.csv')


In [449]:
class DataProcessing:
    @staticmethod
    def shuffle(x):
        for i in range(len(x) - 1, 0, -1):
            j = rn.randint(0, i - 1)
            x.iloc[i], x.iloc[j] = x.iloc[j], x.iloc[i]

    @staticmethod
    def normalization(x, names):
        #values = x.select_dtypes(exclude="object")
        columnNames = names
        for column in columnNames:
            data = x.loc[:, column]
            max1 = max(data)
            min1 = min(data)
            for row in range(0, len(x), 1):
                xprim = (x.at[row, column] - min1) / (max1 - min1)
                x.at[row, column] = xprim

    @staticmethod
    def getRating(x):
        lis = [0] * len(x)
        i = 0
        for index, row in x.iterrows():
            lis[i] +=  math.sqrt(row['Views']) + 2 * row['Likes'] / (row['Views']+1) + (row['Comments'] * 0.5) / (row['Views'] +1)
            i+=1
        return lis

    @staticmethod
    def split(x, k):  # k = 0.7, 70% do treningowego
        splitPoint = int(len(x) * k)
        return x.iloc[0:splitPoint], x.iloc[splitPoint:]

    @staticmethod
    def getDistances(x, newObj, columnNames, function, power):
        lis = [0] * len(x)
        i = 0
        for index, row in x.iterrows():
            for column in columnNames:
                lis[i] +=(function(row[column], newObj[column], power))
            i+=1
        return lis

    @staticmethod
    def manhattan(first, second, n):
        return abs(first - second)
                
    @staticmethod
    def euclides(first, second, n):
        return math.pow(math.pow(first, n) - math.pow(second, n), 1/n)

    @staticmethod
    def sort(x, lis):
        x['distance'] = lis
        return x.sort_values(by=['distance'])


    @staticmethod
    def isBanger(x, bar):
        lis = [0]*len(x)
        i = 0
        for index, row in x.iterrows():
            if row['Rating'] >= bar:
                lis[i] = 1
            i+=1
        return lis
            


    @staticmethod
    def NaiveBayes(x, sample, classCol, columnNames):
        classes = x[classCol].unique().tolist()
        
        res = {}
        for var in classes:
            res[var] = []
            for cl in columnNames:
                values = x.loc[x[classCol] == var, cl]
                mean = values.mean()
                sigm2 = values.std()**2
                if sigm2 == 0:
                    print(sigm2)
                    sigm2=0.00000001
                
                res[var].append(DataProcessing.gauss(sample[cl], mean, sigm2))
            res[var] = 1 / len(classes) * np.prod(res[var])
        return max(res, key=res.get)

    @staticmethod
    def bayes(x, sample, classCol, colNames):
        classes = x[classCol].unique().tolist()
        res = {}
        lis = []
        for i in classes:
            lis.append(x[x[classCol]==i])

        for var in lis:
            res[str(var[classCol][0])] = []
            for cl in colNames:
                values = x[cl]
                mean = values.mean()
                sigm2 = values.std()**2
                if sigm2 == 0:
                    sigm2=0.00000001
                
                res[str(var[classCol][0])].append(DataProcessing.gauss(sample[cl], mean, sigm2))
            res[str(var[classCol][0])] = 1 / len(classes) * np.prod(res[str(var[classCol][0])])
        return max(res, key=res.get)
               



    @staticmethod
    def gauss(x, mu, sigm2):
        return (1 / sigm2 * np.sqrt(2 * np.pi)) * np.exp(-0.5 * ((x - mu) / sigm2)**2)

    @staticmethod
    def useBayes(tS, vS):
        counter = 0
        for i in range(0, len(vS)):
            print(DataProcessing.NaiveBayes(tS, vS.iloc[i], 'variety'), vS.iloc[i]['variety'])
            if DataProcessing.NaiveBayes(tS, vS.iloc[i], 'variety') == vS.iloc[i]['variety']:
                counter += 1
        return counter / len(vS)


In [450]:
data = data.sample(frac=1)
data.iloc[0]

Unnamed: 0                        6632
Artist                   Savage Garden
Track               I Knew I Loved You
Danceability                      0.61
Energy                           0.497
Key                                9.0
Loudness                         -9.88
Speechiness                     0.0279
Acousticness                      0.32
Instrumentalness              0.000061
Liveness                        0.0884
Valence                          0.739
Tempo                           85.031
Duration_ms                   250360.0
Views                      204233282.0
Likes                         974735.0
Comments                       38411.0
Name: 6460, dtype: object

In [451]:
DataProcessing.normalization(data,['Loudness', 'Tempo', 'Duration_ms', 'Views', 'Likes', 'Comments'] )
rating = DataProcessing.getRating(data)
data['Rating'] = rating
bar = data.sort_values('Rating', ascending=False).loc[2000, 'Rating']
data['Hit'] = DataProcessing.isBanger(data, bar)
data.drop(labels=['Views', 'Likes', 'Comments', 'Rating', 'Key'], inplace=True, axis=1)
data

Unnamed: 0.1,Unnamed: 0,Artist,Track,Danceability,Energy,Loudness,Speechiness,Acousticness,Instrumentalness,Liveness,Valence,Tempo,Duration_ms,Hit
6460,6632,Savage Garden,I Knew I Loved You,0.610,0.497,0.771046,0.0279,0.32000,0.000061,0.0884,0.739,0.349387,0.047227,1
10899,11243,Kavita Seth,"Tum Hi Ho Bandhu (From ""Cocktail"")",0.714,0.892,0.867249,0.0516,0.00145,0.000817,0.0462,0.899,0.554694,0.054042,1
4478,4607,Boyz II Men,If You Leave Me Now (feat. Boyz II Men),0.362,0.268,0.816052,0.0399,0.90900,0.000000,0.1500,0.394,0.336571,0.045677,1
15742,16270,Joey Bada$$,Righteous Minds,0.809,0.638,0.860804,0.2800,0.26200,0.000000,0.4530,0.608,0.389909,0.041562,1
11919,12289,The Ronettes,Frosty the Snowman,0.382,0.879,0.819720,0.0642,0.22800,0.000000,0.4080,0.559,0.590623,0.022674,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7531,7742,Natalie Cole,Fever,0.671,0.417,0.764071,0.2100,0.40200,0.000000,0.1200,0.331,0.548777,0.038723,1
6223,6393,Vedo,Let's Get Married,0.682,0.584,0.823875,0.2500,0.27700,0.000000,0.0970,0.801,0.543107,0.025137,1
13125,13560,Avicii,Wake Me Up,0.532,0.783,0.859723,0.0523,0.00380,0.001200,0.1610,0.643,0.509837,0.046596,1
15527,16030,Halsey,Boy With Luv (feat. Halsey),0.645,0.862,0.879566,0.0845,0.09330,0.000000,0.1930,0.803,0.492855,0.042795,1


max_row = df['A'].idxmax()

In [452]:
ts, vs = DataProcessing.split(data, 0.9)




In [454]:
counter =0
smallerts = ts[:len(ts)//4]
smallervs = vs[:len(vs)//2]
stac = ['Danceability', 'Energy', 'Loudness', 'Acousticness', 'Instrumentalness',
'Liveness', 'Valence', 'Tempo', 'Duration_ms']
for i, r in smallervs.iterrows():
    lis = DataProcessing.getDistances(smallerts, r, stac, DataProcessing.manhattan, 3)
    recs = DataProcessing.sort(smallerts, lis)
    f100 = recs[:100]
    if DataProcessing.NaiveBayes(f100, r, 'Hit', stac) == r['Hit']:
        counter+=1
counter/len(smallervs)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  x['distance'] = lis


0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


0.8537313432835821