<a href="https://colab.research.google.com/github/Vhaatever/Music-Plagiarsim/blob/main/Music_Plagiarism_using_analytical_schema.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import librosa
import sklearn
import os
import time
import math
from sklearn.datasets import load_digits
from sklearn.manifold import LocallyLinearEmbedding
from scipy.optimize import linear_sum_assignment
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [None]:
#Applying LLE
def apply_LLE(Xdb, n_components=3, random_state=0):
  """Reduce a feature matrix with Locally Linear Embedding.

  :param Xdb: 2-D array. It is transposed before fitting, so every column of
              ``Xdb`` is treated as one sample.
  :param n_components: number of embedding dimensions (previously hard-coded to 3).
  :param random_state: seed forwarded to scikit-learn. Without it the ARPACK
                       eigen-solver draws its start vector from the global RNG,
                       so two runs on the same audio can give different embeddings.
  :return: the ``fit_transform`` result, one row per column of ``Xdb`` and
           ``n_components`` columns.
  """
  embedding = LocallyLinearEmbedding(n_components=n_components, random_state=random_state)
  return embedding.fit_transform(Xdb.transpose())

In [None]:
#Feature extraction
def extract_all(name,sampling_rate):
  """Load an audio file and build its per-frame feature matrix.

  :param name: path of the audio file handed to ``librosa.load``.
  :param sampling_rate: sample rate the audio is resampled to while loading.
  :return: the transposed feature stack, i.e. one row per frame and one column
           per feature. Columns are, in order: the first spectrogram embedding
           component, the spectral centroid, the spectral roll-off, the MFCC
           embedding rows and the chromatogram embedding rows.
  """
  # `sr` has to be a keyword: librosa >= 0.10 made every argument after the
  # path keyword-only, so the old positional call raises a TypeError there.
  x, _ = librosa.load(name, sr=sampling_rate)
  feature_rows = (
      spectrogram(x)[0,:],            # only the first embedding component is kept
      centroid(x, sampling_rate),
      roll_off(x),                    # NOTE: reads the global `sampling_rate`, not the argument
      MFCC(x),
      chromatogram(x, sampling_rate),
      #FWHM(x, sampling_rate),
  )
  # np.vstack promotes the 1-D spectrogram row to 2-D, exactly like the
  # former chain of pairwise vstack calls did.
  return np.vstack(feature_rows).transpose()


In [None]:
class Music:
  """One audio track, its feature matrix and the "words" cut from it."""

  def __init__(self, path, input_index):
    '''
    Load a track and extract its features.

    :param path: address of the music; when None nothing is loaded and no
                 attribute is set on the instance
    :param input_index: identifier stored as ``self.index`` (printed once the
                        features have been extracted)
    '''
    if path is None:
      return
    self.index=input_index
    self.name = path
    # `sampling_rate` is a notebook-level global that must be defined before
    # the first Music is constructed.
    self.features =extract_all(self.name,sampling_rate)
    print(self.index)

  def split_into_pieces(self, word_length ,overlap_rate):
    '''
    Cut ``self.features`` into consecutive, possibly overlapping "words".

    :param word_length: number of rows (letters) per word
    :param overlap_rate: fraction of a word shared with the next one; the
                         overlap in rows is ``int(word_length*overlap_rate)``
    :raises ValueError: when the step between two words would be <= 0, which
                        previously made the loop spin forever
    Sets ``pieces``, ``num_words``, ``num_letters`` and ``num_features``.
    Trailing words may be shorter than ``word_length``.
    '''
    overlap=int(word_length*overlap_rate)
    jump= word_length-overlap
    if jump <= 0:
      raise ValueError(
          "word_length=%r with overlap_rate=%r gives a non-positive step (%d)"
          % (word_length, overlap_rate, jump))
    self.word_length=word_length
    self.overlap_rate=overlap_rate
    # Slicing past the end simply truncates, so the short tail words come out
    # of the same expression as the full-length ones.
    self.pieces=[self.features[idx:idx+word_length]
                 for idx in range(0, len(self.features), jump)]
    self.num_words=len(self.pieces)
    self.num_letters=len(self.pieces[0])
    self.num_features=len(self.pieces[0][0])

  def take_sub_string(self, num):
    '''
    Keep only the first ``num`` words and report the resulting sizes.

    :param num: number of leading words to keep
    '''
    self.pieces=self.pieces[:num]
    self.num_words=len(self.pieces)
    self.num_letters=len(self.pieces[0])
    self.num_features=len(self.pieces[0][0])
    print("Number of words- %d" % len(self.pieces))
    print("Number of letters- %d" % len(self.pieces[0]))
    print("Number of features- %d" % len(self.pieces[0][0]))


In [None]:
#Individual features 
def spectrogram(x):
  """Return the LLE embedding of the dB-scaled STFT magnitude of ``x``.

  :param x: audio signal handed to ``librosa.stft``.
  :return: the transposed output of ``apply_LLE``.
  """
  magnitude = abs(librosa.stft(x))
  embedded = apply_LLE(librosa.amplitude_to_db(magnitude))
  return embedded.transpose()

def centroid(x, sampling_rate):
  """Standardised spectral centroid of ``x``.

  :param x: audio signal.
  :param sampling_rate: sample rate of ``x``.
  :return: array with a single row holding the centroid per frame, scaled
           with the module-level ``scaler`` (refitted on every call).
  """
  # The signal must be passed as `y=`: librosa >= 0.10 made the arguments of
  # the feature functions keyword-only.
  spectral_centroids = librosa.feature.spectral_centroid(y=x, sr=sampling_rate)[0]
  # One fit_transform replaces the former fit_transform (result discarded)
  # followed by transform on the same data.
  standardized = scaler.fit_transform(spectral_centroids.reshape(-1,1))
  return standardized.transpose()

def roll_off(x, sr=None):
  """Standardised spectral roll-off of ``x``.

  :param x: audio signal; a constant 0.01 is added before the analysis
            (kept from the original code).
  :param sr: sample rate of ``x``. When omitted the notebook-level global
             ``sampling_rate`` is used, which is what the one-argument call
             has always done.
  :return: array with a single row holding the roll-off per frame, scaled
           with the module-level ``scaler`` (refitted on every call).
  """
  rate = sampling_rate if sr is None else sr
  # `y=` is required by librosa >= 0.10 (keyword-only feature arguments).
  spectral_rolloff = librosa.feature.spectral_rolloff(y=x+0.01, sr=rate)[0]
  standardized = scaler.fit_transform(spectral_rolloff.reshape(-1,1))
  return standardized.transpose()

def FWHM(x, sampling_rate):
  """LLE embedding of the spectral bandwidth of ``x`` for p = 2, 3 and 4.

  :param x: audio signal; a constant 0.01 is added before the analysis
            (kept from the original code).
  :param sampling_rate: sample rate of ``x``.
  :return: the transposed output of ``apply_LLE``.
  """
  shifted = x+0.01
  # `y=` is required by librosa >= 0.10 (keyword-only feature arguments).
  bandwidths = [
      librosa.feature.spectral_bandwidth(y=shifted, sr=sampling_rate)[0],
      librosa.feature.spectral_bandwidth(y=shifted, sr=sampling_rate, p=3)[0],
      librosa.feature.spectral_bandwidth(y=shifted, sr=sampling_rate, p=4)[0],
  ]
  # The stack used to start from np.empty_like(...), which put a row of
  # uninitialised memory into the embedding input. Only real data goes in now.
  output= apply_LLE(np.vstack(bandwidths))
  return output.transpose()

def MFCC(x, sr=22050):
  """Standardised LLE embedding of the MFCCs of ``x``.

  :param x: audio signal.
  :param sr: sample rate handed to librosa. The default equals librosa's own
             default, so the one-argument call behaves as before.
             NOTE(review): ``extract_all`` loads audio at ``sampling_rate`` but
             calls ``MFCC(x)`` without it, so the rates differ unless
             ``sampling_rate`` is 22050. Confirm and pass ``sr`` from the caller.
  :return: the transposed, standardised output of ``apply_LLE``.
  """
  # `y=` is required by librosa >= 0.10 (keyword-only feature arguments).
  mfccs = apply_LLE(librosa.feature.mfcc(y=x, sr=sr))
  # One fit_transform replaces fit_transform (discarded) + transform.
  mfccs = scaler.fit_transform(mfccs)
  return mfccs.transpose()

def chromatogram(x, sampling_rate):
  """Standardised LLE embedding of the STFT chromagram of ``x``.

  :param x: audio signal.
  :param sampling_rate: sample rate of ``x``.
  :return: the transposed, standardised output of ``apply_LLE``.
  """
  # `y=` is required by librosa >= 0.10 (keyword-only feature arguments).
  chromagram = apply_LLE(librosa.feature.chroma_stft(y=x, sr=sampling_rate))
  # One fit_transform replaces fit_transform (discarded) + transform.
  chromagram = scaler.fit_transform(chromagram)
  return chromagram.transpose()

In [None]:
  #Edit Distance....... 
def distance_song(song1, song2, feature_weights):
  """Per-feature matching cost between the words of two songs.

  For every feature column a cost matrix is filled with the edit distance
  between each word of ``song1`` and each word of ``song2``. The words are then
  matched with the minimum-cost assignment and the matched costs are summed.

  :param song1: object exposing ``num_words`` and ``pieces`` (2-D words).
  :param song2: same as ``song1``.
  :param feature_weights: one weight per feature column; its length decides
                          how many columns are compared.
  :return: list with one summed assignment cost per entry of ``feature_weights``.
  """
  # Converting the words once instead of once per feature / per pair.
  words1 = [np.array(song1.pieces[x]) for x in range(song1.num_words)]
  words2 = [np.array(song2.pieces[y]) for y in range(song2.num_words)]

  answer=[]
  for i, feature_weight in enumerate(feature_weights):
    # A real 2-D array: the former `[[0]*n]*m` made every row the *same* list,
    # so each row was overwritten by the next and the assignment ran on a
    # matrix consisting of copies of the last row.
    bipartite_matrix = np.zeros((len(words1), len(words2)))
    for x, word1 in enumerate(words1):
      for y, word2 in enumerate(words2):
        bipartite_matrix[x, y] = edit_distance_dp(word1[:,i], word2[:,i], feature_weight)
    row,col= linear_sum_assignment(bipartite_matrix)
    answer.append(bipartite_matrix[row,col].sum())
  return answer
                                                        

In [None]:
def edit_distance_dp(seq1, seq2, feature_weight, gap_scale=5, sub_scale=0.1):
  """Weighted edit distance between two numeric sequences.

  Skipping an element ``v`` costs ``gap_scale*feature_weight*abs(v)``.
  Replacing ``a`` by ``b`` costs ``sub_scale*feature_weight*abs(a-b)``.
  Equal elements are matched for free.

  :param seq1: first sequence of numbers.
  :param seq2: second sequence of numbers.
  :param feature_weight: multiplier applied to every operation cost.
  :param gap_scale: factor for skipping an element (previously the literal 5).
  :param sub_scale: factor for a substitution (previously the literal 0.1).
  :return: cost of transforming all of ``seq1`` into all of ``seq2``.
           Empty sequences are supported.
  """
  n, m = len(seq1), len(seq2)
  # The table has one extra row and column for the empty prefix. The former
  # len(seq1) x len(seq2) table had no such border, so `row-1` / `col-1`
  # wrapped around to index -1 and read the (still zero) last row/column, and
  # the equality test compared the *previous* elements instead of the current
  # ones.
  cost = np.zeros((n + 1, m + 1))

  for row in range(1, n + 1):
    cost[row][0] = cost[row-1][0] + gap_scale*feature_weight*abs(seq1[row-1])
  for col in range(1, m + 1):
    cost[0][col] = cost[0][col-1] + gap_scale*feature_weight*abs(seq2[col-1])

  for row in range(1, n + 1):
    a = seq1[row-1]
    ins_cost = gap_scale*feature_weight*abs(a)
    for col in range(1, m + 1):
      b = seq2[col-1]
      if a == b:
        cost[row][col] = cost[row-1][col-1]
      else:
        del_cost = gap_scale*feature_weight*abs(b)
        sub_cost = sub_scale*feature_weight*abs(a-b)
        cost[row][col] = min(cost[row-1][col] + ins_cost,
                             cost[row][col-1] + del_cost,
                             cost[row-1][col-1] + sub_cost)
  return cost[n, m]

In [None]:
#Driver class
# Folder that is scanned for the audio files to compare.
data_path='drive/MyDrive/A SOP Songs/'
# Sample rate used for feature extraction; read as a global by Music and roll_off.
sampling_rate=20000
# Upper bound on the number of files that are loaded.
max_songs=6

In [None]:
# Load at most `max_songs` tracks and time the feature extraction of each one.
# os.listdir returns entries in arbitrary order; sorting keeps the song
# indices (and therefore the distance matrix layout) stable between runs.
files = sorted(os.listdir(data_path))
file_num = len(files)
musics = []
time_feature=0
for i, file in enumerate(files[:max_songs]):
  start=time.time()
  music = Music(os.path.join(data_path, file), i)
  end=time.time()
  # splitext drops the extension whatever its length; `file[:-3]` left the dot behind.
  print(os.path.splitext(file)[0]+" - Feature Extracted")
  print(f"Time taken to extract features {end - start}")
  time_feature=time_feature+(end-start)
  musics.append(music)
# Later cells read `i` as the number of loaded songs.
i = len(musics)
print(f"Time taken for feature extraction: {time_feature}")
if i:
  print(f"Average Time taken for feature extraction: {time_feature/i}")
else:
  # Avoid a ZeroDivisionError when the folder is empty or max_songs is 0.
  print("No songs were loaded from " + data_path)


0
Elvis Presley - Marguerita 1963. - Feature Extracted
Time taken to extract features 30.34813117980957
1
Oasis - Whatever (Official Video). - Feature Extracted
Time taken to extract features 198.56486821174622
2
How Sweet to Be an Idiot (2007 Remaster). - Feature Extracted
Time taken to extract features 20.393435955047607
3
Matt Cardle - Amazing. - Feature Extracted
Time taken to extract features 36.11031699180603
4
Ed Sheeran - Photograph (Official Music Video). - Feature Extracted
Time taken to extract features 50.23399496078491
5
Rappin 4 Tay - Players Club. - Feature Extracted
Time taken to extract features 73.91081237792969
Time taken for feature extraction409.561559677124
Average Time taken for feature extraction68.26025994618733


In [None]:
def _display_name(path):
  """Return the file name of ``path`` without its directory and extension."""
  return os.path.splitext(os.path.basename(path))[0]


def _print_song_index(songs):
  """Print one ``index: name`` line per song, in list order."""
  for position, song in enumerate(songs):
    print(str(position)+": "+ _display_name(song.name))


num_songs = len(musics)
Distance = [[0 for _ in range(num_songs)] for _ in range(num_songs)]
time_distance=0
num_comparisons=0
# NOTE(review): this re-assigns the loader's limit after loading; it only
# acts as an index filter below. Confirm whether a separate name was intended.
max_songs=8
spectrogram_weight=0.4
centroid_weight=0.3
roll_off_weight=0.2
mfcc_weight=0.5
chromatogram_weight=0.1
# One weight per feature column, in the order extract_all stacks them:
# 1 spectrogram row, 1 centroid, 1 roll-off, 3 MFCC rows, 3 chromatogram rows.
# The previous list assumed 3/1/1/3/1, so from the second column on every
# weight was applied to the wrong feature.
feature_weights=([spectrogram_weight, centroid_weight, roll_off_weight]
                 + [mfcc_weight]*3
                 + [chromatogram_weight]*3)
reference_list=['spectrogram', 'centroid', 'roll_off', 'MFCC', 'chromatogram']
num_letter=50
overlap_rate=0.05
words_chosen=25

for song in musics:
  start=time.time()
  song.split_into_pieces(num_letter,overlap_rate)
  end=time.time()
  print(f"Time taken to split the strings is {end - start}")
  song.take_sub_string(words_chosen)
  if song.num_features != len(feature_weights):
    raise ValueError("%d feature weights given for %d feature columns"
                     % (len(feature_weights), song.num_features))

for song1 in musics:
  for song2 in musics:
    if song1 is song2 or song1.index>=max_songs or song2.index>=max_songs:
      continue
    print(_display_name(song1.name) + " Compared with " + _display_name(song2.name))
    start=time.time()
    cost_songs= distance_song(song1, song2, feature_weights)
    # Root mean square over the per-feature costs.
    cost=math.sqrt(np.mean(np.array(cost_songs)**2))
    end=time.time()
    print(f"Total Time taken for this comparison {end - start}")
    time_distance=time_distance+(end-start)
    num_comparisons=num_comparisons+1
    print(f"The distance between the the two songs along the difference matrix is {cost}")
    if(song1.index!=song2.index):
      Distance[song1.index][song2.index]=cost

print(f"Time taken to calculate the distance {time_distance}")
if num_comparisons:
  # Average per comparison actually made (it used to divide by the song count).
  print(f"Average Time taken to calculate one distance {time_distance/num_comparisons}")
print("This is the distance")
print("************")
print("************")
print("************")
_print_song_index(musics)
print('\n'.join('{}: {}'.format(*k) for k in enumerate(Distance)))

# Combine both directions of every pair into one symmetric matrix.
squared = np.array(Distance)**2
Final_Distance= np.sqrt(squared + squared.transpose())
print("final")
_print_song_index(musics)
print(Final_Distance)

Time taken to split the strings is 0.0004146099090576172
Number of words- 25
Number of letters- 50
Number of features- 9
Time taken to split the strings is 0.0008423328399658203
Number of words- 25
Number of letters- 50
Number of features- 9
Time taken to split the strings is 0.0003848075866699219
Number of words- 25
Number of letters- 50
Number of features- 9
Time taken to split the strings is 0.0005519390106201172
Number of words- 25
Number of letters- 50
Number of features- 9
Time taken to split the strings is 0.0005812644958496094
Number of words- 25
Number of letters- 50
Number of features- 9
Time taken to split the strings is 0.0005714893341064453
Number of words- 25
Number of letters- 50
Number of features- 9
Elvis Presley - Marguerita 1963. Compared with Oasis - Whatever (Official Video).


In [None]:
def find_rank(Final_distance):
  """Print, for every song, the closest *other* song in a distance matrix.

  :param Final_distance: square 2-D array; entry ``[r, c]`` is the distance
                         between song ``r`` and song ``c``.
  For each row the smallest off-diagonal value, the full row and the index of
  that value are printed. Nothing is printed when there are fewer than two
  songs (the previous code crashed on ``min`` of an empty row).
  """
  distances = np.asarray(Final_distance)
  num_rows= len(distances)
  if num_rows < 2:
    return
  for rows in range(num_rows):
    Song2 = distances[rows,:].copy()
    # Mask the song itself instead of deleting it and searching the full row
    # by value afterwards: when the nearest neighbour is at distance 0 that
    # search could hit the diagonal first and report the song as its own match.
    others = Song2.astype(float)
    others[rows] = np.inf
    out = int(np.argmin(others))
    element = Song2[out]
    print(element)
    print(Song2)
    print("Song :" +str(rows)+" Plagiarized to song :" +str(out))


def find(element, matrix):
  """Return the position of the first entry of ``matrix`` equal to ``element``.

  :param element: value to look for.
  :param matrix: indexable sequence that is scanned from the front.
  :return: index of the first match, or None when nothing matches.
  """
  position = 0
  total = len(matrix)
  while position < total:
    if matrix[position] == element:
      return position
    position += 1
  return None

In [None]:
# Report the nearest other song for every row of the symmetric distance matrix.
find_rank(Final_Distance)