##PCA of n-grams on CRIM Dataset 

#Installation and Imports

Install music21 and crim-intervals

In [1]:
ABS_PATH = "/content/" # absolute path to my working directory


from intervals import*
from intervals import main_objs

import music21
import os, glob # file I/O
from tqdm import tqdm # status bar for loops
import re # regular expressions
import requests # to download files

import numpy as np # numerical operations
import pandas as pd # to work with tabular data
import intervals as ci # crim intervals

from collections import Counter
from sklearn.decomposition import PCA # dimensionality reduction
from sklearn.cluster import KMeans # clustering

# data visualization
import matplotlib.pyplot as plt # plots
from matplotlib.patches import Circle
import seaborn as sns
sns.set_context("notebook")
w = 10
figsize = (w, w/1.333)

# score rendering
# import verovio
from IPython.core.display import HTML

#Import CRIM Dataset

In [46]:
raw_prefix = "https://raw.githubusercontent.com/CRIM-Project/CRIM-online/master/crim/static/mei/MEI_4.0/"
URL = "https://api.github.com/repos/CRIM-Project/CRIM-online/git/trees/990f5eb3ff1e9623711514d6609da4076257816c"
piece_json = requests.get(URL).json()
piece_list = [raw_prefix + p["path"] for p in piece_json["tree"]]

In [47]:
piece_list[1:5]
piece_list = piece_list[1:5]

In [48]:
d = "data/"
if not os.path.exists(d):
    os.makedirs(d)
    
img = "img/"
if not os.path.exists(img):
    os.makedirs(img)

In [49]:
for piece in tqdm(piece_list):
    filename = piece.split("/")[-1] # only the part after the last '/' is the filename    
    
    if not os.path.isfile(d + filename):
        with open(d + filename, "w") as f:
            r = requests.get(piece)
            f.write(r.text)

100%|███████████████████████████████████| 4/4 [00:00<00:00, 7796.10it/s]


In [50]:
local_files = glob.glob("data/*.mei")

In [51]:
local_files = [ f  for f in local_files if re.match(r".+_\d.mei$", f) ]

In [52]:
local_files = local_files[1:6]

In [53]:
local_files = sorted(local_files)
local_files

['data/CRIM_Mass_0001_3.mei',
 'data/CRIM_Mass_0001_4.mei',
 'data/CRIM_Mass_0003_1.mei',
 'data/CRIM_Mass_0003_4.mei',
 'data/CRIM_Mass_0003_5.mei']

In [24]:
print(local_files[17])
del local_files[17] 

data/CRIM_Mass_0004_3.mei


In [58]:
corpus_list = ['https://crimproject.org/mei/CRIM_Model_0010.mei',
              'https://crimproject.org/mei/CRIM_Model_0011.mei',
             'https://crimproject.org/mei/CRIM_Model_0014.mei', 
              'https://crimproject.org/mei/CRIM_Mass_0008_1.mei',
 'https://crimproject.org/mei/CRIM_Mass_0008_2.mei',
              'https://crimproject.org/mei/CRIM_Mass_0008_3.mei',
    'https://crimproject.org/mei/CRIM_Mass_0008_4.mei',
    'https://crimproject.org/mei/CRIM_Mass_0008_5.mei',
    'https://crimproject.org/mei/CRIM_Mass_0009_1.mei',
 'https://crimproject.org/mei/CRIM_Mass_0009_2.mei',
              'https://crimproject.org/mei/CRIM_Mass_0009_3.mei',
    'https://crimproject.org/mei/CRIM_Mass_0009_4.mei',
    'https://crimproject.org/mei/CRIM_Mass_0009_5.mei',
    'https://crimproject.org/mei/CRIM_Mass_0012_1.mei',
 'https://crimproject.org/mei/CRIM_Mass_0012_2.mei',
              'https://crimproject.org/mei/CRIM_Mass_0012_3.mei',
    'https://crimproject.org/mei/CRIM_Mass_0012_4.mei',
    'https://crimproject.org/mei/CRIM_Mass_0012_5.mei']

In [81]:
len(corpus_list)

18

In [57]:
corpus_list = []
for f in local_files:
    corpus_list.append(f)
corpus_list   

['data/CRIM_Mass_0001_3.mei',
 'data/CRIM_Mass_0001_4.mei',
 'data/CRIM_Mass_0003_1.mei',
 'data/CRIM_Mass_0003_4.mei',
 'data/CRIM_Mass_0003_5.mei']

In [59]:
corpus = ci.CorpusBase(corpus_list) # this takes a while...

Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Model_0010.mei
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Model_0011.mei
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Model_0014.mei
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Mass_0008_1.mei
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Mass_0008_2.mei
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Mass_0008_3.mei
Downloading remote score...
Error downloading https://crimproject.org/mei/CRIM_Mass_0008_4.mei, please check your url and try again. Continuing to next file.
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Mass_0008_5.mei
Downloading remote score...
Successfully imported https://crimproject.org/mei/CRIM_Mass_0009_1.mei
Downloading remote score...
Successfully imported htt

#Explore Dataset 

#Cadences

In [60]:
corpus.scores[0].cadences()

Unnamed: 0,CadType,LeadingTones,CVFs,Low,RelLow,Tone,RelTone,TSig,Measure,Beat,Sounding,Progress,SinceLast,ToNext
80.0,Clausula Vera,0.0,CT,A2,M2,A,M2,4/2,11,1.0,4.0,0.035778,80.0,72.0
152.0,Clausula Vera,1.0,tCT,G2,P1,G,P8,4/2,20,1.0,4.0,0.067979,72.0,40.0
192.0,Evaded Authentic,1.0,Cu,E3,M6,G,P8,4/2,25,1.0,3.0,0.085868,40.0,40.0
232.0,Clausula Vera,1.0,CT,C3,P4,C,P4,4/2,30,1.0,3.0,0.103757,40.0,120.0
352.0,Clausula Vera,0.0,TC,G3,P8,G,P8,4/2,45,1.0,3.0,0.157424,120.0,8.0
360.0,Clausula Vera,0.0,CT,G3,P8,G,P8,4/2,46,1.0,4.0,0.161002,8.0,4.0
364.0,Authentic,1.0,CtB,C3,P4,C,P4,4/2,46,3.0,4.0,0.162791,4.0,8.0
372.0,Authentic,1.0,TCzB,C3,P4,C,P4,4/2,47,3.0,4.0,0.166369,8.0,16.0
388.0,Clausula Vera,1.0,CAT,G3,P8,G,P8,4/2,49,3.0,4.0,0.173524,16.0,112.0
500.0,Authentic,1.0,CuTB,G2,P1,G,P8,4/2,63,3.0,5.0,0.223614,112.0,52.0


In [61]:
cadences = corpus.scores[0].cadences(keep_keys=True)
col_list = ['Measure', 'Beat', 'CadType', 'Pattern', 'Key', 'Tone','LeadingTones', 'CVFs', 'Low','RelLow','RelTone', 'Sounding', 'Progress','SinceLast','ToNext']
cadences = cadences[col_list]
cadences.groupby(['Tone', 'CadType', 'CVFs']).size().reset_index(name='counts')

Unnamed: 0,Tone,CadType,CVFs,counts
0,A,Clausula Vera,CT,2
1,A,Clausula Vera,CTx,1
2,C,Authentic,CtB,1
3,C,Authentic,CuTB,1
4,C,Authentic,TCzB,1
5,C,Authentic,tuCB,1
6,C,Clausula Vera,CT,2
7,C,Double Leading Tone,ACT,1
8,C,Evaded Clausula Vera,tC,1
9,D,Authentic,CTB,1


In [62]:
cadences = corpus.scores[1].cadences(keep_keys=True)
col_list = ['Measure', 'Beat', 'CadType', 'Pattern', 'Key', 'Tone','LeadingTones', 'CVFs', 'Low','RelLow','RelTone', 'Sounding', 'Progress','SinceLast','ToNext']
cadences = cadences[col_list]
cadences.groupby(['Tone', 'CadType', 'CVFs']).size().reset_index(name='counts')

Unnamed: 0,Tone,CadType,CVFs,counts
0,B-,Authentic,CtB,1
1,B-,Clausula Vera,CT,1
2,D,Authentic,CB,1
3,D,Authentic,TCB,1
4,D,Phrygian Clausula Vera,CT,2
5,F,Evaded Authentic,Cb,1
6,G,Abandoned Authentic,zCxx,1
7,G,Authentic,CTB,2
8,G,Authentic,CtB,1
9,G,Clausula Vera,CT,1


#Intervals

In [63]:
# Sort All Intervals by Size and Direction, with counts for each voice

int_order = ["P1", "m2", "M2", "m3", "M3", "P4", "P5", "m6", "M6", "m7", "M7", "P8",
             "-m2", "-M2", "-m3", "-M3", "-P4", "-P5", "-m6", "-M6", "-m7", "-M7", "-P8"]
mel = corpus.scores[0].melodic()
mel = mel.fillna("-")

# count up the values in each item column--sum for each pitch.  
# make a copy to be sure we don't mess up
mel = mel.apply(pd.Series.value_counts).fillna(0).astype(int).reset_index().copy()

# rename the index column to something more useful
mel.rename(columns = {'index':'interval'}, inplace = True)

# apply the categorical list and sort
mel['interval'] = pd.Categorical(mel["interval"], categories=int_order)
mel = mel.sort_values(by = "interval").dropna().copy()
mel.reset_index()

Unnamed: 0,index,interval,Superius,Contratenor,Primus Tenor,Secundus Tenor,Bassus
0,11,P1,38,50,53,56,42
1,16,m2,39,40,53,63,36
2,9,M2,61,89,132,139,89
3,17,m3,14,19,9,20,15
4,10,M3,6,4,6,10,5
5,12,P4,13,9,17,16,18
6,13,P5,3,6,11,8,11
7,14,P8,0,2,3,2,3
8,6,-m2,50,54,59,61,46
9,1,-M2,83,94,156,111,112


In [71]:
!pip uninstall matplotliby

!pip install matplotlib==3.1.3

Collecting matplotlib==3.1.3
  Using cached matplotlib-3.1.3.tar.gz (40.9 MB)
Building wheels for collected packages: matplotlib
  Building wheel for matplotlib (setup.py) ... [?25ldone
[?25h  Created wheel for matplotlib: filename=matplotlib-3.1.3-cp39-cp39-macosx_10_9_x86_64.whl size=8400828 sha256=68dfef6c4392fcce1483fcce96f69d213fd8706fba9933c0a4dc371574f06145
  Stored in directory: /Users/rfreedma/Library/Caches/pip/wheels/88/5f/33/d7b8943eba74fdfbd535c83cefcf366c25b0f9cb6424e763e7
Successfully built matplotlib
Installing collected packages: matplotlib
  Attempting uninstall: matplotlib
    Found existing installation: matplotlib 3.4.3
    Uninstalling matplotlib-3.4.3:
      Successfully uninstalled matplotlib-3.4.3
Successfully installed matplotlib-3.1.3


#Melodic n-grams

In [72]:
#Get list of work names and composers from the corpus

#This returns a dictionary with metadata for each work
#{'title': 'Missa Confitemini: Kyrie', 'composer': 'Pierre Colin'}

# for i in np.arange(219):
#   print(corpus.scores[i].metadata)


In [73]:
#Create a dataframe which gives the melodic n-grams for each voice entrance in the work
#Rows index is the measure number of the entrance of the n-gram in the work

nr = corpus.scores[0].notes(combineUnisons=True)
mel = corpus.scores[0].melodic(df=nr, kind='d', end=False)
ng = corpus.scores[0].ngrams(df=mel, n=3)
mask = corpus.scores[0].entryMask()
result = ng[mask].dropna(how='all').fillna('')
result

Unnamed: 0,Superius,Contratenor,Primus Tenor,Secundus Tenor,Bassus
0.0,"(3, 2, -2)","(3, 3, 2)",,,
8.0,,,"(3, 2, 2)",,
48.0,,,,"(3, 2, -2)","(5, -2, 2)"
94.0,,,"(4, -2, 2)",,
104.0,"(3, -3, -2)",,,,
...,...,...,...,...,...
2164.0,"(-2, 3, -4)",,,,
2176.0,,,,,"(2, 2, 2)"
2186.0,,"(-2, 2, 2)",,,
2214.0,,,,"(-2, -3, 2)",


In [74]:
#Create melodic n-gram dataframes for each work in the corpus

func1 = ImportedPiece.melodic
list_of_dfs = corpus.batch(func=func1, kwargs={'kind': 'd', 'end': False}, metadata=False)
func2 = ImportedPiece.ngrams
list_of_melodic_ngrams = corpus.batch(func=func2, kwargs={'n': 4, 'df': list_of_dfs}, metadata=True)
# title_of_output = pd.concat(list_of_melodic_ngrams)



In [75]:
#create a dataframe that gives the number of each melodic n-grams in the work


list_of_melodic_ngrams[0].value_counts().to_frame()
n_gram_count = list_of_melodic_ngrams[0].stack().value_counts().to_frame()
n_gram_count

#drop the first row of dataframe which contains total number of n-grams
n_gram_count = n_gram_count.iloc[2:, :]

#Convert row indices list into a column
n_gram_count = n_gram_count.reset_index(level=0)


#Rename columns of n_gram_count
n_gram_count.rename(columns = {'index':'n-gram', 0:'n-gram count'}, inplace = True)




print("This is the n-gram count for " + corpus.scores[0].metadata['title'] + " by " + corpus.scores[0].metadata['composer'] + ":")
n_gram_count

This is the n-gram count for Quare fremuerunt gentes by Claudin de Sermisy:


Unnamed: 0,n-gram,n-gram count
0,"(-2, -2, -2, -2)",80
1,"(-2, -2, -2, 2)",75
2,"(2, -2, -2, -2)",59
3,"(2, 2, 2, 2)",55
4,"(2, 2, 2, -2)",45
...,...,...
514,"(-2, -3, 4, 2)",1
515,"(-2, -2, 1, 1)",1
516,"(-3, -3, 5, 2)",1
517,"(8, 1, 1, 1)",1


In [76]:
for a in np.arange(n_gram_count.shape[0]):
  delimiter = ','
  n_gram_count['n-gram'].iloc[a] = delimiter.join(n_gram_count['n-gram'].iloc[a])

n_gram_count



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,n-gram,n-gram count
0,"-2,-2,-2,-2",80
1,"-2,-2,-2,2",75
2,"2,-2,-2,-2",59
3,2222,55
4,"2,2,2,-2",45
...,...,...
514,"-2,-3,4,2",1
515,"-2,-2,1,1",1
516,"-3,-3,5,2",1
517,8111,1


In [84]:
#Create a list of panda dataframes for each work in the corpus


list_of_n_gram_count_dataframes = []

for i in np.arange(16):

  list_of_melodic_ngrams[i].value_counts().to_frame()
  n_gram_count = list_of_melodic_ngrams[i].stack().value_counts().to_frame()
  n_gram_count

  #drop the first row of dataframe which contains total number of n-grams
  n_gram_count = n_gram_count.iloc[2:, :]

  #Convert row indices list into a column
  n_gram_count = n_gram_count.reset_index(level=0)

  #Rename columns of n_gram_count
  n_gram_count.rename(columns = {'index':'n-gram', 0:'n-gram count'}, inplace = True)

  list_of_n_gram_count_dataframes.append(n_gram_count)

In [79]:
#use outer merge to get list of unique values in list_of_n_gram_count_dataframes

all_n_grams = pd.concat(list_of_n_gram_count_dataframes, join='outer', axis=0).fillna(0)


all_n_grams
#List of all unique n-grams in the corpus
all_n_grams['n-gram']


# #Convert the n-gram column to a string object
# all_n_grams['n-gram'] = all_n_grams['n-gram'].astype('string')   

all_n_grams['n-gram']
#(this will be used as the columns in the bag of words matrix)



#convert these tuple of strings into string for column names....

for i in np.arange(75431):
  delimiter = ','
  all_n_grams['n-gram'].iloc[i] = delimiter.join(all_n_grams['n-gram'].iloc[i])


# all_n_grams['n-gram']       

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


IndexError: single positional indexer is out-of-bounds

In [80]:
#Create list of works in the corpus

list_of_works_in_corpus = []


for i in np.arange(219):
  list_of_works_in_corpus.append(corpus.scores[i].metadata['title'] + " by " + corpus.scores[i].metadata['composer'])


IndexError: list index out of range

In [None]:
#Build a bag of words matrix... rows = individual work in corpus
#Columns are the list of all n-grams
#Entries are the frequency of each n-gram in the work

#Create empty bag of words matrix with rows as works and columns as all n-grams
bag_of_words_matrix = pd.DataFrame(columns = all_n_grams['n-gram'], index = list_of_works_in_corpus )
bag_of_words_matrix.fillna(0)


In [None]:
#n-gram count dataframe for each work in corpus

for i in np.arange(219):
  # (Get matrix of total n-gram count for ith work in dataset)

  list_of_melodic_ngrams[i].value_counts().to_frame()
  n_gram_count = list_of_melodic_ngrams[i].stack().value_counts().to_frame()
  #drop the first row of dataframe which contains total number of n-grams
  n_gram_count = n_gram_count.iloc[2:, :]
  #Convert row indices list into a column
  n_gram_count = n_gram_count.reset_index(level=0)
  #Rename columns of n_gram_count
  n_gram_count.rename(columns = {'index':'n-gram', 0:'n-gram count'}, inplace = True)

  #convert the tuples of strings to a single string
  for a in np.arange(n_gram_count.shape[0]):
    delimiter = ','
    n_gram_count['n-gram'].iloc[a] = delimiter.join(n_gram_count['n-gram'].iloc[a])


#for the ith row in bag_of_words_matrix, add the corresponding values from the ith n_gram_count
#go through the rows of the ith n_gram_count and add the n-gram count to the same n-gram in the bag_of_words_matrix

  for j in np.arange(n_gram_count['n-gram'].shape[0]):      #go through length of n_gram_count for ith work
    #for each n-gram in n_gram_count, add the corresponding value to the ith column of the bag_of_words_matrix

    bag_of_words_matrix.loc[list_of_works_in_corpus[i], n_gram_count['n-gram'].iloc[j]] = n_gram_count['n-gram count'].iloc[j]  
    
  

In [None]:
bag_of_words_matrix = bag_of_words_matrix.fillna(0)
bag_of_words_matrix


In [None]:
bag_of_words_matrix.index

#Run PCA on n-gram dataset

-Run PCA and K-means clustering to find clusters of works that are closely related in terms of the n-grams

In [None]:
plt.scatter(bag_of_words_matrix.iloc[:,0], bag_of_words_matrix.iloc[:,1])

plt.xlabel(f"Number of '-2,-2,-2,2' n-grams")
plt.ylabel(f"Number of '2,-2,-2,-2' n-grams")



In [None]:
#Standardize data before running PCA
from sklearn.preprocessing import StandardScaler

bag_of_words_matrix_scaled = pd.DataFrame(StandardScaler().fit_transform(bag_of_words_matrix),columns = bag_of_words_matrix.columns)
bag_of_words_matrix_scaled

In [None]:
series = pd.Series(np.arange(bag_of_words_matrix_scaled.shape[0])) 
series = series.astype(int)

#Define labeling function for graphs

def label_point(x, y, val, ax):
    a = pd.concat({'x': x, 'y': y, 'val': val}, axis=1)
    for i, point in a.iterrows():
        ax.text(point['x']+.02, point['y'], str(point['val']))

In [None]:
#Run PCA
from sklearn.decomposition import PCA
pca  = PCA(n_components=2)

pca_components = pca.fit_transform(bag_of_words_matrix_scaled)
print(pca_components.shape)

In [None]:
#creating a new dataframe including target
ngram_pca = pd.DataFrame(np.hstack((np.array(bag_of_words_matrix_scaled.index).reshape(-1,1), pca_components)),columns=['Work','1st_component','2nd_component'])
ngram_pca.head()

In [None]:
#Create PCA Plot
import seaborn as sns
sns.scatterplot(ngram_pca['1st_component'],ngram_pca['2nd_component'])
label_point(ngram_pca['1st_component'], ngram_pca['2nd_component'],series, plt.gca()) 
plt.title('Scatter-plot')
plt.show()

In [None]:
#Create labeled plot of PCA
sns.set(font_scale=.5)
sns.scatterplot(ngram_pca['1st_component'],ngram_pca['2nd_component'])
plt.title('Scatter-plot')
label_point(ngram_pca['1st_component'], ngram_pca['2nd_component'],series, plt.gca()) 
plt.savefig("n-gram_PCA.png")
plt.show()

#K-means Clustering

In [None]:
#Use Silhouette scores to determine the optimal number of K clusters to use
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import silhouette_score


# candidate values for our number of cluster
parameters = [2, 3, 4, 5]
# instantiating ParameterGrid, pass number of clusters as input
parameter_grid = ParameterGrid({'n_clusters': parameters})
best_score = -1
kmeans_model = KMeans()     # instantiating KMeans model
silhouette_scores = []
# evaluation based on silhouette_score
for p in parameter_grid:
    kmeans_model.set_params(**p)    # set current hyper parameter
    kmeans_model.fit(bag_of_words_matrix_scaled)          # fit model on wine dataset, this will find clusters based on parameter p
    ss = silhouette_score(ngram_pca, kmeans_model.labels_)   # calculate silhouette_score
    silhouette_scores += [ss]       # store all the scores
    print('Parameter:', p, 'Score', ss)
    # check p which has the best score
    if ss > best_score:
        best_score = ss
        best_grid = p
# plotting silhouette score
plt.bar(range(len(silhouette_scores)), list(silhouette_scores), align='center', color='#722f59', width=0.5)
plt.xticks(range(len(silhouette_scores)), list(parameters))
plt.title('Silhouette Score', fontweight='bold')
plt.xlabel('Number of Clusters')
plt.ylabel('Silhouette Scores')
plt.show()



In [None]:
# fitting KMeans 
from sklearn.cluster import KMeans

K = 3

KMeans = KMeans(n_clusters=K)
KMeans.fit(ngram_pca)

In [None]:
KMeans.cluster_centers_


In [None]:
cluster_clrs = ['yellow','purple','red']

In [None]:
#predict the labels of clusters.
label = KMeans.fit_predict(ngram_pca)
label = pd.DataFrame(label)

In [None]:
ngram_pca.append(label, ignore_index=True)
ngram_pca['label'] = label
ngram_pca

In [None]:
#Plot the clusters


for i in np.arange(ngram_pca.shape[0]):
  if ngram_pca.iloc[i,3] == 0:
    plt.scatter(ngram_pca.iloc[i,1], ngram_pca.iloc[i,2], s=10, c='red', label ='Cluster 1')
  if ngram_pca.iloc[i,3] == 1:
    plt.scatter(ngram_pca.iloc[i,1], ngram_pca.iloc[i,2], s=10, c='blue', label ='Cluster 2')
  if ngram_pca.iloc[i,3] == 2:
    plt.scatter(ngram_pca.iloc[i,1], ngram_pca.iloc[i,2], s=10, c='green', label ='Cluster 3')

plt.title('K-mean Clustering for n-gram PCA')
label_point(ngram_pca['1st_component'], ngram_pca['2nd_component'],series, plt.gca()) 
plt.savefig("K-mean Clustering n-gram_PCA.png")
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')

plt.figure(figsize=(20, 20))


plt.show()


#Gaussian Mixture Model

In [None]:
# import numpy as np
# import matplotlib.pyplot as plt
# from matplotlib.colors import LogNorm
# from sklearn import mixture




#*** Need more RAM to run this.... ***



# # fit a Gaussian Mixture Model with two components
# clf = mixture.GaussianMixture(n_components=2, covariance_type="full")
# clf.fit(bag_of_words_matrix_scaled)

# # display predicted scores by the model as a contour plot
# x = np.linspace(-100, 100)
# y = np.linspace(-100, 100)
# X, Y = np.meshgrid(x, y)
# XX = np.array([X.ravel(), Y.ravel()]).T
# Z = -clf.score_samples(XX)
# Z = Z.reshape(X.shape)

# CS = plt.contour(
#     X, Y, Z, norm=LogNorm(vmin=1.0, vmax=1000.0), levels=np.logspace(0, 3, 10)
# )

# CB = plt.colorbar(CS, shrink=0.8, extend="both")
# plt.scatter(bag_of_words_matrix_scaled[:, 0], bag_of_words_matrix_scaled[:, 1], 0.8)

# plt.title("Negative log-likelihood predicted by a GMM")
# plt.axis("tight")
# plt.show()



