In [54]:
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pickle
import os
import re

## Конечная классификации по жанрам

In [55]:
def load_from_json(filename):
    """Load data from a JSON file."""
    if os.path.exists(filename):
        with open(filename, 'r') as json_file:
            return json.load(json_file)
    else:
        return {}

In [56]:
df_tracks = pd.read_pickle('tracks_df.pickle')

In [60]:
eng_tracks = load_from_json('lyrics_eng_fin.json')
len(eng_tracks)

32729

In [57]:
other_tracks = load_from_json('lyrics_other_fin.json')
len(other_tracks)

6048

In [61]:
df_tracks.head()

Unnamed: 0,album_type,album_id,album_name,album_release_data,album_artists_names,album_artists_ids,album_total_tracks,track_artists_names,track_artists_ids,track_name,...,id,uri,track_href,analysis_url,duration_ms,time_signature,playlist_id,genre,total_followers,total_tracks
0,album,6Kssm2LosQ0WyLukFZkEG5,Demi,2013-01-01,Demi Lovato,6S2OmqARrzebs0tKUEyXyp,13,Demi Lovato,6S2OmqARrzebs0tKUEyXyp,Heart Attack,...,1V6gIisPpYqgFeWbMLI0bA,spotify:track:1V6gIisPpYqgFeWbMLI0bA,https://api.spotify.com/v1/tracks/1V6gIisPpYqg...,https://api.spotify.com/v1/audio-analysis/1V6g...,210840,4,6gS3HhOiI17QNojjPuPzqc,pop,22857.0,428
1,single,65L5VcKGKe6sOJIPNFc0Q1,One in a Million,2023-08-04,Bebe Rexha:artist_custom_separator:David Guetta,64M6ah0SkkRsnPGtGiRAbb:artist_custom_separator...,1,Bebe Rexha:artist_custom_separator:David Guetta,64M6ah0SkkRsnPGtGiRAbb:artist_custom_separator...,One in a Million,...,3YfGTvsTAWGC2lgoDOikUz,spotify:track:3YfGTvsTAWGC2lgoDOikUz,https://api.spotify.com/v1/tracks/3YfGTvsTAWGC...,https://api.spotify.com/v1/audio-analysis/3YfG...,160530,4,6gS3HhOiI17QNojjPuPzqc,pop,22857.0,428
2,single,5Q5PomrTdI13qpmEeM393B,On My Love (Sped Up),2023-09-12,Zara Larsson:artist_custom_separator:David Guetta,1Xylc3o4UrD53lo9CvFvVg:artist_custom_separator...,4,Zara Larsson:artist_custom_separator:David Guetta,1Xylc3o4UrD53lo9CvFvVg:artist_custom_separator...,On My Love,...,0K6iKgHPPhAb4Acmg9aD77,spotify:track:0K6iKgHPPhAb4Acmg9aD77,https://api.spotify.com/v1/tracks/0K6iKgHPPhAb...,https://api.spotify.com/v1/audio-analysis/0K6i...,222912,4,6gS3HhOiI17QNojjPuPzqc,pop,22857.0,428
3,album,6fpLLJsDSSAlToEDW2jv4F,Animal (Expanded Edition),2010-01-01,Kesha,6LqNN22kT3074XbTVUrhzX,18,Kesha,6LqNN22kT3074XbTVUrhzX,TiK ToK,...,0HPD5WQqrq7wPWR7P7Dw1i,spotify:track:0HPD5WQqrq7wPWR7P7Dw1i,https://api.spotify.com/v1/tracks/0HPD5WQqrq7w...,https://api.spotify.com/v1/audio-analysis/0HPD...,199693,4,6gS3HhOiI17QNojjPuPzqc,pop,22857.0,428
4,album,20Ol6zZ0nLlc5EGTH1zA0j,Delirium (Deluxe),2015-11-06,Ellie Goulding,0X2BH1fck6amBIoJhDVmmJ,22,Ellie Goulding,0X2BH1fck6amBIoJhDVmmJ,"Love Me Like You Do - From ""Fifty Shades Of Grey""",...,3zHq9ouUJQFQRf3cm1rRLu,spotify:track:3zHq9ouUJQFQRf3cm1rRLu,https://api.spotify.com/v1/tracks/3zHq9ouUJQFQ...,https://api.spotify.com/v1/audio-analysis/3zHq...,252534,4,6gS3HhOiI17QNojjPuPzqc,pop,22857.0,428


In [70]:
df_tracks.shape

(145732, 39)

In [71]:
df_tracks.drop_duplicates('id', inplace=True)

In [73]:
df_tracks.shape

(126997, 39)

In [63]:
df_tracks['lyrics_eng'] = df_tracks['id'].map(lambda x: eng_tracks.get(x, {}).get('lyrics'))
df_tracks['lyrics_other'] = df_tracks['id'].map(lambda x: other_tracks.get(x, {}).get('lyrics'))

In [72]:
df_tracks.genre.nunique()

1477

In [74]:
def count_tracks(series):
    with_lyrics = series.count()
    n_tracks = len(series)
    return pd.Series({'with_lyrics': with_lyrics})

result = df_tracks.groupby('genre').agg(
    lyrics_eng=('lyrics_eng', count_tracks),
    lyrics_other=('lyrics_other', count_tracks),
    n_songs=('genre', 'size')
).reset_index()

In [75]:
result.head()

Unnamed: 0,genre,lyrics_eng,lyrics_other,n_songs
0,5th wave emo,36,0,51
1,aarhus indie,16,7,97
2,abstract hip hop,35,0,64
3,acid rock,28,3,85
4,acoustic pop,63,0,80


In [76]:
result.genre.nunique(), result.n_songs.sum()

(1477, 126997)

Дропаем `def_not_english`

In [77]:
def_not_english = result[(result['lyrics_eng'] == 0) & (result['lyrics_other'] >= 0)]
print(def_not_english['genre'].nunique(), def_not_english['n_songs'].sum())

130 11997


In [78]:
filt_genres = result.query('not (lyrics_eng == 0 and lyrics_other >= 0)')
filt_genres.shape

(1347, 4)

**Вопрос1:** что делать с `mainly_not_english`

In [79]:
mainly_not_english = filt_genres[filt_genres['lyrics_eng'] < filt_genres['lyrics_other']].sort_values('lyrics_other', ascending=False)
mainly_not_english['lyrics_eng'].sum()

616

In [81]:
mainly_not_english.shape

(174, 4)

С этим мы работаем `eng_genres`

In [82]:
eng_genres = filt_genres.query('not lyrics_eng < lyrics_other')
eng_genres.sort_values(['lyrics_eng','n_songs'], ascending=False).head(50)

Unnamed: 0,genre,lyrics_eng,lyrics_other,n_songs
1025,pop,100,0,100
1116,rap,100,0,100
1179,rock,99,0,100
1256,singer-songwriter pop,96,0,100
269,christian alternative rock,88,0,99
1067,pov: indie,88,2,92
862,modern rock,88,1,89
320,contemporary country,87,0,100
615,hyperpop,87,0,98
26,alternative metal,87,0,87


**Вопрос2:** по разбиению жанров, я предлагаю 10 ниже, которые указаны ниже (hip hop в один сольется).


И добрать песен еще, чтоб +/- равно было.

Можно убрать 'folk',
  'country',
  'soul'

In [88]:
from collections import Counter
word_counts = Counter(' '.join(eng_genres['genre']).lower().split())

word_counts_df = pd.DataFrame(list(word_counts.items()), columns=['word', 'count']).sort_values('count', ascending=False).head(15)

genres = word_counts_df.word.tolist()
genres = [g for g in genres if g not in ['black', 'canadian', 'alternative', 'deep']]

In [89]:
genres, len(genres)

(['indie',
  'metal',
  'rock',
  'pop',
  'hip',
  'hop',
  'punk',
  'rap',
  'folk',
  'country',
  'soul'],
 11)

In [90]:
def create_final_genre(genre):
    final_genre = [word for word in genres if word in genre.lower()]
    if 'hip' in final_genre and 'hop' in final_genre:
        final_genre.remove('hip')
        final_genre.remove('hop')
        final_genre.append('hip hop')
    return final_genre

eng_genres['final_genre'] = eng_genres['genre'].apply(create_final_genre)
eng_genres.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  eng_genres['final_genre'] = eng_genres['genre'].apply(create_final_genre)


(1173, 6)

In [91]:
eng_genres_filtered = eng_genres[eng_genres['final_genre'].apply(lambda x: len(x) > 0)]
eng_genres_filtered.shape

(1102, 6)

In [92]:
one_genre = eng_genres_filtered[eng_genres_filtered['final_genre'].apply(lambda x: len(x) == 1)]
one_genre['final_genre'] = one_genre['final_genre'].apply(tuple)
one_genre

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  one_genre['final_genre'] = one_genre['final_genre'].apply(tuple)


Unnamed: 0,genre,lyrics_eng,lyrics_other,n_songs,norm,final_genre
1,aarhus indie,16,7,97,16.494845,"(indie,)"
2,abstract hip hop,35,0,64,54.687500,"(hip hop,)"
3,acid rock,28,3,85,32.941176,"(rock,)"
4,acoustic pop,63,0,80,78.750000,"(pop,)"
5,acoustic punk,18,0,53,33.962264,"(punk,)"
...,...,...,...,...,...,...
1472,yacht rock,41,1,73,56.164384,"(rock,)"
1473,york indie,24,0,91,26.373626,"(indie,)"
1474,yorkshire folk,8,0,72,11.111111,"(folk,)"
1475,zim hip hop,10,3,100,10.000000,"(hip hop,)"


In [93]:
grouped_one = one_genre.groupby('final_genre')['lyrics_eng'].sum().reset_index()
grouped_one.rename({'lyrics_eng': 'count'}, axis=1, inplace=True)
grouped_one['percentage'] = grouped_one['count'] / grouped_one['count'].sum() * 100
grouped_one.sort_values('percentage', ascending=False)

Unnamed: 0,final_genre,count,percentage
3,"(indie,)",5198,18.92315
5,"(pop,)",4735,17.237613
4,"(metal,)",4567,16.626015
8,"(rock,)",3387,12.330263
7,"(rap,)",2914,10.608322
2,"(hip hop,)",2694,9.807419
6,"(punk,)",1757,6.396301
0,"(country,)",762,2.774036
9,"(soul,)",740,2.693946
1,"(folk,)",715,2.602934


**Вопрос2** что делать с `mul_genre`

In [94]:
mul_genre = eng_genres_filtered[eng_genres_filtered['final_genre'].apply(lambda x: len(x) > 1)]

In [95]:
mul_genre['final_genre'] = mul_genre['final_genre'].apply(tuple)
grouped_mul = mul_genre.groupby('final_genre')['lyrics_eng'].sum().reset_index()
grouped_mul.rename({'lyrics_eng': 'count'}, axis=1, inplace=True)
grouped_mul['percentage'] = grouped_mul['count'] / grouped_mul['count'].sum() * 100
grouped_mul.sort_values('percentage', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mul_genre['final_genre'] = mul_genre['final_genre'].apply(tuple)


Unnamed: 0,final_genre,count,percentage
7,"(indie, rock)",444,16.075308
3,"(indie, pop)",343,12.418537
15,"(pop, punk)",308,11.15134
1,"(indie, folk)",269,9.739319
12,"(pop, country)",185,6.698045
24,"(rock, pop)",121,4.380883
22,"(rock, folk)",115,4.16365
11,"(metal, rap)",92,3.33092
4,"(indie, pop, rap)",73,2.643012
20,"(rap, soul)",73,2.643012


In [None]:
grouped_one = one_genre.groupby('final_genre')['lyrics_eng'].sum().reset_index()
grouped_one.rename({'lyrics_eng': 'count'}, axis=1, inplace=True)
grouped_one['percentage'] = grouped_one['count'] / grouped_one['count'].sum() * 100
grouped_one.sort_values('percentage', ascending=False)

Unnamed: 0,final_genre,count,percentage
5,"(pop,)",4971,20.299739
4,"(metal,)",3971,16.216106
8,"(rock,)",3741,15.27687
7,"(rap,)",3010,12.291735
3,"(indie,)",2672,10.911467
2,"(hip hop,)",2589,10.572525
6,"(punk,)",1248,5.096374
0,"(country,)",910,3.716106
9,"(soul,)",787,3.213819
1,"(folk,)",589,2.40526


In [None]:
mul_genre['final_genre'] = mul_genre['final_genre'].apply(tuple)
grouped_mul = mul_genre.groupby('final_genre')['lyrics_eng'].sum().reset_index()
grouped_mul.rename({'lyrics_eng': 'count'}, axis=1, inplace=True)
grouped_mul['percentage'] = grouped_mul['count'] / grouped_mul['count'].sum() * 100
grouped_mul.sort_values('percentage', ascending=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mul_genre['final_genre'] = mul_genre['final_genre'].apply(tuple)


Unnamed: 0,final_genre,count,percentage
6,"(indie, rock)",359,12.04698
13,"(pop, punk)",335,11.241611
2,"(indie, pop)",326,10.939597
0,"(indie, folk)",257,8.624161
10,"(pop, country)",199,6.677852
20,"(rock, folk)",147,4.932886
22,"(rock, pop)",146,4.899329
19,"(rock, country)",143,4.798658
9,"(metal, rap)",102,3.422819
14,"(pop, rap)",98,3.288591
