In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the listening records from "lastfm.csv" and preview the first 15 rows.
data = pd.read_csv("lastfm.csv")
data.head(15)

Unnamed: 0,user,artist,sex,country
0,1,red hot chili peppers,f,Germany
1,1,the black dahlia murder,f,Germany
2,1,goldfrapp,f,Germany
3,1,dropkick murphys,f,Germany
4,1,le tigre,f,Germany
5,1,schandmaul,f,Germany
6,1,edguy,f,Germany
7,1,jack johnson,f,Germany
8,1,eluveitie,f,Germany
9,1,the killers,f,Germany


In [3]:
# Pick countries with enough data: the 15 countries with the most records.
# value_counts() already returns the counts in descending order, so the
# additional sort_values(ascending=False) pass was redundant.
data["country"].value_counts().head(15)

United States         59558
United Kingdom        27638
Germany               24251
Poland                17111
Sweden                12379
Brazil                11922
Russian Federation    11676
Finland               10157
Netherlands            9673
Spain                  9322
Canada                 6928
Australia              6407
France                 5962
Italy                  5717
Turkey                 4762
Name: country, dtype: int64

Выбранные страны: Германия, Нидерланды, Австралия. Для проведения дальнейшего анализа подготовим данные. В результате подготовки получим DataFrame с информацией о поле и стране пользователя, а также столбцами, соответствующими различным музыкальным коллективам. Если пользователь слушает музыкальный коллектив, то в ячейке будет значение 1, иначе 0.

## Группировка и бинаризация

In [4]:
# Collapse the records to one row per (user, sex, country) and join that
# user's artists into a single ';'-separated string.
groupped = (
    data
    .groupby(["user", "sex", "country"])["artist"]
    .apply(";".join)
)
groupped

user   sex  country       
1      f    Germany           red hot chili peppers;the black dahlia murder;...
3      m    United States     devendra banhart;boards of canada;cocorosie;ap...
4      m    United Kingdom    tv on the radio;tool;kyuss;dj shadow;air;a tri...
5      m    Finland           dream theater;ac/dc;metallica;iron maiden;bob ...
6      m    Portugal          lily allen;kanye west;sigur rós;pink floyd;ste...
                                                    ...                        
19713  m    Ukraine           armin van buuren;above & beyond;atb;ferry cors...
19714  m    United Kingdom    misfits;type o negative;arch enemy;red hot chi...
19715  m    United Kingdom    abba;james blunt;jason mraz;amy winehouse;quee...
19717  m    Brazil            marilyn manson;beyoncé;madonna;t.a.t.u.;katy p...
19718  f    Canada            beirut;of montreal;black flag;the new pornogra...
Name: artist, Length: 15000, dtype: object

In [5]:
# Split the ';'-separated artist strings into one 0/1 indicator column per
# artist, then move `sex` and `country` out of the index into regular columns
# (only `user` stays in the index).
binary_data = (
    groupped
    .str.get_dummies(sep=";")
    .reset_index(level=["sex", "country"])
)
# binary_data can now be filtered by country or by sex.
binary_data.iloc[:10, :10]

Unnamed: 0_level_0,sex,country,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,f,Germany,0,0,0,0,0,0,0,0
3,m,United States,0,0,0,0,0,0,0,0
4,m,United Kingdom,0,0,0,0,0,0,0,0
5,m,Finland,0,0,0,0,0,0,0,0
6,m,Portugal,0,0,0,0,0,0,0,0
7,m,Finland,0,0,0,0,0,0,0,0
9,m,United States,0,0,0,0,0,0,0,0
12,m,Italy,0,0,0,0,0,0,0,0
13,m,Austria,0,0,0,0,0,0,0,0
14,m,"Korea, Republic of",0,0,0,0,0,0,0,0


In [6]:
# Summary statistics of the number of artists each user listens to.
# The row totals are computed with the vectorised DataFrame.sum(axis=1)
# instead of calling np.sum once per row through apply().
binary_data.drop(columns=["sex", "country"]).sum(axis=1).describe()

count    15000.000000
mean        19.330200
std         10.500698
min          1.000000
25%         11.000000
50%         19.000000
75%         27.000000
max         76.000000
dtype: float64

## Группы

In [7]:
# Users from the three selected countries: Germany, the Netherlands and
# Australia (the earlier comment named Finland, which the filter never selected).
selectedcountry = binary_data[binary_data["country"].isin(["Germany", "Netherlands", "Australia"])]
# Preview only a 10x10 corner of the frame.
selectedcountry.iloc[:10, :10]

Unnamed: 0_level_0,sex,country,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,f,Germany,0,0,0,0,0,0,0,0
31,f,Netherlands,0,0,0,0,0,0,0,0
33,f,Germany,0,0,0,0,0,0,0,0
40,f,Australia,0,0,0,0,0,0,0,0
42,f,Germany,0,0,0,0,0,0,0,0
51,f,Germany,0,0,0,0,0,0,0,0
62,f,Germany,0,0,0,1,0,0,0,0
75,m,Germany,0,0,0,1,0,0,0,0
130,m,Germany,0,0,0,0,0,0,0,0
141,m,Germany,0,0,0,0,0,0,0,0


In [8]:
# Number of selected users per sex.
selectedcountry["sex"].value_counts()

m    1520
f     494
Name: sex, dtype: int64

In [9]:
# Separate frames for women and for men.
selectedcountry_f = selectedcountry.loc[selectedcountry["sex"] == "f"]
selectedcountry_m = selectedcountry.loc[selectedcountry["sex"] == "m"]
selectedcountry_m.iloc[:10, :10]

Unnamed: 0_level_0,sex,country,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
75,m,Germany,0,0,0,1,0,0,0,0
130,m,Germany,0,0,0,0,0,0,0,0
141,m,Germany,0,0,0,0,0,0,0,0
144,m,Germany,0,0,0,0,0,0,0,0
150,m,Germany,0,0,0,0,0,0,0,0
173,m,Australia,0,0,0,0,0,0,0,0
184,m,Netherlands,0,0,0,0,0,0,0,0
247,m,Germany,0,0,0,0,0,0,0,0
248,m,Netherlands,0,0,0,0,0,0,0,0
249,m,Netherlands,0,0,0,0,0,0,0,0


In [10]:
# Separate frames for Germany, the Netherlands and Australia.
germany = selectedcountry.loc[selectedcountry["country"] == "Germany"]
netherlands = selectedcountry.loc[selectedcountry["country"] == "Netherlands"]
australia = selectedcountry.loc[selectedcountry["country"] == "Australia"]
australia.iloc[:10, :10]

Unnamed: 0_level_0,sex,country,...and you will know us by the trail of dead,2pac,3 doors down,30 seconds to mars,311,36 crazyfists,44,50 cent
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
40,f,Australia,0,0,0,0,0,0,0,0
173,m,Australia,0,0,0,0,0,0,0,0
499,m,Australia,0,0,0,0,0,0,0,0
879,m,Australia,0,0,1,1,0,0,0,0
894,m,Australia,0,0,0,0,0,0,0,0
938,m,Australia,0,0,0,0,0,0,0,0
1066,m,Australia,0,0,0,0,0,0,0,0
1164,f,Australia,0,0,0,0,0,0,0,0
1419,m,Australia,0,0,0,0,0,0,0,0
1442,f,Australia,0,0,0,0,0,0,0,0


In [11]:
# Number of users from Germany per sex.
germany["sex"].value_counts()

m    938
f    319
Name: sex, dtype: int64

In [12]:
# Number of users from the Netherlands per sex.
netherlands["sex"].value_counts()

m    383
f     84
Name: sex, dtype: int64

In [13]:
# Number of users from Australia per sex.
australia["sex"].value_counts()

m    199
f     91
Name: sex, dtype: int64

## Характерные комбинации музыкальных коллективов

In [14]:
# !pip install mlxtend
import mlxtend.frequent_patterns as ml

In [15]:
# Frequent itemsets (support >= 0.05) over all users from the three countries.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_selectedcountry = ml.apriori(
    selectedcountry.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_selectedcountry))
# Ten itemsets with the highest support.
freq_selectedcountry.sort_values(by="support", ascending=False).head(10)

Найдено 75 характерных комбинаций


Unnamed: 0,support,itemsets
11,0.167825,(coldplay)
55,0.151936,(red hot chili peppers)
34,0.141509,(linkin park)
52,0.133565,(radiohead)
63,0.132075,(the beatles)
17,0.122642,(die Ärzte)
54,0.121152,(rammstein)
42,0.112214,(muse)
61,0.111718,(system of a down)
66,0.107746,(the killers)


In [16]:
# Frequent itemsets (support >= 0.05) for women.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_selectedcountry_f = ml.apriori(
    selectedcountry_f.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_selectedcountry_f))
# Ten itemsets with the highest support.
freq_selectedcountry_f.sort_values(by="support", ascending=False).head(10)

Найдено 121 характерных комбинаций


Unnamed: 0,support,itemsets
19,0.208502,(coldplay)
84,0.17004,(red hot chili peppers)
54,0.161943,(linkin park)
64,0.161943,(muse)
100,0.159919,(the kooks)
29,0.145749,(evanescence)
99,0.145749,(the killers)
78,0.139676,(placebo)
96,0.137652,(the beatles)
27,0.133603,(die Ärzte)


In [17]:
# Frequent itemsets (support >= 0.05) for men.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_selectedcountry_m = ml.apriori(
    selectedcountry_m.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_selectedcountry_m))
# Ten itemsets with the highest support.
freq_selectedcountry_m.sort_values(by="support", ascending=False).head(10)

Найдено 70 характерных комбинаций


Unnamed: 0,support,itemsets
11,0.154605,(coldplay)
51,0.146053,(red hot chili peppers)
48,0.138158,(radiohead)
32,0.134868,(linkin park)
59,0.130263,(the beatles)
50,0.127632,(rammstein)
35,0.121711,(metallica)
16,0.119079,(die Ärzte)
57,0.118421,(system of a down)
19,0.106579,(foo fighters)


## Сопоставление характерных комбинаций для двух выборок по полу

In [18]:
# Itemsets that are frequent for both women and men.
gen = set(freq_selectedcountry_f["itemsets"]) & set(freq_selectedcountry_m["itemsets"])
print("Совпадают %d комбинации:" % len(gen))
# Show every itemset as a plain set.
print(list(map(set, gen)))

Совпадают 45 комбинации:
[{'the white stripes'}, {'koЯn'}, {'radiohead'}, {'die toten hosen'}, {'coldplay'}, {'depeche mode'}, {'rammstein'}, {'foo fighters'}, {'portishead'}, {'red hot chili peppers'}, {'the kooks'}, {'air'}, {'system of a down'}, {'johnny cash'}, {'kanye west'}, {'die Ärzte'}, {'nine inch nails'}, {'placebo'}, {'the cure'}, {'beatsteaks'}, {'bloc party'}, {'billy talent'}, {'nightwish'}, {'blink-182'}, {'kings of leon'}, {'death cab for cutie'}, {'amy winehouse'}, {'evanescence'}, {'oasis'}, {'the beatles'}, {'daft punk'}, {'linkin park'}, {'metallica'}, {'the killers'}, {'arctic monkeys'}, {'the offspring'}, {'snow patrol'}, {'green day'}, {'incubus'}, {'jack johnson'}, {'muse'}, {'rise against'}, {'bullet for my valentine'}, {'nirvana'}, {'queens of the stone age'}]


In [19]:
# Itemsets that are frequent for women but not for men.
gen = set(freq_selectedcountry_f["itemsets"]) - set(freq_selectedcountry_m["itemsets"])
print("Отличаются %d комбинаций" % len(gen))
print("Женщины слушают:", list(map(set, gen)))

Отличаются 76 комбинаций
Женщины слушают: [{'tegan and sara'}, {'the all-american rejects'}, {'björk'}, {'fall out boy'}, {'the kooks', 'coldplay'}, {'justin timberlake'}, {'joy division'}, {'maria mena'}, {'franz ferdinand'}, {'damien rice'}, {'sigur rós'}, {'red hot chili peppers', 'coldplay'}, {'kate nash'}, {'subway to sally'}, {'papa roach'}, {'good charlotte'}, {'mika'}, {'elliott smith'}, {'marilyn manson'}, {'rihanna'}, {'linkin park', 'evanescence'}, {'belle and sebastian'}, {'peter fox'}, {'the fray'}, {'maxïmo park'}, {'david bowie'}, {'deichkind'}, {'lily allen'}, {'coldplay', 'muse'}, {'simple plan'}, {'feist'}, {'norah jones'}, {'my chemical romance'}, {'beyoncé'}, {'the wombats'}, {'mgmt'}, {'schandmaul'}, {'[unknown]'}, {'kaiser chiefs'}, {'30 seconds to mars'}, {'arctic monkeys', 'coldplay'}, {'keane'}, {'the smashing pumpkins'}, {'tori amos'}, {'him'}, {'regina spektor'}, {'the kooks', 'jack johnson'}, {'clueso'}, {'lady gaga'}, {'modest mouse'}, {'paramore'}, {'kelly

In [20]:
# Itemsets that are frequent for men but not for women.
gen = set(freq_selectedcountry_m["itemsets"]) - set(freq_selectedcountry_f["itemsets"])
print("Отличаются %d комбинаций" % len(gen))
print("Мужчины слушают:", list(map(set, gen)))

Отличаются 25 комбинаций
Мужчины слушают: [{'jimi hendrix'}, {'u2'}, {'disturbed'}, {'iron maiden'}, {'madonna'}, {'the rolling stones'}, {'moby'}, {'massive attack'}, {'slipknot'}, {'michael jackson'}, {'tool'}, {'tenacious d'}, {'beck'}, {'pink floyd'}, {'bob dylan'}, {'in flames'}, {'queen'}, {'the prodigy'}, {'ac/dc'}, {'sum 41'}, {'limp bizkit'}, {'the chemical brothers'}, {'röyksopp'}, {'gorillaz'}, {'rage against the machine'}]


## Характерные комбинации для разных стран 

In [21]:
# Frequent itemsets (support >= 0.05) for Germany.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_germany = ml.apriori(
    germany.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_germany))
# Ten itemsets with the highest support.
freq_germany.sort_values(by="support", ascending=False).head(10)

Найдено 82 характерных комбинаций


Unnamed: 0,support,itemsets
22,0.194909,(die Ärzte)
41,0.157518,(linkin park)
16,0.156722,(coldplay)
60,0.147176,(red hot chili peppers)
59,0.14638,(rammstein)
68,0.128878,(system of a down)
46,0.119332,(metallica)
21,0.115354,(die toten hosen)
9,0.105012,(billy talent)
71,0.105012,(the killers)


In [22]:
# Frequent itemsets (support >= 0.05) for the Netherlands.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_netherlands = ml.apriori(
    netherlands.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_netherlands))
# Ten itemsets with the highest support.
freq_netherlands.sort_values(by="support", ascending=False).head(10)

Найдено 97 характерных комбинаций


Unnamed: 0,support,itemsets
11,0.211991,(coldplay)
66,0.188437,(radiohead)
75,0.17773,(the beatles)
69,0.175589,(red hot chili peppers)
54,0.132762,(muse)
4,0.12848,(arctic monkeys)
12,0.12848,(daft punk)
80,0.119914,(the kooks)
81,0.119914,(the prodigy)
0,0.115632,(air)


In [23]:
# Frequent itemsets (support >= 0.05) for Australia.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_australia = ml.apriori(
    australia.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.05,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_australia))
# Ten itemsets with the highest support.
freq_australia.sort_values(by="support", ascending=False).head(10)

Найдено 121 характерных комбинаций


Unnamed: 0,support,itemsets
70,0.227586,(radiohead)
86,0.182759,(the beatles)
55,0.144828,(muse)
17,0.144828,(coldplay)
44,0.137931,(linkin park)
11,0.137931,(bloc party)
73,0.134483,(red hot chili peppers)
22,0.131034,(death cab for cutie)
40,0.127586,(kings of leon)
21,0.127586,(david bowie)


## Сопоставление характерных комбинаций для разных стран

In [24]:
# Itemsets that are frequent in all three countries.
gen = set(freq_germany["itemsets"]) & set(freq_netherlands["itemsets"]) & set(freq_australia["itemsets"])
print("Совпадают %d комбинации:" % len(gen))
# Show every itemset as a plain set.
print(list(map(set, gen)))

Совпадают 35 комбинации:
[{'the white stripes'}, {'franz ferdinand'}, {'radiohead'}, {'coldplay'}, {'rammstein'}, {'foo fighters'}, {'portishead'}, {'red hot chili peppers'}, {'the kooks'}, {'air'}, {'system of a down'}, {'johnny cash'}, {'kanye west'}, {'placebo'}, {'massive attack'}, {'bloc party'}, {'nightwish'}, {'queens of the stone age'}, {'kings of leon'}, {'death cab for cutie'}, {'evanescence'}, {'the beatles'}, {'daft punk'}, {'queen'}, {'metallica'}, {'the prodigy'}, {'linkin park'}, {'the killers'}, {'arctic monkeys'}, {'green day'}, {'incubus'}, {'jack johnson'}, {'muse'}, {'nirvana'}, {'rage against the machine'}]


In [25]:
# Itemsets that are frequent in Germany but in neither of the other two countries.
gen = set(freq_germany["itemsets"]) - set(freq_netherlands["itemsets"]) - set(freq_australia["itemsets"])
print("Отличаются %d комбинации:" % len(gen))
print("Жители Германии слушают:", list(map(set, gen)))

Отличаются 37 комбинации:
Жители Германии слушают: [{'die Ärzte', 'die toten hosen'}, {'koЯn'}, {'die toten hosen'}, {'böhse onkelz'}, {'subway to sally'}, {'die Ärzte', 'system of a down'}, {'papa roach'}, {'bob marley'}, {'disturbed'}, {'peter fox'}, {'maxïmo park'}, {'deichkind'}, {'die Ärzte'}, {'iron maiden'}, {'beatsteaks'}, {'slipknot'}, {'billy talent'}, {'schandmaul'}, {'[unknown]'}, {'30 seconds to mars'}, {'tenacious d'}, {'in extremo'}, {'seeed'}, {'in flames'}, {'clueso'}, {'linkin park', 'die Ärzte'}, {'3 doors down'}, {'ac/dc'}, {'farin urlaub'}, {'sum 41'}, {'limp bizkit'}, {'rammstein', 'die Ärzte'}, {'red hot chili peppers', 'die Ärzte'}, {'rammstein', 'system of a down'}, {'nelly furtado'}, {'bullet for my valentine'}, {'mando diao'}]


In [26]:
# Itemsets that are frequent in the Netherlands but in neither of the other two countries.
gen = set(freq_netherlands["itemsets"]) - set(freq_germany["itemsets"]) - set(freq_australia["itemsets"])
print("Отличаются %d комбинации:" % len(gen))
print("Жители Нидерландов слушают:", list(map(set, gen)))

Отличаются 36 комбинации:
Жители Нидерландов слушают: [{'boards of canada'}, {'jimi hendrix'}, {'maria mena'}, {'jamiroquai'}, {'damien rice'}, {'elvis presley'}, {'editors'}, {'radiohead', 'coldplay'}, {'fatboy slim'}, {'red hot chili peppers', 'the beatles'}, {'deus'}, {'r.e.m.'}, {'beastie boys'}, {'groove armada'}, {'eels'}, {'thievery corporation'}, {'the doors'}, {'madonna'}, {'coldplay', 'muse'}, {'norah jones'}, {'moby'}, {'goldfrapp'}, {'kaiser chiefs'}, {'robbie williams'}, {'snow patrol', 'coldplay'}, {'arctic monkeys', 'coldplay'}, {'keane'}, {'arctic monkeys', 'radiohead'}, {'faithless'}, {'dire straits'}, {'amy macdonald'}, {'justice'}, {'gorillaz'}, {'frank sinatra'}, {'enya'}, {'aphex twin'}]


In [27]:
# Itemsets that are frequent in Australia but in neither of the other two countries.
gen = set(freq_australia["itemsets"]) - set(freq_germany["itemsets"]) - set(freq_netherlands["itemsets"])
print("Отличаются %d комбинации:" % len(gen))
print("Жители Австралии слушают:", list(map(set, gen)))

Отличаются 56 комбинации:
Жители Австралии слушают: [{'the all-american rejects'}, {'linkin park', 'coldplay'}, {'joy division'}, {'radiohead', 'bloc party'}, {'justin timberlake'}, {'pixies'}, {'tv on the radio'}, {'anberlin'}, {'animal collective'}, {'the beatles', 'coldplay'}, {'good charlotte'}, {'afi'}, {'taking back sunday'}, {'elliott smith'}, {'silverchair'}, {'arcade fire'}, {'eminem'}, {'david bowie'}, {'foo fighters', 'radiohead'}, {'lily allen'}, {'the smiths'}, {'dashboard confessional'}, {'my chemical romance'}, {'brand new'}, {'mogwai'}, {'radiohead', 'björk'}, {'mgmt'}, {'tom waits'}, {'sonic youth'}, {'the smashing pumpkins'}, {'tori amos'}, {'regina spektor'}, {'the postal service'}, {'lady gaga'}, {'parkway drive'}, {'modest mouse'}, {'paramore'}, {'radiohead', 'the white stripes'}, {'the used'}, {'radiohead', 'pink floyd'}, {'the presets'}, {'sufjan stevens'}, {'a perfect circle'}, {'maroon 5'}, {'britney spears'}, {'motion city soundtrack'}, {'bob dylan', 'the beat

## Ассоциативные правила

### Метод Apriori

In [28]:
# Frequent itemsets for the rule mining below, with the support threshold
# lowered to 0.03.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_selectedcountry_apriori = ml.apriori(
    selectedcountry.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.03,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_selectedcountry_apriori))
# Ten itemsets with the highest support.
freq_selectedcountry_apriori.sort_values("support", ascending=False).head(10)

Найдено 228 характерных комбинаций


Unnamed: 0,support,itemsets
43,0.167825,(coldplay)
148,0.151936,(red hot chili peppers)
104,0.141509,(linkin park)
143,0.133565,(radiohead)
168,0.132075,(the beatles)
52,0.122642,(die Ärzte)
145,0.121152,(rammstein)
121,0.112214,(muse)
166,0.111718,(system of a down)
173,0.107746,(the killers)


In [29]:
# Association rules with confidence of at least 0.3, built from the apriori itemsets.
rules = ml.association_rules(
    freq_selectedcountry_apriori,
    metric="confidence",
    min_threshold=0.3,
)
print("Найдено %d правил" % len(rules))
# List the rules starting with the most frequent antecedents.
rules.sort_values("antecedent support", ascending=False)

Найдено 28 правил


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
11,(red hot chili peppers),(coldplay),0.151936,0.167825,0.046177,0.303922,1.810941,0.020678,1.195519
15,(die Ärzte),(die toten hosen),0.122642,0.072989,0.040715,0.331984,4.548404,0.031764,1.387707
25,(rammstein),(system of a down),0.121152,0.111718,0.043694,0.360656,3.22827,0.030159,1.389364
10,(muse),(coldplay),0.112214,0.167825,0.036743,0.327434,1.951039,0.01791,1.237312
23,(muse),(radiohead),0.112214,0.133565,0.035253,0.314159,2.352107,0.020265,1.263318
18,(system of a down),(die Ärzte),0.111718,0.122642,0.03575,0.32,2.609231,0.022048,1.290233
26,(system of a down),(rammstein),0.111718,0.121152,0.043694,0.391111,3.22827,0.030159,1.443364
13,(the killers),(coldplay),0.107746,0.167825,0.03426,0.317972,1.894664,0.016178,1.220148
21,(metallica),(red hot chili peppers),0.107249,0.151936,0.032274,0.300926,1.980604,0.015979,1.213124
22,(metallica),(system of a down),0.107249,0.111718,0.032274,0.300926,2.693621,0.020292,1.270655


### Метод fpgrowth

In [30]:
# Frequent itemsets (support >= 0.03) found with FP-growth on the same users.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_selectedcountry_fpgrowth = ml.fpgrowth(
    selectedcountry.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.03,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_selectedcountry_fpgrowth))
# Ten itemsets with the highest support.
freq_selectedcountry_fpgrowth.sort_values(by="support", ascending=False).head(10)

Найдено 228 характерных комбинаций


Unnamed: 0,support,itemsets
14,0.167825,(coldplay)
0,0.151936,(red hot chili peppers)
15,0.141509,(linkin park)
8,0.133565,(radiohead)
69,0.132075,(the beatles)
16,0.122642,(die Ärzte)
71,0.121152,(rammstein)
145,0.112214,(muse)
72,0.111718,(system of a down)
1,0.107746,(the killers)


In [31]:
# Association rules with confidence of at least 0.3, built from the FP-growth itemsets.
rules = ml.association_rules(
    freq_selectedcountry_fpgrowth,
    metric="confidence",
    min_threshold=0.3,
)
print("Найдено %d правил" % len(rules))
# List the rules starting with the most frequent antecedents.
rules.sort_values("antecedent support", ascending=False)

Найдено 28 правил


Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction
0,(red hot chili peppers),(coldplay),0.151936,0.167825,0.046177,0.303922,1.810941,0.020678,1.195519
10,(die Ärzte),(die toten hosen),0.122642,0.072989,0.040715,0.331984,4.548404,0.031764,1.387707
17,(rammstein),(system of a down),0.121152,0.111718,0.043694,0.360656,3.22827,0.030159,1.389364
26,(muse),(radiohead),0.112214,0.133565,0.035253,0.314159,2.352107,0.020265,1.263318
25,(muse),(coldplay),0.112214,0.167825,0.036743,0.327434,1.951039,0.01791,1.237312
18,(system of a down),(rammstein),0.111718,0.121152,0.043694,0.391111,3.22827,0.030159,1.443364
16,(system of a down),(die Ärzte),0.111718,0.122642,0.03575,0.32,2.609231,0.022048,1.290233
1,(the killers),(coldplay),0.107746,0.167825,0.03426,0.317972,1.894664,0.016178,1.220148
20,(metallica),(red hot chili peppers),0.107249,0.151936,0.032274,0.300926,1.980604,0.015979,1.213124
19,(metallica),(system of a down),0.107249,0.111718,0.032274,0.300926,2.693621,0.020292,1.270655


### Метод fpmax

In [32]:
# Itemsets (support >= 0.03) found with fpmax on the same users.
# The 0/1 indicator columns are cast to bool, the input dtype mlxtend
# recommends; recent mlxtend releases warn about non-bool frames.
freq_selectedcountry_fpmax = ml.fpmax(
    selectedcountry.drop(columns=["sex", "country"]).astype(bool),
    min_support=0.03,
    use_colnames=True,
)
print("Найдено %d характерных комбинаций" % len(freq_selectedcountry_fpmax))
# Ten itemsets with the highest support.
freq_selectedcountry_fpmax.sort_values(by="support", ascending=False).head(10)

Найдено 207 характерных комбинаций


Unnamed: 0,support,itemsets
172,0.087885,(air)
170,0.086395,(daft punk)
169,0.085402,(placebo)
168,0.081927,(the white stripes)
167,0.08143,(the prodigy)
165,0.077458,(evanescence)
164,0.076465,(kings of leon)
163,0.075472,(incubus)
162,0.073486,(death cab for cutie)
161,0.073486,(green day)


In [33]:
# Building rules from the fpmax result did not work, so the call below is left disabled.
# NOTE(review): presumably this fails because fpmax keeps only maximal itemsets, so the
# supports of their subsets that association_rules needs are missing — confirm; the mlxtend
# documentation describes `support_only=True` for incomplete itemset tables.
# rules = ml.association_rules(freq_selectedcountry_fpmax, metric = "confidence", min_threshold = 0.3)
# print("Найдено %d правил" % len(rules))
# rules.sort_values(by = "antecedent support", ascending = False)[0:15]

## Тривиальные и нетривиальные правила

Поиск примеров тривиальных и нетривиальных правил основывался на стиле музыки разных групп. Если группы одинаково известные, стиль похож и язык, на котором исполняются песни, одинаковый, то правило тривиальное, иначе - нет. Также правило можно считать тривиальным, если оно включает в себя группу и одного из ее членов, выступающих отдельно.<br/><br/>
<b>Тривиальные:</b><br/> 
(farin urlaub) (die Ärzte): солист и один из основателей рок-группы Die Ärzte и группа Die Ärzte<br/>
(red hot chili peppers) (coldplay): жанр - рок, альтернативный рок, первая из Америки, вторая из Великобритании, язык - английский<br/>
(die Ärzte) (die toten hosen): жанр - панк-рок, язык - немецкий, обе группы из Германии и достаточно популярные в этой стране<br/>
(die toten hosen) (die Ärzte): аналогично<br/><br/>
<b>Нетривиальные:</b><br/> 
(system of a down) (die Ärzte): 1: жанр - альтернативный метал, язык - английский, группа из Америки; 2: жанр - панк-рок, язык - немецкий, группа из Германии<br/>
(arctic monkeys) (bloc party): 1: жанр - гаражный рок, психоделический рок; 2: жанр - инди-рок, арт-рок, электро, постпанк; обе из Великобритании, язык - английский