## Libraries

In [4]:
import pandas as pd
import numpy as np
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori
from mlxtend.frequent_patterns import fpgrowth
from mlxtend.frequent_patterns import association_rules

In [3]:
pip install mlxtend

Collecting mlxtendNote: you may need to restart the kernel to use updated packages.

  Using cached mlxtend-0.22.0-py2.py3-none-any.whl (1.4 MB)
Installing collected packages: mlxtend
Successfully installed mlxtend-0.22.0



[notice] A new release of pip available: 22.3.1 -> 23.1.2
[notice] To update, run: python.exe -m pip install --upgrade pip


### Clean DataFrame

In [5]:
df=pd.read_csv('dataset.csv')
df

  df=pd.read_csv('dataset.csv')


Unnamed: 0.1,Unnamed: 0,OFFENSE_DESCRIPTION,DISTRICT,DAY_OF_WEEK,MONTH,YEAR,HOUR,date,time,UCR_PART,X_Location,Y_Location,OFFENSE_CODE_GROUP,STREET,Lat,Long,SHOOTING,OFFENSE_CODE_GROUP_WITHOUT_ENCODING,STREET_WITHOUT_ENCODING
0,0,disturbing the peace,E18,6,10.0,2018.0,20.0,2018-10-03,20:13,3,42.2,71.1,12,115,42.262608,-71.121186,0,Disorderly Conduct,ARLINGTON ST
1,1,property-lost,D14,4,8.0,2018.0,20.0,2018-08-30,20:00,2,42.3,71.1,46,58,42.352111,-71.135311,0,Property Lost,ALLSTON ST
2,2,threats to do bodily harm,B2,6,10.0,2018.0,19.0,2018-10-03,19:20,3,42.3,71.0,40,779,42.308126,-71.076930,0,Other,DEVON ST
3,3,assault-aggravated-battery,A1,6,10.0,2018.0,20.0,2018-10-03,20:00,1,42.3,71.0,0,441,42.359454,-71.059648,0,Aggravated Assault,CAMBRIDGE ST
4,4,aircraft incidents,A7,6,10.0,2018.0,20.0,2018-10-03,20:49,2,42.3,71.0,1,2182,42.375258,-71.024663,0,Aircraft,PRESCOTT ST
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51980,51981,sick/injured/medical-person,B3,5,4.0,2018.0,14.0,,13:42,2,42.3,71.1,33,2418,42.273417,-71.075174,0,Medical Assistance,SANFORD ST
51981,51982,sick/injured/medical-person,E13,5,4.0,2018.0,13.0,,13:18,2,42.3,71.0,33,1056,42.304922,-71.102981,0,Medical Assistance,FOREST HILLS ST
51982,51983,larceny shoplifting,D4,5,4.0,2018.0,13.0,,14:04,1,42.2,71.0,28,1965,42.350597,-71.078810,0,Larceny,NEWBURY ST
51983,51984,m/v accident-personal injury,B3,5,4.0,2018.0,14.0,,13:46,2,42.2,71.0,37,2311,42.283391,-71.073886,0,Motor Vehicle Accident Response,ROCKWELL ST


### Map Offense code to OFFENSE GROUP

In [10]:
df_code=pd.DataFrame(df,columns=['OFFENSE_CODE_GROUP','OFFENSE_CODE_GROUP_WITHOUT_ENCODING'])
df_code=df_code.drop_duplicates(subset=['OFFENSE_CODE_GROUP'])
df_code=df_code.sort_values(by='OFFENSE_CODE_GROUP')
df_code

Unnamed: 0,OFFENSE_CODE_GROUP,OFFENSE_CODE_GROUP_WITHOUT_ENCODING
3,0,Aggravated Assault
4,1,Aircraft
777,2,Arson
815,3,Assembly or Gathering Violations
11,4,Auto Theft
365,5,Auto Theft Recovery
109,6,Ballistics
9812,7,Bomb Hoax
659,8,Commercial Burglary
100,9,Confidence Games


### Make Date DataFrame for frequent pattern

In [5]:
data1={
    'date transaction':df['date'],
    'OFFENSE_CODE_GROUP':df['OFFENSE_CODE_GROUP'],

}
fq1=pd.DataFrame(data1,columns=['date transaction','OFFENSE_CODE_GROUP'])
fq1=fq1.dropna(subset=['date transaction'])
fq1

Unnamed: 0,date transaction,OFFENSE_CODE_GROUP
0,2018-10-03,12
1,2018-08-30,46
2,2018-10-03,40
3,2018-10-03,0
4,2018-10-03,1
...,...,...
9392,2018-09-27,25
9393,2018-09-27,58
9394,2018-09-04,28
9395,2018-09-04,28


## Extraction dates & events from fq1

In [6]:
fq1_unique=fq1.drop_duplicates(subset='date transaction')
dates =fq1_unique['date transaction'].to_list()
events =fq1.groupby('date transaction')['OFFENSE_CODE_GROUP'].apply(list)


#### We can transform it into the right format via the TransactionEncoder as follows:

In [15]:
te = TransactionEncoder()
te_ary = te.fit(events).transform(events)
date_freq = pd.DataFrame(te_ary, columns=te.columns_)
date_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3001,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3002,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3003,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3004,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


### Reduction date_freq for apriori

In [12]:
selected=date_freq.columns[:15]
df_reduction=date_freq[selected].copy()
df_reduction


Unnamed: 0,0,1,2,3,4,5,6,8,9,10,11,12,13,14,15
0,True,False,False,True,True,True,True,False,True,True,False,True,True,False,False
1,True,False,False,False,True,False,True,False,True,True,False,True,True,False,False
2,True,False,False,True,True,False,True,True,True,True,False,True,True,False,False
3,True,False,False,True,True,False,True,False,True,True,True,False,True,False,False
4,True,False,False,True,True,True,True,True,True,False,False,True,True,False,False
5,True,False,False,False,True,True,True,True,True,False,False,True,True,False,False
6,True,False,False,False,True,True,True,True,True,True,False,False,True,True,True
7,True,False,False,True,True,False,True,False,True,True,False,True,True,False,False
8,True,False,False,True,True,True,False,False,True,False,False,True,True,False,True
9,True,False,False,True,True,False,True,False,True,True,False,True,True,False,True


### Apriori :  let us return the items and itemsets with at least 70% support

### Overview
Apriori is a popular algorithm [1] for extracting frequent itemsets with applications in association rule learning. The apriori algorithm has been designed to operate on databases containing transactions, such as purchases by customers of a store. An itemset is considered as "frequent" if it meets a user-specified support threshold. For instance, if the support threshold is set to 0.5 (50%), a frequent itemset is defined as a set of items that occur together in at least 50% of all transactions in the database.

In [16]:
freq_apriori=apriori(df_reduction, min_support=0.7,verbose=0)

In [17]:
freq_apriori.tail()

Unnamed: 0,support,itemsets
34,0.885714,"(0, 8, 4, 12)"
35,0.8,"(0, 11, 4, 12)"
36,0.714286,"(0, 8, 11, 12)"
37,0.714286,"(8, 11, 4, 12)"
38,0.714286,"(0, 4, 8, 11, 12)"


### fpgrowth: Frequent itemsets via the FP-growth algorithm

### Overview
FP-Growth [1] is an algorithm for extracting frequent itemsets with applications in association rule learning that emerged as a popular alternative to the established Apriori algorighm [2].

In general, the algorithm has been designed to operate on databases containing transactions, such as purchases by customers of a store. An itemset is considered as "frequent" if it meets a user-specified support threshold. For instance, if the support threshold is set to 0.5 (50%), a frequent itemset is defined as a set of items that occur together in at least 50% of all transactions in the database.

In particular, and what makes it different from the Apriori frequent pattern mining algorithm, FP-Growth is an frequent pattern mining algorithm that does not require candidate generation. Internally, it uses a so-called FP-tree (frequent pattern tree) datastrucure without generating the candidate sets explicitely, which makes is particularly attractive for large datasets.

In [21]:
freq_growth=fpgrowth(df_reduction, min_support=0.7, use_colnames=True)
freq_growth

Unnamed: 0,support,itemsets
0,1.0,(13)
1,1.0,(4)
2,1.0,(0)
3,0.885714,(9)
4,0.8,(12)
5,0.714286,(6)
6,1.0,"(4, 13)"
7,1.0,"(0, 4)"
8,1.0,"(0, 13)"
9,1.0,"(0, 4, 13)"


In [20]:
freq_growth=freq_growth.sort_values(by='itemsets')
freq_growth.tail()

Unnamed: 0,support,itemsets
12,0.885714,"(9, 13)"
13,0.885714,"(0, 9, 4)"
29,0.714286,"(0, 9, 12, 13)"
31,0.714286,"(0, 4, 9, 12, 13)"
38,0.714286,"(0, 4, 13, 6)"


### Make STREET DataFrame for frequent pattern

In [12]:
data2={
    'STREET':df['STREET'],
    'OFFENSE_CODE_GROUP':df['OFFENSE_CODE_GROUP'],

}
fq2=pd.DataFrame(data2,columns=['STREET','OFFENSE_CODE_GROUP'])
fq2=fq2.dropna(subset=['STREET'])
fq2

Unnamed: 0,STREET,OFFENSE_CODE_GROUP
0,115,12
1,58,46
2,779,40
3,441,0
4,2182,1
...,...,...
51980,2418,33
51981,1056,33
51982,1965,28
51983,2311,37


### Do the previous steps Frequent Pattern for STREET

In [13]:
fq2_unique=fq2.drop_duplicates(subset='STREET')
streets =fq2_unique['STREET'].to_list()
events =fq2.groupby('STREET')['OFFENSE_CODE_GROUP'].apply(list)


In [16]:
tew_ary = te.fit(events).transform(events)
street_freq = pd.DataFrame(tew_ary, columns=te.columns_)
street_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,True,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,...,False,True,False,False,False,False,False,False,False,False
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3001,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,True,True,False,False
3002,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3003,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3004,True,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,True,False,False


In [17]:
freq_growth=fpgrowth(street_freq, min_support=0.1, use_colnames=True)
freq_growth=freq_growth.sort_values(by='itemsets')
freq_growth.tail()

Unnamed: 0,support,itemsets
55,0.175316,"(40, 25)"
56,0.139055,"(40, 33, 37)"
57,0.142382,"(40, 25, 33)"
59,0.113107,"(40, 25, 37, 33)"
159,0.100466,"(33, 58, 26)"


In [18]:
freq_apriori=apriori(street_freq, min_support=0.1,verbose=0)
freq_apriori=freq_apriori.sort_values(by='itemsets')
freq_apriori.tail()

Unnamed: 0,support,itemsets
52,0.119428,"(28, 29)"
53,0.168663,"(33, 28)"
54,0.165003,"(28, 36)"
56,0.111444,"(28, 45)"
159,0.100798,"(33, 36, 54, 39)"


### Association rules generation from frequent itemsets

### Overview
Rule generation is a common task in the mining of frequent patterns. An association rule is an implication expression of the form X→Y
, where X
 and Y
 are disjoint itemsets [1]. A more concrete example based on consumer behaviour would be {Diapers}→{Beer}
 suggesting that people who buy diapers are also likely to buy beer. To evaluate the "interest" of such an association rule, different metrics have been developed. The current implementation make use of the confidence and lift metrics.

In [19]:
rules=association_rules(freq_apriori,metric='lift',min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(25, 28)",(39),0.153693,0.299069,0.107119,0.696970,2.330468,0.061154,2.313074,0.674580
1,"(25, 39)",(28),0.175316,0.275116,0.107119,0.611006,2.220899,0.058887,1.863481,0.666597
2,"(28, 39)",(25),0.144711,0.333666,0.107119,0.740230,2.218476,0.058834,2.565091,0.642169
3,(25),"(28, 39)",0.333666,0.144711,0.107119,0.321037,2.218476,0.058834,1.259699,0.824271
4,(28),"(25, 39)",0.275116,0.175316,0.107119,0.389359,2.220899,0.058887,1.350522,0.758373
...,...,...,...,...,...,...,...,...,...,...
581,"(54, 39)","(33, 36)",0.159681,0.221557,0.100798,0.631250,2.849155,0.065420,2.111032,0.772348
582,(33),"(36, 54, 39)",0.405522,0.120426,0.100798,0.248564,2.064046,0.051963,1.170525,0.867172
583,(36),"(33, 54, 39)",0.410845,0.126414,0.100798,0.245344,1.940801,0.048862,1.157595,0.822787
584,(54),"(33, 36, 39)",0.269128,0.139055,0.100798,0.374536,2.693437,0.063375,1.376491,0.860243


### Make HOUR DataFrame for frequent pattern

In [56]:

fq3=pd.DataFrame(df,columns=['HOUR','OFFENSE_CODE_GROUP'])
fq3=fq3.dropna(subset=['HOUR'])
fq3

Unnamed: 0,HOUR,OFFENSE_CODE_GROUP
0,20.0,12
1,20.0,46
2,19.0,40
3,20.0,0
4,20.0,1
...,...,...
51980,14.0,33
51981,13.0,33
51982,13.0,28
51983,14.0,37


In [286]:
fq3_unique=fq3.drop_duplicates(subset='HOUR')
hours =fq3_unique['HOUR'].to_list()
events =fq3.groupby('HOUR')['OFFENSE_CODE_GROUP'].apply(list)

### Do the previous steps Frequent Pattern for Hour

In [288]:
te_ary = te.fit(events).transform(events)
hour_freq = pd.DataFrame(te_ary, columns=te.columns_)
hour_freq

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,True,False,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True
1,True,False,True,True,True,True,True,False,True,True,...,True,True,False,False,True,True,True,True,True,True
2,True,False,True,True,True,True,True,False,True,True,...,True,True,True,False,True,True,True,True,True,True
3,True,False,False,True,True,True,True,False,True,True,...,True,True,False,False,True,True,True,True,True,True
4,True,False,True,True,True,True,True,False,True,True,...,True,True,True,False,True,True,True,True,True,True
5,True,False,False,True,True,False,True,False,True,True,...,True,True,True,False,True,True,True,True,True,True
6,True,False,False,True,True,True,False,False,True,True,...,True,True,True,True,True,True,True,True,True,True
7,True,False,True,True,True,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True
8,True,False,False,True,True,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True
9,True,False,True,True,True,True,True,False,True,True,...,True,True,True,True,True,True,True,True,True,True


## Data Reduction 

In [290]:
selected=hour_freq.columns[:15]
df_reduction=hour_freq[selected].copy()
df_reduction

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
0,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,False,True,True,True,True,True,False,True,True,True,False,True,True,False
2,True,False,True,True,True,True,True,False,True,True,True,True,True,True,False
3,True,False,False,True,True,True,True,False,True,True,True,False,True,True,False
4,True,False,True,True,True,True,True,False,True,True,True,False,True,True,False
5,True,False,False,True,True,False,True,False,True,True,False,False,True,True,False
6,True,False,False,True,True,True,False,False,True,True,True,True,True,True,False
7,True,False,True,True,True,True,True,False,True,True,True,True,True,True,False
8,True,False,False,True,True,True,True,False,True,True,True,False,True,True,True
9,True,False,True,True,True,True,True,False,True,True,True,True,True,True,True


In [291]:
freq_growth=fpgrowth(df_reduction, min_support=0.8, use_colnames=True)
freq_growth=freq_growth.sort_values(by='itemsets')
freq_growth.tail()

Unnamed: 0,support,itemsets
693,0.875,"(0, 3, 6, 8, 9, 13)"
692,0.875,"(0, 3, 6, 8, 9, 12)"
689,0.875,"(0, 3, 4, 6, 8, 9)"
705,0.875,"(8, 0, 3, 5)"
1022,0.833333,"(0, 3, 4, 5, 6, 8, 9, 10, 12, 13)"


### Association_rules

In [292]:
rules=association_rules(freq_growth,metric='lift',min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,"(0, 4, 5, 6)",(9),0.916667,1.000000,0.916667,1.000000,1.000000,0.000000,inf,0.0
1,"(0, 9, 5, 6)",(4),0.916667,1.000000,0.916667,1.000000,1.000000,0.000000,inf,0.0
2,"(9, 4, 5, 6)",(0),0.916667,1.000000,0.916667,1.000000,1.000000,0.000000,inf,0.0
3,"(0, 9, 4)","(5, 6)",1.000000,0.916667,0.916667,0.916667,1.000000,0.000000,1.000000,0.0
4,"(0, 5, 6)","(9, 4)",0.916667,1.000000,0.916667,1.000000,1.000000,0.000000,inf,0.0
...,...,...,...,...,...,...,...,...,...,...
26379,(5),"(0, 3, 4, 6, 8, 9, 10, 12, 13)",0.958333,0.833333,0.833333,0.869565,1.043478,0.034722,1.277778,1.0
26380,(9),"(0, 3, 4, 5, 6, 8, 10, 12, 13)",1.000000,0.833333,0.833333,0.833333,1.000000,0.000000,1.000000,0.0
26381,(10),"(0, 3, 4, 5, 6, 8, 9, 12, 13)",0.958333,0.833333,0.833333,0.869565,1.043478,0.034722,1.277778,1.0
26382,(12),"(0, 3, 4, 5, 6, 8, 9, 10, 13)",1.000000,0.833333,0.833333,0.833333,1.000000,0.000000,1.000000,0.0


### Year DataFrame

In [20]:

fq4=pd.DataFrame(df,columns=['YEAR','OFFENSE_CODE_GROUP'])
fq4=fq4.dropna(subset=['YEAR'])
fq4

Unnamed: 0,YEAR,OFFENSE_CODE_GROUP
0,2018.0,12
1,2018.0,46
2,2018.0,40
3,2018.0,0
4,2018.0,1
...,...,...
51980,2018.0,33
51981,2018.0,33
51982,2018.0,28
51983,2018.0,37


In [21]:
fq4_unique=fq4.drop_duplicates(subset='YEAR')
hours =fq4_unique['YEAR'].to_list()
events =fq4.groupby('YEAR')['OFFENSE_CODE_GROUP'].apply(list)
events

YEAR
2015.0    [22, 46, 22, 46, 40, 11, 40, 20, 28, 20, 20, 4...
2016.0    [22, 20, 46, 20, 10, 43, 20, 20, 20, 25, 25, 2...
2017.0    [22, 22, 40, 26, 46, 40, 20, 20, 57, 20, 40, 9...
2018.0    [12, 46, 40, 0, 1, 57, 37, 58, 55, 56, 37, 4, ...
Name: OFFENSE_CODE_GROUP, dtype: object

In [32]:
te_ary = te.fit(events).transform(events)
year_freq = pd.DataFrame(te_ary, columns=te.columns_)
year_freq


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
2,True,False,False,False,True,False,False,True,False,True,...,False,False,False,False,True,False,True,False,True,True
3,True,True,True,True,True,True,True,True,True,True,...,True,True,True,True,True,True,True,True,True,True


In [34]:
growth_freq=fpgrowth(year_freq, min_support=0.8, use_colnames=True)
growth_freq=growth_freq.sort_values(by='itemsets')
growth_freq.tail()

Unnamed: 0,support,itemsets
10,1.0,"(40, 46, 22)"
11,1.0,"(40, 20, 46)"
12,1.0,"(20, 46, 22)"
13,1.0,"(40, 20, 22)"
14,1.0,"(40, 20, 46, 22)"


In [35]:
year_apriori=apriori(year_freq, min_support=0.8,verbose=0)
year_apriori=year_apriori.sort_values(by='itemsets')
year_apriori.tail()

Unnamed: 0,support,itemsets
10,1.0,"(20, 22, 39)"
11,1.0,"(20, 45, 22)"
12,1.0,"(20, 45, 39)"
13,1.0,"(45, 22, 39)"
14,1.0,"(20, 45, 22, 39)"


### Association_rules

In [25]:
rules=association_rules(year_freq,metric='lift',min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(40),(46),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
1,(46),(40),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
2,(46),(22),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
3,(22),(46),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
4,(20),(46),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
5,(46),(20),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
6,(40),(22),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
7,(22),(40),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
8,(40),(20),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0
9,(20),(40),1.0,1.0,1.0,1.0,1.0,0.0,inf,0.0


## Frequent Pattern and Association rules for Shooting Attribute 

In [8]:
fq_sh=pd.DataFrame(df,columns=['HOUR','YEAR','OFFENSE_CODE_GROUP','SHOOTING'])
fq_sh=fq_sh.dropna(subset=['HOUR'])
fq_sh=fq_sh[fq_sh['SHOOTING']>0]
fq_sh

Unnamed: 0,HOUR,YEAR,OFFENSE_CODE_GROUP,SHOOTING
1052,21.0,2018.0,0,1
1053,21.0,2018.0,6,1
1074,19.0,2018.0,0,1
1907,20.0,2018.0,0,1
1908,20.0,2018.0,6,1
...,...,...,...,...
51099,14.0,2018.0,19,1
51100,14.0,2018.0,19,1
51101,14.0,2018.0,53,1
51102,14.0,2018.0,19,1


In [9]:
hours =fq_sh['YEAR'].to_list()
events =fq_sh.groupby('YEAR')['OFFENSE_CODE_GROUP'].apply(list)
events

YEAR
2018.0    [0, 6, 0, 0, 6, 0, 0, 19, 53, 19, 0, 24, 0, 0,...
Name: OFFENSE_CODE_GROUP, dtype: object

In [11]:
te = TransactionEncoder()
te_ary = te.fit(events).transform(events)
year_freq = pd.DataFrame(te_ary, columns=te.columns_)
year_freq

Unnamed: 0,0,6,13,19,24,25,26,33,40,52,53,56,57,60
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [13]:
hours =fq_sh['HOUR'].to_list()
events =fq_sh.groupby('HOUR')['OFFENSE_CODE_GROUP'].apply(list)
events

HOUR
0.0        [0, 24, 53, 56, 19, 19, 33, 0, 0, 0, 0, 26, 0]
1.0                                         [0, 0, 57, 0]
2.0                        [0, 0, 0, 0, 0, 33, 24, 6, 60]
3.0                                         [26, 0, 0, 6]
4.0                                              [24, 24]
5.0                             [26, 0, 0, 60, 19, 0, 60]
6.0                                       [0, 19, 53, 19]
9.0                                                  [24]
10.0                                             [33, 24]
11.0                                [60, 0, 60, 0, 0, 19]
12.0                           [0, 19, 0, 57, 19, 53, 19]
13.0                                               [0, 0]
14.0    [0, 0, 0, 13, 60, 0, 13, 60, 0, 19, 19, 53, 19...
15.0                 [24, 6, 0, 24, 60, 19, 40, 6, 0, 60]
16.0                             [0, 0, 19, 52, 0, 60, 0]
17.0                                                  [0]
18.0                                    [0, 25, 0, 53, 0]
19.0     

In [23]:
te_ary = te.fit(events).transform(events)
hour_freq = pd.DataFrame(te_ary, columns=te.columns_)
hour_freq

Unnamed: 0,0,6,13,19,24,25,26,33,40,52,53,56,57,60
0,True,False,False,True,True,False,True,True,False,False,True,True,False,False
1,True,False,False,False,False,False,False,False,False,False,False,False,True,False
2,True,True,False,False,True,False,False,True,False,False,False,False,False,True
3,True,True,False,False,False,False,True,False,False,False,False,False,False,False
4,False,False,False,False,True,False,False,False,False,False,False,False,False,False
5,True,False,False,True,False,False,True,False,False,False,False,False,False,True
6,True,False,False,True,False,False,False,False,False,False,True,False,False,False
7,False,False,False,False,True,False,False,False,False,False,False,False,False,False
8,False,False,False,False,True,False,False,True,False,False,False,False,False,False
9,True,False,False,True,False,False,False,False,False,False,False,False,False,True


In [17]:
growth_freq=fpgrowth(year_freq, min_support=0.5, use_colnames=True)
growth_freq=growth_freq.sort_values(by='itemsets')
growth_freq.tail()

Unnamed: 0,support,itemsets
0,0.863636,(0)
1,0.5,(19)
2,0.5,"(0, 19)"


In [20]:
Hour_apriori=apriori(year_freq, min_support=0.5,verbose=0)
Hour_apriori=Hour_apriori.sort_values(by='itemsets')
Hour_apriori.tail()

Unnamed: 0,support,itemsets
0,0.863636,(0)
1,0.5,(3)
2,0.5,"(0, 3)"


In [25]:
rules=association_rules(Hour_apriori,metric='lift',min_threshold=1)
rules

Unnamed: 0,antecedents,consequents,antecedent support,consequent support,support,confidence,lift,leverage,conviction,zhangs_metric
0,(0),(3),0.863636,0.5,0.5,0.578947,1.157895,0.068182,1.1875,1.0
1,(3),(0),0.5,0.863636,0.5,1.0,1.157895,0.068182,inf,0.272727
