In [1]:
import pandas as pd

In [2]:
def add_cols(df: pd.DataFrame, cols: list, init_val):
    """Set one column per entry of ``cols`` on ``df`` and return ``df``.

    Each entry is converted with ``str`` to form the column label, and the
    column is assigned ``init_val``. The frame is modified in place; the
    returned object is the same frame that was passed in. A label that
    already exists is overwritten.
    """
    for label in map(str, cols):
        df[label] = init_val
    return df


def prefix_with_col(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` whose values are tagged with their column name.

    ``Num_Acc`` is copied unchanged. Every other value is replaced by the
    string ``'<column>__<int value>'``, where the integer part comes from
    ``convert_to_int``. Column order follows ``df``.
    """
    # Creating the columns up front fixes their order to match ``df``.
    prefixed = add_cols(pd.DataFrame(), df.columns, '')
    prefixed['Num_Acc'] = df['Num_Acc']

    value_cols = (col for col in df.columns if str(col) != 'Num_Acc')
    for col in value_cols:
        prefixed[str(col)] = df[col].map(lambda x: f'{col}__{convert_to_int(x)}')

    return prefixed


In [4]:
# Directory holding the CSV files, given relative to the notebook's working directory.
data_folder = '../data/accidents-in-france-from-2005-to-2016/'

# The file is decoded as latin-1. low_memory=False makes pandas parse the file in a
# single pass, so each column's dtype is inferred from all rows rather than per chunk.
caracteristics = pd.read_csv(data_folder + 'caracteristics.csv', encoding='latin-1', low_memory=False)

In [5]:
caracteristics = caracteristics.drop(columns=['adr', 'an', 'lat',  'long', 'gps'])

In [6]:
caracteristics.head(3)

Unnamed: 0,Num_Acc,mois,jour,hrmn,lum,agg,int,atm,col,com,dep
0,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,590
1,201600000002,3,16,1800,1,2,6,1.0,6.0,5.0,590
2,201600000003,7,13,1900,1,1,1,1.0,6.0,11.0,590


In [7]:
users = pd.read_csv(data_folder + 'users.csv', encoding='latin-1', low_memory=False)

In [8]:
users = users.drop(columns=['num_veh'])

In [9]:
users.head(3)

Unnamed: 0,Num_Acc,place,catu,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,201600000001,1.0,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0
1,201600000001,1.0,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0
2,201600000002,1.0,1,3,1,5.0,11.0,0.0,0.0,0.0,1960.0


In [10]:
vehicles = pd.read_csv(data_folder + 'vehicles.csv', encoding='latin-1', low_memory=False)

In [12]:
vehicles = vehicles.drop(columns=['num_veh'])

In [14]:
vehicles.head(3)

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv
0,201600000001,0.0,7,0,0.0,0.0,1.0,1.0
1,201600000001,0.0,2,0,0.0,0.0,7.0,15.0
2,201600000002,0.0,7,0,6.0,0.0,1.0,1.0


In [16]:
# Join the three tables on the accident identifier (merge defaults to an inner join).
# NOTE(review): users and vehicles are both joined on Num_Acc alone, so every user row
# is paired with every vehicle row of the same accident. The head(3) output of the
# merged frame shows accident 201600000001 expanded into user x vehicle combinations.
# `num_veh` was dropped from both frames before this point, so no finer key is
# available here; confirm the cross product is intended, since it inflates supports.
dataset = caracteristics.merge(users, on='Num_Acc')
dataset = dataset.merge(vehicles, on='Num_Acc')

In [17]:
dataset.head(3)

Unnamed: 0,Num_Acc,mois,jour,hrmn,lum,agg,int,atm,col,com,...,actp,etatp,an_nais,senc,catv,occutc,obs,obsm,choc,manv
0,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,...,0.0,0.0,1983.0,0.0,7,0,0.0,0.0,1.0,1.0
1,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,...,0.0,0.0,1983.0,0.0,2,0,0.0,0.0,7.0,15.0
2,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,...,0.0,0.0,2001.0,0.0,7,0,0.0,0.0,1.0,1.0


In [19]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3553976 entries, 0 to 3553975
Data columns (total 28 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Num_Acc  int64  
 1   mois     int64  
 2   jour     int64  
 3   hrmn     int64  
 4   lum      int64  
 5   agg      int64  
 6   int      int64  
 7   atm      float64
 8   col      float64
 9   com      float64
 10  dep      int64  
 11  place    float64
 12  catu     int64  
 13  grav     int64  
 14  sexe     int64  
 15  trajet   float64
 16  secu     float64
 17  locp     float64
 18  actp     float64
 19  etatp    float64
 20  an_nais  float64
 21  senc     float64
 22  catv     int64  
 23  occutc   int64  
 24  obs      float64
 25  obsm     float64
 26  choc     float64
 27  manv     float64
dtypes: float64(15), int64(13)
memory usage: 786.3 MB


In [18]:
# Rebind the three source frames to None now that they are merged into `dataset`;
# presumably this is to let their memory be reclaimed. The names stay defined (as None).
caracteristics = None
users = None
vehicles = None

In [26]:
def convert_to_int(val):
    """Coerce ``val`` to ``int``, returning ``-1`` when that is not possible.

    Args:
        val: Any value; typically a number or numeric string from a DataFrame cell.

    Returns:
        ``int(val)`` when the conversion succeeds, otherwise the sentinel ``-1``.
        Note that a genuine ``-1`` input is indistinguishable from the sentinel.
    """
    try:
        return int(val)
    except (ValueError, TypeError, OverflowError):
        # ValueError: NaN or non-numeric text.
        # TypeError: None and other objects int() does not accept.
        # OverflowError: positive or negative infinity.
        return -1

    
def get_unique_id(unique_ids: dict, col_name: str, val):
    """Return the integer id registered for the ``(col_name, val)`` pair.

    The pair is keyed as ``'<col_name>__<int value>'`` using ``convert_to_int``.
    An unseen key is added to ``unique_ids`` (mutated in place) with the next
    free id, i.e. the current size of the dict; a known key keeps its id.
    """
    key = f'{col_name}__{convert_to_int(val)}'
    # len(unique_ids) is evaluated before any insertion, so a new key gets the
    # current size as its id and existing keys are returned untouched.
    return unique_ids.setdefault(key, len(unique_ids))
    

def transform_to_transaction(df: pd.DataFrame):
    """Encode every non-key cell of ``df`` as an integer item id.

    ``Num_Acc`` is copied unchanged. In every other column each value is
    replaced by the id that ``get_unique_id`` assigns to its
    ``(column, value)`` pair, so ids are shared across the whole frame and
    never collide between columns.

    Returns:
        A tuple ``(encoded, unique_ids)``: the encoded frame, with the same
        column order as ``df``, and the dict mapping ``'<column>__<value>'``
        keys to their ids.
    """
    unique_ids = {}

    # Creating the columns up front fixes their order to match ``df``.
    encoded = add_cols(pd.DataFrame(), df.columns, -1)
    encoded['Num_Acc'] = df['Num_Acc']

    item_cols = [column for column in df.columns if str(column) != 'Num_Acc']
    for column in item_cols:
        name = str(column)
        encoded[name] = df[column].map(lambda value: get_unique_id(unique_ids, name, value))

    return encoded, unique_ids
    
    

In [28]:
dataset_tr, unique_ids = transform_to_transaction(dataset)

In [32]:
len(unique_ids.keys())

2957

In [38]:
dataset_tr.head(3)

Unnamed: 0,Num_Acc,mois,jour,hrmn,lum,agg,int,atm,col,com,...,actp,etatp,an_nais,senc,catv,occutc,obs,obsm,choc,manv
0,201600000001,0,12,43,1482,1487,1489,1499,1509,1517,...,2607,2616,2621,2743,2747,2780,2894,2912,2920,2931
1,201600000001,0,12,43,1482,1487,1489,1499,1509,1517,...,2607,2616,2621,2743,2748,2780,2894,2912,2921,2932
2,201600000001,0,12,43,1482,1487,1489,1499,1509,1517,...,2607,2616,2622,2743,2747,2780,2894,2912,2920,2931


In [33]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

te = TransactionEncoder()

In [34]:
dataset_values = dataset_tr.values

In [55]:
# Derive the item matrix from `dataset_tr` rather than re-slicing `dataset_values`,
# so re-running this cell cannot strip one more column on each execution.
dataset_values = dataset_tr.drop(columns=['Num_Acc']).values  # remove Num_Acc

In [56]:
# Keep only the first 50000 rows, presumably to keep the mining step tractable.
# NOTE(review): this is a head slice, not a random sample; confirm it is representative.
sub_sample_dataset = dataset_values[:50000]

In [57]:
# Learn the item vocabulary from the sample, then one-hot encode each row against it.
te.fit(sub_sample_dataset)
te_ary = te.transform(sub_sample_dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [None]:
# Mine frequent itemsets with Apriori on the one-hot frame, keeping itemsets that
# occur in at least half of the sampled rows (min_support=0.5).
# NOTE(review): this cell has no execution count and no stored output, so it may
# never have run to completion; confirm before relying on `freq_items`.
freq_items = apriori(df, min_support=0.5, use_colnames=True,verbose=1, low_memory=True)
freq_items.head()

In [48]:
from mlxtend.frequent_patterns import association_rules, fpmax

In [76]:
# fpmax returns *maximal* frequent itemsets (frequent itemsets with no frequent
# superset) at a support of at least 0.6. Despite the variable name, these are not
# closed itemsets.
closed_itemset = fpmax (df, min_support=0.6, use_colnames=True)
closed_itemset.head()

Unnamed: 0,support,itemsets
0,0.65102,"(2597, 2572, 2607, 2616, 2780)"
1,0.64966,"(2563, 2597, 2607, 2616, 2780)"
2,0.64564,"(2597, 2607, 2616, 2747, 2780)"
3,0.65366,"(2914, 2597, 2894, 2607, 2616, 2780)"
4,0.66854,"(2597, 2607, 1488, 2616, 2780)"


In [60]:
# Invert the encoding (item id -> 'column__value' key) so mined itemsets can be decoded.
id_inv_map = {v: k for k, v in unique_ids.items()}

In [79]:
def decompose_key(k:str):
    """Split a ``'<column>__<value>'`` key into ``(column, int(value))``.

    The key is split on its *last* ``'__'``, so a column name that itself
    contains ``'__'`` or ends with ``'_'`` is returned intact.

    Args:
        k: Key in the ``'<column>__<value>'`` format.

    Returns:
        A ``(column_name, value)`` tuple with ``value`` converted to ``int``.

    Raises:
        ValueError: If ``k`` contains no ``'__'`` separator or the value part
            is not an integer literal.
    """
    col_name, separator, value = k.rpartition('__')
    if not separator:
        raise ValueError(f"expected a '<column>__<value>' key, got {k!r}")
    return col_name, int(value)

In [82]:
# Decode every mined itemset into [support, (column, value), ...] for reading.
# Each loop level has its own variable name, so the item id no longer shadows
# the row variable.
res = []
for row in closed_itemset.itertuples(index=False):
    decoded_items = [decompose_key(id_inv_map[item_id]) for item_id in row.itemsets]
    res.append([row.support, *decoded_items])

res

[[0.65102,
  ['locp', 0],
  ['secu', 11],
  ['actp', 0],
  ['etatp', 0],
  ['occutc', 0]],
 [0.64966, ['sexe', 1], ['locp', 0], ['actp', 0], ['etatp', 0], ['occutc', 0]],
 [0.64564, ['locp', 0], ['actp', 0], ['etatp', 0], ['catv', 7], ['occutc', 0]],
 [0.65366,
  ['obsm', 2],
  ['locp', 0],
  ['obs', 0],
  ['actp', 0],
  ['etatp', 0],
  ['occutc', 0]],
 [0.66854, ['locp', 0], ['actp', 0], ['agg', 1], ['etatp', 0], ['occutc', 0]],
 [0.65098,
  ['locp', 0],
  ['obs', 0],
  ['place', 1],
  ['actp', 0],
  ['etatp', 0],
  ['catu', 1],
  ['occutc', 0]],
 [0.64448,
  ['locp', 0],
  ['lum', 1],
  ['obs', 0],
  ['actp', 0],
  ['etatp', 0],
  ['occutc', 0]],
 [0.63162,
  ['occutc', 0],
  ['locp', 0],
  ['obs', 0],
  ['actp', 0],
  ['etatp', 0],
  ['atm', 1]],
 [0.6575,
  ['locp', 0],
  ['obs', 0],
  ['actp', 0],
  ['int', 1],
  ['etatp', 0],
  ['occutc', 0]]]