In [8]:
import pandas as pd

In [9]:
def add_cols(df: pd.DataFrame, cols: list, init_val):
    for col in cols:
        df[str(col)] = init_val
        
    return df


def prefix_with_col(df: pd.DataFrame) -> pd.DataFrame:
    result = add_cols(pd.DataFrame(), df.columns, '')
    result['Num_Acc'] = df['Num_Acc']
    for col in df.columns:
        if str(col) == 'Num_Acc': continue
        result[str(col)] = df[col].map(lambda x: f'{col}__{convert_to_int(x)}')
            
    return result


In [10]:
data_folder = '../data/accidents-in-france-from-2005-to-2016/'

caracteristics = pd.read_csv(data_folder + 'caracteristics.csv', encoding='latin-1', low_memory=False)

In [11]:
caracteristics = caracteristics.drop(columns=['adr', 'an', 'lat',  'long', 'gps'])

In [12]:
caracteristics.head(3)

Unnamed: 0,Num_Acc,mois,jour,hrmn,lum,agg,int,atm,col,com,dep
0,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,590
1,201600000002,3,16,1800,1,2,6,1.0,6.0,5.0,590
2,201600000003,7,13,1900,1,1,1,1.0,6.0,11.0,590


In [13]:
users = pd.read_csv(data_folder + 'users.csv', encoding='latin-1', low_memory=False)

In [14]:
users = users.drop(columns=['num_veh'])

In [15]:
users.head(3)

Unnamed: 0,Num_Acc,place,catu,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,201600000001,1.0,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0
1,201600000001,1.0,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0
2,201600000002,1.0,1,3,1,5.0,11.0,0.0,0.0,0.0,1960.0


In [16]:
vehicles = pd.read_csv(data_folder + 'vehicles.csv', encoding='latin-1', low_memory=False)

In [17]:
vehicles = vehicles.drop(columns=['num_veh'])

In [18]:
vehicles.head(3)

Unnamed: 0,Num_Acc,senc,catv,occutc,obs,obsm,choc,manv
0,201600000001,0.0,7,0,0.0,0.0,1.0,1.0
1,201600000001,0.0,2,0,0.0,0.0,7.0,15.0
2,201600000002,0.0,7,0,6.0,0.0,1.0,1.0


In [19]:
dataset = caracteristics.merge(users, on='Num_Acc')
dataset = dataset.merge(vehicles, on='Num_Acc')

In [20]:
dataset.head(3)

Unnamed: 0,Num_Acc,mois,jour,hrmn,lum,agg,int,atm,col,com,...,actp,etatp,an_nais,senc,catv,occutc,obs,obsm,choc,manv
0,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,...,0.0,0.0,1983.0,0.0,7,0,0.0,0.0,1.0,1.0
1,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,...,0.0,0.0,1983.0,0.0,2,0,0.0,0.0,7.0,15.0
2,201600000001,2,1,1445,1,2,1,8.0,3.0,5.0,...,0.0,0.0,2001.0,0.0,7,0,0.0,0.0,1.0,1.0


In [21]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3553976 entries, 0 to 3553975
Data columns (total 28 columns):
 #   Column   Dtype  
---  ------   -----  
 0   Num_Acc  int64  
 1   mois     int64  
 2   jour     int64  
 3   hrmn     int64  
 4   lum      int64  
 5   agg      int64  
 6   int      int64  
 7   atm      float64
 8   col      float64
 9   com      float64
 10  dep      int64  
 11  place    float64
 12  catu     int64  
 13  grav     int64  
 14  sexe     int64  
 15  trajet   float64
 16  secu     float64
 17  locp     float64
 18  actp     float64
 19  etatp    float64
 20  an_nais  float64
 21  senc     float64
 22  catv     int64  
 23  occutc   int64  
 24  obs      float64
 25  obsm     float64
 26  choc     float64
 27  manv     float64
dtypes: float64(15), int64(13)
memory usage: 786.3 MB


In [22]:
caracteristics = None
users = None
vehicles = None

In [23]:
def convert_to_int(val):
    try:
        return int(val)
    except ValueError:
        return -1

    
def get_unique_id(unique_ids: dict, col_name: str, val):
    k = f'{col_name}__{convert_to_int(val)}'
    
    if k not in unique_ids.keys():
            unique_ids[k] = len(unique_ids.keys())
            
    return unique_ids[k]
    

def transform_to_transaction(df: pd.DataFrame):
    unique_ids = {}
    
    result = add_cols(pd.DataFrame(), df.columns, -1)
    result['Num_Acc'] = df['Num_Acc']
    
    for col in df.columns:
        if str(col) == 'Num_Acc': continue
            
        result[str(col)] = df[col].map(lambda x: get_unique_id(unique_ids, str(col), x))
            
    return result, unique_ids
    
    

In [24]:
dataset_tr, unique_ids = transform_to_transaction(dataset)

In [25]:
len(unique_ids.keys())

2957

In [26]:
dataset_tr.head(3)

Unnamed: 0,Num_Acc,mois,jour,hrmn,lum,agg,int,atm,col,com,...,actp,etatp,an_nais,senc,catv,occutc,obs,obsm,choc,manv
0,201600000001,0,12,43,1482,1487,1489,1499,1509,1517,...,2607,2616,2621,2743,2747,2780,2894,2912,2920,2931
1,201600000001,0,12,43,1482,1487,1489,1499,1509,1517,...,2607,2616,2621,2743,2748,2780,2894,2912,2921,2932
2,201600000001,0,12,43,1482,1487,1489,1499,1509,1517,...,2607,2616,2622,2743,2747,2780,2894,2912,2920,2931


In [27]:
from mlxtend.preprocessing import TransactionEncoder
from mlxtend.frequent_patterns import apriori

te = TransactionEncoder()

In [28]:
dataset_values = dataset_tr.values

In [29]:
dataset_values = dataset_values[:, 1:] # remove Num_Acc

In [30]:
sub_sample_dataset = dataset_values[:50000]

In [31]:
te_ary = te.fit(sub_sample_dataset).transform(sub_sample_dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

In [32]:
freq_items = apriori(df, min_support=0.5, use_colnames=True,verbose=1, low_memory=True)
freq_items.head()

Processing 18 combinations | Sampling itemset size 875


Unnamed: 0,support,itemsets
0,0.73918,(1482)
1,0.68026,(1488)
2,0.797,(1489)
3,0.75308,(1500)
4,0.73888,(2543)


In [33]:
from mlxtend.frequent_patterns import association_rules, fpmax

In [34]:
closed_itemset = fpmax (df, min_support=0.6, use_colnames=True)
closed_itemset.head()

Unnamed: 0,support,itemsets
0,0.65102,"(2597, 2572, 2607, 2616, 2780)"
1,0.64966,"(2563, 2597, 2607, 2616, 2780)"
2,0.64564,"(2597, 2607, 2616, 2747, 2780)"
3,0.65366,"(2914, 2597, 2894, 2607, 2616, 2780)"
4,0.66854,"(2597, 2607, 1488, 2616, 2780)"


In [35]:
id_inv_map = {v: k for k, v in unique_ids.items()}

In [36]:
def decompose_key(k:str):
    return k.split('__')[0], int(k.split('__')[1])

In [37]:
res = []
for i, itemset in closed_itemset.iterrows():
    l = [itemset['support']]
    for i in itemset['itemsets']:
        l.append(decompose_key(id_inv_map[i]))
    
    res.append(l)
    
res

[[0.65102,
  ('locp', 0),
  ('secu', 11),
  ('actp', 0),
  ('etatp', 0),
  ('occutc', 0)],
 [0.64966, ('sexe', 1), ('locp', 0), ('actp', 0), ('etatp', 0), ('occutc', 0)],
 [0.64564, ('locp', 0), ('actp', 0), ('etatp', 0), ('catv', 7), ('occutc', 0)],
 [0.65366,
  ('obsm', 2),
  ('locp', 0),
  ('obs', 0),
  ('actp', 0),
  ('etatp', 0),
  ('occutc', 0)],
 [0.66854, ('locp', 0), ('actp', 0), ('agg', 1), ('etatp', 0), ('occutc', 0)],
 [0.65098,
  ('locp', 0),
  ('obs', 0),
  ('place', 1),
  ('actp', 0),
  ('etatp', 0),
  ('catu', 1),
  ('occutc', 0)],
 [0.64448,
  ('locp', 0),
  ('lum', 1),
  ('obs', 0),
  ('actp', 0),
  ('etatp', 0),
  ('occutc', 0)],
 [0.63162,
  ('locp', 0),
  ('obs', 0),
  ('actp', 0),
  ('etatp', 0),
  ('atm', 1),
  ('occutc', 0)],
 [0.6575,
  ('locp', 0),
  ('obs', 0),
  ('actp', 0),
  ('int', 1),
  ('etatp', 0),
  ('occutc', 0)]]

In [59]:
def decrypt(transactionsList):
    cmp = 0
    f = open("frequent_sets.txt", "w")
    
    for itemsetList in transactionsList:
        f.write("-----------------ITEM SET----------------------\n")
        for item in itemsetList:
            
            print(item)
            if cmp != 0:
                colonne = item[0]
                valeur = item[1]
                if colonne == "lum":
                    if valeur == 1:
                        f.write(colonne+" : Full day")
                    elif valeur == 2:
                        f.write(colonne+" : Twilight or dawn")
                    elif valeur == 3:
                        f.write(colonne+" : Night without public lighting")
                    elif valeur == 4:
                        f.write(colonne+" :  Night with public lighting not lit")
                    elif valeur == 5:
                        f.write(colonne+" : Night with public lighting on")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "agg":
                    if valeur == 1:
                        f.write(colonne+" : Out of agglomeration")
                    elif valeur == 2:
                        f.write(colonne+" : In built-up areas")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "int":
                    if valeur == 1:
                        f.write(colonne+" : Out of intersection")
                    elif valeur == 2:
                        f.write(colonne+" : Intersection in X")
                    elif valeur == 3:
                        f.write(colonne+" : Intersection in T")
                    elif valeur == 4:
                        f.write(colonne+" : Intersection in Y")
                    elif valeur == 5:
                        f.write(colonne+" : Intersection with more than 4 branches")
                    elif valeur == 6:
                        f.write(colonne+" : Giratory")
                    elif valeur == 7:
                        f.write(colonne+" : Place")
                    elif valeur == 8:
                        f.write(colonne+" : Level crossing")
                    elif valeur == 9:
                        f.write(colonne+" : Other intersection")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "atm":
                    if valeur == 1:
                        f.write(colonne+" : Normal")
                    elif valeur == 2:
                        f.write(colonne+" : Light rain")
                    elif valeur == 3:
                        f.write(colonne+" : Heavy rain")
                    elif valeur == 4:
                        f.write(colonne+" : Snow - hail")
                    elif valeur == 5:
                        f.write(colonne+" : Fog - smoke")
                    elif valeur == 6:
                        f.write(colonne+" : Strong wind - storm")
                    elif valeur == 7:
                        f.write(colonne+" : Dazzling weather")
                    elif valeur == 8:
                        f.write(colonne+" : Cloudy weather")
                    elif valeur == 9:
                        f.write(colonne+" : Other")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "col":
                    if valeur == 1:
                        f.write(colonne+" : Two vehicles - frontal")
                    elif valeur == 2:
                        f.write(colonne+" : Two vehicles - from the rear")
                    elif valeur == 3:
                        f.write(colonne+" : Two vehicles - by the side")
                    elif valeur == 4:
                        f.write(colonne+" : Three vehicles and more - in chain")
                    elif valeur == 5:
                        f.write(colonne+" : Three or more vehicles - multiple collisions")
                    elif valeur == 6:
                        f.write(colonne+" : Other collision")
                    elif valeur == 7:
                        f.write(colonne+" : Without collision")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "catr":
                    if valeur == 1:
                        f.write(colonne+" : Highway")
                    elif valeur == 2:
                        f.write(colonne+" : National Road")
                    elif valeur == 3:
                        f.write(colonne+" : Departmental Road")
                    elif valeur == 4:
                        f.write(colonne+" : Communal Way")
                    elif valeur == 5:
                        f.write(colonne+" : Off public network")
                    elif valeur == 6:
                        f.write(colonne+" : Parking lot open to public traffic")
                    elif valeur == 9:
                        f.write(colonne+" : other")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")

                elif colonne == "circ":
                    if valeur == 1:
                        f.write(colonne+" : One way")
                    elif valeur == 2:
                        f.write(colonne+" : Bidirectional")
                    elif valeur == 3:
                        f.write(colonne+" : Separated carriageways")
                    elif valeur == 4:
                        f.write(colonne+" : With variable assignment channels")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "vosp":
                    if valeur == 1:
                        f.write(colonne+" : Bike path")
                    elif valeur == 2:
                        f.write(colonne+" : Cycle Bank")
                    elif valeur == 3:
                        f.write(colonne+" : Reserved channel")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "Prof":
                    if valeur == 1:
                        f.write(colonne+" : Dish")
                    elif valeur == 2:
                        f.write(colonne+" : Slope")
                    elif valeur == 3:
                        f.write(colonne+" : Hilltop")
                    elif valeur == 4:
                        f.write(colonne+" : Hill bottom")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")

                elif colonne == "plan":
                    if valeur == 1:
                        f.write(colonne+" : Straight part")
                    elif valeur == 2:
                        f.write(colonne+" : Curved on the left")
                    elif valeur == 3:
                        f.write(colonne+" : Curved right")
                    elif valeur == 4:
                        f.write(colonne+" : In \"S")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")      

                elif colonne == "surf":
                    if valeur == 1:
                        f.write(colonne+" : normal")
                    elif valeur == 2:
                        f.write(colonne+" : wet")
                    elif valeur == 3:
                        f.write(colonne+" : puddles")
                    elif valeur == 4:
                        f.write(colonne+" : flooded")
                    elif valeur == 5:
                        f.write(colonne+" : snow")
                    elif valeur == 6:
                        f.write(colonne+" : mud")
                    elif valeur == 7:
                        f.write(colonne+" : icy")
                    elif valeur == 8:
                        f.write(colonne+" : fat - oil")
                    elif valeur == 9:
                        f.write(colonne+" : other")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")                
                elif colonne == "infra":
                    if valeur == 1:
                        f.write(colonne+" : Underground - tunnel")
                    elif valeur == 2:
                        f.write(colonne+" : Bridge - autopont")
                    elif valeur == 3:
                        f.write(colonne+" : Exchanger or connection brace")
                    elif valeur == 4:
                        f.write(colonne+" : Railway")
                    elif valeur == 5:
                        f.write(colonne+" : Carrefour arranged")
                    elif valeur == 6:
                        f.write(colonne+" : Pedestrian area")
                    elif valeur == 7:
                        f.write(colonne+" : Toll zone")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")   
                elif colonne == "situ":
                    if valeur == 1:
                        f.write(colonne+" : On the road")
                    elif valeur == 2:
                        f.write(colonne+" : On emergency stop band")
                    elif valeur == 3:
                        f.write(colonne+" : On the verge")
                    elif valeur == 4:
                        f.write(colonne+" : On the sidewalk")
                    elif valeur == 5:
                        f.write(colonne+" : On bike path")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")    
                elif colonne == "catu":
                    if valeur == 1:
                        f.write(colonne+" : Driver")
                    elif valeur == 2:
                        f.write(colonne+" : Passenger")
                    elif valeur == 3:
                        f.write(colonne+" : Pedestrian")
                    elif valeur == 4:
                        f.write(colonne+" : Pedestrian in rollerblade or scooter")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")                 

                elif colonne == "grav":
                    if valeur == 1:
                        f.write(colonne+" : Unscathed")
                    elif valeur == 2:
                        f.write(colonne+" : Killed")
                    elif valeur == 3:
                        f.write(colonne+" : Hospitalized wounded")
                    elif valeur == 4:
                        f.write(colonne+" : Light injury")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "sex":
                    if valeur == 1:
                        f.write(colonne+" : Male")
                    elif valeur == 2:
                        f.write(colonne+" : Female")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "trip":
                    if valeur == 1:
                        f.write(colonne+" : Home - work")
                    elif valeur == 2:
                        f.write(colonne+" : Home - school")
                    elif valeur == 3:
                        f.write(colonne+" : Shopping - Shopping")
                    elif valeur == 4:
                        f.write(colonne+" : Professional use")
                    elif valeur == 5:
                        f.write(colonne+" : Promenade - leisure")
                    elif valeur == 9:
                        f.write(colonne+" : other")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "secu":
                    val = ""
                    if int(str(valeur)[:1]) == 1:
                        val += "Belt"
                    elif int(str(valeur)[:1]) == 2:
                        val +="Helmet"
                    elif int(str(valeur)[:1]) == 3:
                        val +="Children's device"
                    elif int(str(valeur)[:1]) == 4:
                        val +="Reflective equipment"
                    elif int(str(valeur)[:1]) == 9:
                        val += "other"
                    elif int(str(valeur)[:2]) == 1:
                        val +=", Yes"
                    elif int(str(valeur)[:2]) == 2:
                        val +=", No"
                    elif int(str(valeur)[:2]) == 3:
                        val +=", Not determinable"
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                    f.write(colonne+" : "+ val)
                elif colonne == "locp":
                    print(valeur)
                    if valeur == 1:
                        f.write(colonne+" : On pavement: A + 50 m from the pedestrian crossing")
                    elif valeur == 2:
                        f.write(colonne+" : On pavement: A - 50 m from the pedestrian crossing")
                    elif valeur == 3:
                        f.write(colonne+" : On pedestrian crossing Without light signaling")
                    elif valeur == 4:
                        f.write(colonne+" : On pedestrian crossing With light signaling")
                    elif valeur == 5:
                        f.write(colonne+" : On the sidewalk")
                    elif valeur == 6:
                        f.write(colonne+" : On the verge")
                    elif valeur == 7:
                        f.write(colonne+" : On refuge or BAU")
                    elif valeur == 8:
                        f.write(colonne+" : On against aisle")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "actp":
                    if valeur == 0:
                        f.write(colonne+" : Moving, not specified or not applicable")
                    elif valeur == 1:
                        f.write(colonne+" : Moving, Meaning bumping vehicle")
                    elif valeur == 2:
                        f.write(colonne+" : Moving, Opposite direction of the vehicle")
                    elif valeur == 3:
                        f.write(colonne+" : Various, Crossing")
                    elif valeur == 4:
                        f.write(colonne+" : Various, Masked")
                    elif valeur == 5:
                        f.write(colonne+" : Various, Playing - running")
                    elif valeur == 6:
                        f.write(colonne+" : Various, With animal")
                    elif valeur == 9:
                        f.write(colonne+" : Various, Other")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "etatp":
                    if valeur == 1:
                        f.write(colonne+" : Only")
                    elif valeur == 2:
                        f.write(colonne+" : Accompanied")
                    elif valeur == 3:
                        f.write(colonne+" : In a group")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "GP":
                    if valeur == 1:
                        f.write(colonne+" : PK or PR or increasing postal address number")
                    elif valeur == 2:
                        f.write(colonne+" : PK or PR or descending postal address number")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                elif colonne == "CATV":
                    if valeur == 1:
                        f.write(colonne+" : Bicycle")
                    elif valeur == 2:
                        f.write(colonne+" : Moped <50cm3")
                    elif valeur == 3:
                        f.write(colonne+" : Cart (Quadricycle with bodied motor) (formerly \"cart or motor tricycle\")")
                    elif valeur == 4:
                        f.write(colonne+" : Not used since 2006 (registered scooter)")
                    elif valeur == 5:
                        f.write(colonne+" : Not used since 2006 (motorcycle)")
                    elif valeur == 6:
                        f.write(colonne+" : Not used since 2006 (side-car)")
                    elif valeur == 7:
                        f.write(colonne+" : VL only")
                    elif valeur == 8:
                        f.write(colonne+" : Not used category (VL + caravan)")
                    elif valeur == 9:
                        f.write(colonne+" : Not used category (VL + trailer)")
                    elif valeur == 10:
                        f.write(colonne+" : VU only 1,5T <= GVW <= 3,5T with or without trailer (formerly VU only 1,5T <= GVW <= 3,5T)")
                    elif valeur == 11:
                        f.write(colonne+" : Most used since 2006 (VU (10) + caravan)")
                    elif valeur == 12:
                        f.write(colonne+" : Most used since 2006 (VU (10) + trailer)")
                    elif valeur == 13:
                        f.write(colonne+" : PL only 3,5T")
                    else: 
                        f.write(colonne+" : VALEUR INCONNUE")
                else:
                    f.write(colonne+" : "+str(valeur))
                f.write("\n")
            cmp +=1
        cmp = 0

In [60]:
decrypt(res)

0.65102
('locp', 0)
0
('secu', 11)
('actp', 0)
('etatp', 0)
('occutc', 0)
0.64966
('sexe', 1)
('locp', 0)
0
('actp', 0)
('etatp', 0)
('occutc', 0)
0.64564
('locp', 0)
0
('actp', 0)
('etatp', 0)
('catv', 7)
('occutc', 0)
0.65366
('obsm', 2)
('locp', 0)
0
('obs', 0)
('actp', 0)
('etatp', 0)
('occutc', 0)
0.66854
('locp', 0)
0
('actp', 0)
('agg', 1)
('etatp', 0)
('occutc', 0)
0.65098
('locp', 0)
0
('obs', 0)
('place', 1)
('actp', 0)
('etatp', 0)
('catu', 1)
('occutc', 0)
0.64448
('locp', 0)
0
('lum', 1)
('obs', 0)
('actp', 0)
('etatp', 0)
('occutc', 0)
0.63162
('locp', 0)
0
('obs', 0)
('actp', 0)
('etatp', 0)
('atm', 1)
('occutc', 0)
0.6575
('locp', 0)
0
('obs', 0)
('actp', 0)
('int', 1)
('etatp', 0)
('occutc', 0)
