In [1]:
import pandas as pd
import numpy as np
import random
import json

## Prétraitement des données

In [2]:
def add_cols(df: pd.DataFrame, cols: list, init_val):
    for col in cols:
        df[str(col)] = init_val
    return df


def prefix_with_col(df: pd.DataFrame) -> pd.DataFrame:
    result = add_cols(pd.DataFrame(), df.columns, '')
    result['Num_Acc'] = df['Num_Acc']
    for col in df.columns:
        if str(col) == 'Num_Acc': continue
        result[str(col)] = df[col].map(lambda x: f'{col}__{convert_to_int(x)}')
            
    return result

In [3]:
data_folder = '../data/accidents-in-france-from-2005-to-2016/'

caracteristics = pd.read_csv(data_folder + 'caracteristics.csv', encoding='latin-1')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
caracteristics = caracteristics.drop(columns=['adr', 'lat',  'long', 'gps'])

In [5]:
pd.unique(caracteristics['an'])

array([16, 15, 14, 13, 12, 11, 10,  9,  8,  7,  6,  5])

In [6]:
users = pd.read_csv(data_folder + 'users.csv', encoding='latin-1')

In [7]:
users = users.drop(columns=['num_veh'])

In [8]:
users.head(3)

Unnamed: 0,Num_Acc,place,catu,grav,sexe,trajet,secu,locp,actp,etatp,an_nais
0,201600000001,1.0,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0
1,201600000001,1.0,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0
2,201600000002,1.0,1,3,1,5.0,11.0,0.0,0.0,0.0,1960.0


In [9]:
vehicles = pd.read_csv(data_folder + 'vehicles.csv', encoding='latin-1')

In [10]:
vehicles = vehicles.drop(columns=['num_veh', 'senc', 'occutc', 'obs', 'obsm', 'manv', 'choc'])

In [11]:
vehicles.head(3)

Unnamed: 0,Num_Acc,catv
0,201600000001,7
1,201600000001,2
2,201600000002,7


pour analyser plus finement les données et pour augmenter les nombre les patterns on peut faire certaines pretraitemnt. notament transformer les annnée de naissance en tranche d'age. par exemple tranche d'age entre 18-25, 26-35, 36-50, 51-*.

In [12]:
print(caracteristics.shape)
print(users.shape)

dataset = caracteristics.merge(users, on='Num_Acc')
print(dataset.shape)
dataset = dataset.merge(vehicles, on='Num_Acc')
dataset.shape

(839985, 12)
(1876005, 11)
(1876005, 22)


(3553976, 23)

In [14]:
dataset.head(3)

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,catu,grav,sexe,trajet,secu,locp,actp,etatp,an_nais,catv
0,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,7
1,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,2
2,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,3,1,9.0,21.0,0.0,0.0,0.0,2001.0,7


In [16]:
caracteristics = None
users = None
vehicles = None

In [17]:
def convert_to_int(val):
    try:
        return int(val)
    except ValueError:
        return -1

In [18]:
age_ranges = {
    1: (0, 12),
    2: (13, 17),
    3: (18, 25),
    4: (26, 35),
    5: (36, 50),
    6: (51, 110)
}

In [19]:
def get_age_range(age_ranges, age):
    for a in age_ranges:
        if a[0] <= age <= a[1]:
            return age_ranges[a]

    raise ValueError(f'invalide age ranges {age}')

def user_age_range(dataset: pd.DataFrame, age_ranges: dict):
    
    ar = {v: k for k, v in age_ranges.items()}
    
    dataset['age_range'] = 0
    
    for i in dataset.index:
        an = convert_to_int(dataset.at[i, 'an'])
        an_nais = convert_to_int(dataset.at[i, 'an_nais'])
        if an == -1 or an_nais == -1: 
            continue
    
        age = (2000 + an) - an_nais
        dataset.at[i, "age_range"] = get_age_range(ar, age)


In [20]:
user_age_range(dataset, age_ranges)

In [21]:
dataset.head(3)

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,grav,sexe,trajet,secu,locp,actp,etatp,an_nais,catv,age_range
0,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,7,4
1,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,1,2,0.0,11.0,0.0,0.0,0.0,1983.0,2,4
2,201600000001,16,2,1,1445,1,2,1,8.0,3.0,...,3,1,9.0,21.0,0.0,0.0,0.0,2001.0,7,2


In [22]:
f = open('frequent_info.json', 'r')
data_range = json.load(f)

def key_gen(col_name, val):
    return f'{col_name}__{val}'

def get_unique_id(unique_ids: dict, col_name: str, val):

    vi = convert_to_int(val)

    if col_name in data_range:
        r = data_range[col_name]['range']
        if not (r[0] <= vi <= r[1]):
            vi = -1

    k = key_gen(col_name, vi)

    if k not in unique_ids.keys():
        unique_ids[k] = len(unique_ids.keys())

    return unique_ids[k]


def transform_to_transaction(df: pd.DataFrame):
    unique_ids = {}

    result = add_cols(pd.DataFrame(), df.columns, -1)
    result['Num_Acc'] = df['Num_Acc']

    for col in df.columns:
        if str(col) == 'Num_Acc': continue

        result[str(col)] = df[col].map(lambda x: get_unique_id(unique_ids, str(col), x))

    return result, unique_ids

In [23]:
dataset_tr, unique_ids = transform_to_transaction(dataset)

In [24]:
dataset = None

In [25]:
len(unique_ids.keys())

2752

In [26]:
dataset_tr.head(3)

Unnamed: 0,Num_Acc,an,mois,jour,hrmn,lum,agg,int,atm,col,...,grav,sexe,trajet,secu,locp,actp,etatp,an_nais,catv,age_range
0,201600000001,0,12,24,55,1494,1499,1501,1511,1521,...,2570,2574,2576,2583,2587,2596,2605,2609,2731,2745
1,201600000001,0,12,24,55,1494,1499,1501,1511,1521,...,2570,2574,2576,2583,2587,2596,2605,2609,2732,2745
2,201600000001,0,12,24,55,1494,1499,1501,1511,1521,...,2571,2575,2577,2583,2587,2596,2605,2610,2731,2746


In [27]:
dataset_values = dataset_tr.values

In [28]:
dataset_values = dataset_values[:, 1:] # remove Num_Acc

In [29]:
dataset_values[:2]

array([[   0,   12,   24,   55, 1494, 1499, 1501, 1511, 1521, 1529, 2454,
        2555, 2566, 2570, 2574, 2576, 2583, 2587, 2596, 2605, 2609, 2731,
        2745],
       [   0,   12,   24,   55, 1494, 1499, 1501, 1511, 1521, 1529, 2454,
        2555, 2566, 2570, 2574, 2576, 2583, 2587, 2596, 2605, 2609, 2732,
        2745]])

In [30]:
np.savez_compressed('dataset_values.npz', dataset_values)

In [38]:
id_inv_map = { v: k for k, v in unique_ids.items() }
with open('id_inv_map.json', 'w') as f:
    json.dump(id_inv_map, f)


original_cols = list(dataset_tr.columns)
with open('original_cols.json', 'w') as f:
    json.dump(original_cols, f)

In [38]:
df.to_csv('bin_db.csv.gz', header=True, compression='gzip', index=False)

In [71]:
def calculate_cols_proba(binary_df: pd.DataFrame):
    cols = binary_df.columns
    probabilities = np.zeros(len(cols), dtype=np.float)
    for i, col in enumerate(cols):
        probabilities[i] = np.sum(binary_df[col])
        
    probabilities /= np.sum(probabilities)
    
    return np.array(cols), probabilities

def calculate_rows_proba(binary_df: pd.DataFrame):
    lens = np.zeros(binary_df.shape[0], dtype=np.uint16)
    c = 0
    for i in binary_df.index:
        if i % 200000 == 0:
            print(i)
        lens[c] = np.sum(df.iloc[i])
        c += 1
        
    return lens

In [72]:
rows_lens = calculate_rows_proba(df)

0
200000
400000
600000
800000
1000000
1200000
1400000
1600000
1800000
2000000
2200000
2400000
2600000
2800000
3000000
3200000
3400000


In [73]:
np.savez_compressed('rows_probas.npz', rows_lens)