# Data Cleaning

## Selection of Columns

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statistics 
import random

In [None]:
# importing the data
# Two CSVs are loaded: the parties table and the id-merge table. They are
# inner-joined on case_id in the next cell.
df = pd.read_csv('../raw_data/parties-001.csv')
merge_id = pd.read_csv('../raw_data/id_merge.csv')

In [None]:
# Restrict the parties to the LA area: the inner join on case_id keeps only
# the rows whose case_id is present in both tables.
df_par = pd.merge(merge_id, df, how='inner', on='case_id')

In [None]:
# Columns of the merged parties table that are kept for cleaning.
list_of_features = [
    'case_id',
    'party_number',
    'party_type',
    'party_sex',
    'party_age',
    'vehicle_make',
    'vehicle_year',
    'at_fault',
    'party_sobriety',
    'party_drug_physical',
    'cellphone_use',
    'party_number_killed',
    'party_number_injured',
    'movement_preceding_collision',
    'statewide_vehicle_type',
    'chp_vehicle_type_towing',
]

# Candidate columns that are not part of the selection above.
note_sure = [
    'financial_responsibility',
    'hazardous_materials',
]

In [None]:
# Keep only the selected columns. The explicit .copy() makes df_col an
# independent frame, so the in-place edits and column assignments in the
# following cells do not trigger pandas' SettingWithCopyWarning about
# modifying a selection taken from df_par.
df_col = df_par[list_of_features].copy()
df_col.shape

## Selection of Features

In [None]:
# Preview the first rows of the selected columns before any cleaning.
df_col.head()

In [None]:
# Replace every missing value with the sentinel string 'unknown'; the cells
# below test for that sentinel. fillna returns a new frame, and rebinding it
# avoids mutating a column selection in place, which pandas warns about
# (SettingWithCopyWarning).
df_col = df_col.fillna(value='unknown')

In [None]:
# Replace unknown ages with the mean age.
# Gather every age that is not the 'unknown' sentinel ...
ages = [age for age in df_col.party_age if age != 'unknown']

# ... and round their mean to the nearest integer.
mean_age = round(statistics.mean(ages))

def assign_age(x):
    """Return the module-level ``mean_age`` for 'unknown', else ``x`` as is."""
    return mean_age if x == 'unknown' else x
    
# Fill in the unknown ages, then transform every age to int.
df_col.party_age = df_col.party_age.apply(assign_age).apply(int)

In [None]:
# pick a random sex for unknown 
def assign_sex(x):
    """Return ``x`` unchanged unless it is 'unknown'.

    For 'unknown', pick 'male' or 'female' at random with ``random.choice``.
    """
    if x != 'unknown':
        return x
    return random.choice(['male', 'female'])

# Replace each 'unknown' sex with a randomly drawn one.
df_col.party_sex = df_col.party_sex.apply(assign_sex)

In [None]:
# Replace unknown vehicle_year values with the most frequent vehicle_year.
# Series.mode() returns a Series holding every tied mode in sorted order, not
# a scalar, so take its first element to obtain one year that can be cast
# with int() afterwards.
most_frequent_year = df.vehicle_year.mode().iloc[0]

def assign_year(x):
    """Return the module-level ``most_frequent_year`` for 'unknown', else ``x``."""
    return most_frequent_year if x == 'unknown' else x
    
# Fill in the unknown vehicle years.
df_col.vehicle_year = df_col.vehicle_year.apply(assign_year)

# Transform every vehicle year to int.
df_col.vehicle_year = df_col.vehicle_year.apply(int)

In [None]:
# Inspect the distinct cellphone_use values before they are encoded.
df_col.cellphone_use.unique()

In [None]:
# binary encoding of cellphone_use
def cellphone_use(x):
    """Binary-encode a cellphone_use code.

    Codes '3' and 'C' map to 0; codes '1', 'D', '2' and 'B' map to 1.
    The 'unknown' sentinel is returned unchanged; any other value raises
    KeyError.
    """
    if x == 'unknown':
        return x
    encoding = dict.fromkeys(('3', 'C'), 0)
    encoding.update(dict.fromkeys(('1', 'D', '2', 'B'), 1))
    return encoding[x]

# Encode the column; 'unknown' entries are left as they are.
df_col.cellphone_use = df_col.cellphone_use.apply(cellphone_use)

In [None]:
# Preview the first rows after the cellphone_use encoding.
df_col.head()

In [None]:
# Inspect the distinct movement_preceding_collision values.
df_col.movement_preceding_collision.unique()

In [None]:
# binary encoding of drug & alcohol
def sobriety(x):
    """Map a sobriety / drug code to a 0/1 flag.

    Codes 'A', 'G' and 'H' map to 0; codes 'B', 'C', 'D', 'E', 'F' and 'I'
    map to 1. Any other value (including 'unknown') is returned unchanged.
    """
    code_to_flag = {
        'A': 0, 'G': 0, 'H': 0,
        'B': 1, 'C': 1, 'D': 1, 'E': 1, 'F': 1, 'I': 1,
    }
    return code_to_flag.get(x, x)

# Encode both the alcohol and the drug column with the same mapping.
df_col.party_sobriety = df_col.party_sobriety.apply(sobriety)
df_col.party_drug_physical = df_col.party_drug_physical.apply(sobriety)

In [None]:
# Inspect the distinct party_drug_physical values after the encoding above.
df_col.party_drug_physical.unique()

In [None]:
# create the new column of sobriety 
def col_sobriety(x):
    """Return 10 for the 'unknown' sentinel, else ``x`` unchanged."""
    return 10 if x == 'unknown' else x

# Start the new 'sobriety' column from the drug flag, with 'unknown' -> 10.
df_col['sobriety'] = df_col.party_drug_physical.apply(col_sobriety)

In [None]:
# Same replacement ('unknown' -> 10) for the alcohol flag.
df_col.party_sobriety = df_col.party_sobriety.apply(col_sobriety)

In [None]:
# Add the alcohol flag to the drug flag, element by element.
df_col['sobriety'] = df_col['sobriety'] + df_col.party_sobriety

In [None]:
# Inspect the distinct summed values. If every code was mapped to 0/1 and
# every 'unknown' to 10, the sums should be among 0, 1, 2, 10, 11 and 20
# -- TODO confirm against the data.
df_col['sobriety'].unique()

def encoding_sobriety(x):
    """Collapse a summed sobriety value into 0, 1 or 'unknown'.

    2 and 11 become 1; 10 and 20 become 'unknown'. Any other value is
    returned unchanged.
    """
    collapsed = {2: 1, 11: 1, 10: 'unknown', 20: 'unknown'}
    return collapsed.get(x, x)

# Collapse the summed values, then remove the two source columns that the
# combined 'sobriety' column replaces.
df_col.sobriety = df_col.sobriety.apply(encoding_sobriety)
df_col.drop(
    columns=['party_drug_physical', 'party_sobriety'],
    inplace=True,
)

In [None]:
# Inspect the distinct statewide_vehicle_type values.
df_col.statewide_vehicle_type.unique()

In [None]:
# export the brand new dataframe of parties
# NOTE(review): this path is the same file that is read as raw input at the
# top of the notebook, so the export overwrites the raw data and a re-run
# would start from the already-cleaned table. to_csv also writes the index
# as an extra unnamed column by default. Confirm both are intended.
df_col.to_csv('../raw_data/parties-001.csv')