# Imports

In [51]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

In [52]:
def load_data() -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Read the accident and vehicle CSV files from the working directory.

    ``accidents.csv`` and ``test.csv`` are stacked into one frame; a ``train``
    column marks where each row came from (1 = accidents.csv, 0 = test.csv).
    Columns are sorted alphabetically by the concat (``sort=True``).

    NOTE: the concat keeps each file's own row labels, so labels in the
    combined frame can repeat. Assign derived columns positionally (arrays),
    not as label-aligned Series/DataFrames with a fresh index.

    Returns:
        ``(combined, vehicles)`` where ``combined`` is the stacked
        train/test frame and ``vehicles`` is ``vehicles.csv`` as read.
    """
    csv_train = pd.read_csv('accidents.csv').assign(train=1)
    csv_test = pd.read_csv('test.csv').assign(train=0)
    csv_vehicles = pd.read_csv('vehicles.csv')
    csv = pd.concat([csv_train, csv_test], sort=True)
    return csv, csv_vehicles

def nans_ctr(csv) -> pd.Series:
    """Return the number of missing entries in each column of ``csv``."""
    missing_mask = csv.isna()
    per_column = missing_mask.sum()
    return per_column

def unique_ctr(csv) -> pd.Series:
    """Return the number of distinct values in each column of ``csv``.

    Missing values count as one distinct value (``dropna=False``), matching
    ``len(column.unique())``.
    """
    # DataFrame.nunique replaces the per-column loop, which relied on
    # Index.contains (removed from pandas).
    return csv.nunique(dropna=False)

def val_types(csv) -> pd.Series:
    """Map every column of ``csv`` to the Python/NumPy type of its dtype.

    Only the types listed in ``known_types`` are recognised. A column with
    any other dtype is reported on stdout and left out of the result.

    Returns:
        Object-dtype Series indexed by column name whose values are the
        type objects themselves (e.g. ``np.float64``, ``object``, ``bool``).
    """
    # Checked in this order; the first type equal to the column dtype wins.
    known_types = (np.float64, np.int64, np.int32, np.uint8, object, bool)
    found = {}
    for col in csv.columns:
        dtype = csv[col].dtype
        matched = next((kind for kind in known_types if dtype == kind), None)
        if matched is None:
            print(f"No common value type found in val_types() - {dtype}")
        else:
            found[col] = matched
    return pd.Series(found, dtype=object)

def min_max_val(csv) -> "tuple[pd.Series, pd.Series]":
    """Return per-column minimum and maximum of ``csv``.

    Object-dtype columns get ``None`` for both values. Every other column is
    reduced with ``Series.min`` / ``Series.max``.

    Returns:
        ``(min_val, max_val)``: two object-dtype Series indexed by column
        name, in the frame's column order.
    """
    lows = {}
    highs = {}
    for col in csv.columns:
        column = csv[col]
        # Test the dtype directly, so columns of any non-object dtype are
        # handled instead of failing on a missing lookup entry.
        if column.dtype == object:
            lows[col] = None
            highs[col] = None
        else:
            lows[col] = column.min()
            highs[col] = column.max()
    # dtype=object keeps ints, floats and timestamps unconverted side by side.
    return pd.Series(lows, dtype=object), pd.Series(highs, dtype=object)
            
def get_stats(csv):
    """Build a per-column summary table for ``csv``.

    The result has one row per column of ``csv`` and the columns ``nans``,
    ``unique``, ``val_type``, ``min_val`` and ``max_val``, each produced by
    the helper of the matching name.
    """
    names = ['nans', 'unique', 'val_type', 'min_val', 'max_val']
    # Helpers run in the same order as the table columns they fill.
    parts = [nans_ctr(csv), unique_ctr(csv), val_types(csv)]
    parts.extend(min_max_val(csv))
    return pd.DataFrame(dict(zip(names, parts)))
    
def bool_to_integer(csv) -> pd.DataFrame:
    """Convert every boolean column of ``csv`` to integers (True -> 1, False -> 0).

    The frame is modified in place and also returned so calls can be chained.
    """
    bool_columns = [col for col in csv.columns if csv[col].dtype == bool]
    for col in bool_columns:
        csv[col] = csv[col].astype(int)
    return csv
    
def standarize_numerical_values(csv):
    """Standardise every float64 column of ``csv`` except ``train``, in place.

    Each column becomes ``(x - mean) / std``. ``std`` is taken over the whole
    column, while ``mean`` is taken over the values strictly between the 1%
    and 99% quantiles, so extreme values do not shift the centre.

    Edge cases:
        * If the strict quantile trim leaves no values (e.g. a column with
          only two distinct values), the mean of the whole column is used;
          otherwise the column would become all-NaN.
        * If the standard deviation is zero or undefined, the column is only
          centred (divided by 1) instead of being divided by zero.

    Returns:
        The same ``csv`` object, for chaining.
    """
    for col in csv.columns:
        if col == 'train' or csv[col].dtype != np.float64:
            continue
        data = csv[col]
        std = data.std()
        core = data[(data > data.quantile(0.01)) & (data < data.quantile(0.99))]
        mean = data.mean() if core.empty else core.mean()
        if not std > 0:  # also true when std is NaN
            std = 1.0
        csv[col] = (data - mean) / std
    return csv

def check_rows(csv):
    """Print, for every row, its position and its number of missing values.

    One line per row is written to stdout as ``<position> <count>``.
    Returns ``csv`` unchanged.
    """
    # A single vectorised row-wise count replaces one iloc lookup per row.
    missing_per_row = csv.isna().sum(axis=1)
    for position, missing in enumerate(missing_per_row):
        print(position, missing)
    return csv

def distribution_in_columns(csv):
    """Print the value counts of every column of ``csv``, then return ``csv``."""
    for _, column in csv.items():
        counts = column.value_counts()
        print(counts)
    return csv
        
def plot_dist_y(csv):
    """Show a pie chart of how many rows are 'good', 'bad' and 'neutral'.

    Reads the ``Alignment`` column of ``csv``. Returns ``csv`` unchanged.
    """
    labels = ['good', 'bad', 'neutral']
    sizes = [len(csv[csv['Alignment'] == label]) for label in labels]
    plt.pie(sizes, labels=labels)
    plt.show()
    return csv
    
def factorize(csv, col_name, verbose=False) -> pd.DataFrame:
    """One-hot encode the object-dtype column ``col_name``.

    The column is replaced by one indicator column per distinct value, named
    ``"<col_name> <value>"`` and appended at the end of the frame. The input
    frame itself is not modified.

    Columns of any other dtype are NOT encoded: the frame is returned as-is
    and a notice is printed. (The earlier ``assert("...")`` could never fire,
    because a non-empty string is always truthy.) A notice is used rather
    than an exception so existing calls on numeric columns keep running.

    Args:
        csv: source DataFrame.
        col_name: name of the column to encode.
        verbose: when true, display the first rows of the indicator columns.

    Returns:
        A new DataFrame with the column encoded, or ``csv`` itself when the
        column is not of object dtype.
    """
    if csv[col_name].dtype != object:
        print(f"factorize: column '{col_name}' has dtype {csv[col_name].dtype}, "
              "not object - left unchanged")
        return csv
    dummy = pd.get_dummies(csv[col_name])
    # Format instead of concatenating so non-string category values
    # (e.g. integer codes stored in an object column) also get a name.
    dummy.columns = [f"{col_name} {level}" for level in dummy.columns]
    if verbose:
        display(dummy.head())
    return pd.concat([csv.drop(col_name, axis=1), dummy], axis=1)

def check_corelation(csv, col_1, col_2):
    """Print the correlation matrix between ``col_1`` and ``col_2``.

    ``col_1`` is converted to categorical integer codes first; ``col_2`` is
    used as it is. Rows with a missing value in the resulting pair are
    dropped before the correlation is computed.
    """
    encoded = csv[col_1].astype('category').cat.codes
    pair = pd.DataFrame({col_1: encoded, col_2: csv[col_2]})
    complete = pair.dropna()
    print(complete.corr())

In [58]:
# Load the stacked train/test accident frame and the vehicles frame.
csv, csv_vehicles = load_data()
# get_stats(csv)

In [59]:
# One-hot encode the categorical columns. Order matters: each call appends
# its indicator columns at the end of the frame.
for _column in (
    "1st_road_class",
    "2nd_road_class",
    "carriageway_hazards",
    "junction_control",
    "junction_detail",
    "pedestrian_crossing-human_control",
    "pedestrian_crossing-physical_facilities",
    "road_surface_conditions",
    "road_type",
    "special_conditions_at_site",
    "urban_or_rural_area",
):
    csv = factorize(csv, _column)

In [60]:
# Replace the light-condition labels with a 0..1 score (1 = daylight,
# 0 = darkness without lighting). The '-1' code gets the same score as
# 'lighting unknown'.
csv["light_conditions"] = (
    csv["light_conditions"]
    .replace('Daylight', 1)
    .replace('Darkness - lights lit', 0.75)
    .replace('Darkness - lighting unknown', 0.5)
    .replace('-1', 0.5)
    .replace('Darkness - lights unlit', 0.25)
    .replace('Darkness - no lighting', 0.0)
)

In [61]:
# Print the police_force / target correlation before the column is removed.
check_corelation(csv, "police_force", 'target')
csv = csv.drop(
    [
        "police_force",
        "lsoa_of_accident_location",
        "local_authority_highway",
        "local_authority_district",
        "1st_road_number",
        "2nd_road_number",
        "location_easting_osgr",
        "location_northing_osgr",
        # dropping accident_id
        "accident_id",
    ],
    axis=1,
)

              police_force    target
police_force      1.000000 -0.014641
target           -0.014641  1.000000


In [62]:
# Parse the date column once and derive calendar features from it.
parsed_dates = pd.DatetimeIndex(csv['date'])
# Year offset from 2016 (presumably 0 for 2016 and 1 for 2017 - TODO confirm
# the data covers only those two years).
csv["year 2017"] = parsed_dates.year - 2016
# factorize() only encodes object-dtype columns, so the month has to be a
# name (like week_day below). As an integer it was silently left unencoded.
csv["month"] = parsed_dates.month_name()
csv = factorize(csv, "month")
csv["week_day"] = parsed_dates.day_name()
csv = factorize(csv, "week_day")
csv = csv.drop("date", axis=1)

In [63]:
# Bucket each accident into a six-hour part of the day:
# 00-05 night, 06-11 morning, 12-17 midday, 18-23 evening.
hour_of_day = pd.DatetimeIndex(csv["time"]).hour
time = pd.Series(hour_of_day // 6).map(
    {0: "night", 1: "morning", 2: "midday", 3: "evening"}
)
# Assign by position (.values). `time` has a fresh 0..n-1 index, while csv
# keeps the row labels of the concatenated files, which can repeat; a
# label-aligned assignment would copy values onto the wrong rows.
csv["day_time"] = time.values
csv = factorize(csv, "day_time")

In [64]:
def standarize_numerical_values(csv, col):
    """Standardise column ``col`` of ``csv`` in place and return ``csv``.

    The column becomes ``(x - mean) / std``. ``std`` is taken over the whole
    column, while ``mean`` is taken over the values strictly between the 1%
    and 99% quantiles, so extreme values do not shift the centre.

    Edge cases:
        * If the strict quantile trim leaves no values (e.g. a column with
          only two distinct values), the mean of the whole column is used;
          otherwise the column would become all-NaN.
        * If the standard deviation is zero or undefined, the column is only
          centred (divided by 1) instead of being divided by zero.
    """
    data = csv[col]
    std = data.std()
    core = data[(data > data.quantile(0.01)) & (data < data.quantile(0.99))]
    mean = data.mean() if core.empty else core.mean()
    if not std > 0:  # also true when std is NaN
        std = 1.0
    csv[col] = (data - mean) / std
    return csv
    
def plot(csv, col):
    """Show a histogram of column ``col`` with a logarithmic count axis."""
    values = csv[col]
    plt.hist(values, bins='auto', alpha=0.5)
    plt.title(f"Distribution in \"{col}\" column")
    plt.yscale('log')
    plt.show()

In [65]:
# Flag rows whose (raw) latitude is below 40. Built in one expression:
# chained assignment (csv["colonies"][mask] = 1) writes through a possibly
# temporary object and does nothing under pandas copy-on-write.
csv["colonies"] = (csv["latitude"] < 40).astype(int)
csv = standarize_numerical_values(csv, "latitude")
csv = standarize_numerical_values(csv, "longitude")
# The two count columns are min-max scaled to [0, 1] rather than standardised.
num_veh = csv["number_of_vehicles"]
csv["number_of_vehicles"] = (num_veh - num_veh.min()) / (num_veh.max() - num_veh.min())
num_cas = csv["number_of_casualties"]
csv["number_of_casualties"] = (num_cas - num_cas.min()) / (num_cas.max() - num_cas.min())


In [66]:
# Recode speed_limit: 300 -> 1, any other value above 20 -> 0.5, 20 -> 0.
# Remaining values are kept as they are. np.select takes the first matching
# condition, which reproduces the earlier sequential overwrites without
# chained assignment (unreliable, and a no-op under pandas copy-on-write).
speed = csv["speed_limit"].astype(float)
csv["speed_limit"] = np.select(
    [speed == 300, speed > 20, speed == 20],
    [1.0, 0.5, 0.0],
    default=speed.values,
)
csv = standarize_numerical_values(csv, "weather_conditions")

In [69]:
# Mean of the standardised latitude. It need not be exactly 0, because the
# centring mean comes from the quantile-trimmed values, not the whole column.
csv["latitude"].mean()

-0.015297356551833117

In [None]:
# Inspect road_type. NOTE(review): this column was passed to factorize()
# earlier; the lookup only works if that call left it in place, which
# happens when the column is not of object dtype.
csv["road_type"]

In [67]:
# Show the per-column summary (missing count, distinct count, type, min, max).
get_stats(csv)

Unnamed: 0,nans,unique,val_type,min_val,max_val
date,0,731,<class 'object'>,,
latitude,0,245262,<class 'numpy.float64'>,-11.9008,4.57971
light_conditions,0,5,<class 'numpy.float64'>,0,1
longitude,0,249980,<class 'numpy.float64'>,-6.93904,2.04247
number_of_casualties,0,26,<class 'numpy.float64'>,0,1
number_of_vehicles,0,17,<class 'numpy.float64'>,0,1
road_type,0,6,<class 'numpy.int64'>,1,6
speed_limit,0,3,<class 'numpy.float64'>,0,1
target,129950,3,<class 'numpy.float64'>,0,1
time,0,1439,<class 'object'>,,
