# Imports

In [68]:
import numpy as np
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
# import warnings
# warnings.filterwarnings("ignore")

In [69]:
def load_data() -> "tuple[pd.DataFrame, pd.DataFrame]":
    """Read the accident, test and vehicle CSV files from the working directory.

    ``accidents.csv`` and ``test.csv`` are stacked into one frame.  A ``train``
    column is added to mark where each row came from: 1 for rows read from
    ``accidents.csv`` and 0 for rows read from ``test.csv``.  ``vehicles.csv``
    is read on its own and returned as loaded.

    Returns:
        A ``(csv, csv_vehicles)`` pair of DataFrames.
    """
    csv_train = pd.read_csv('accidents.csv').assign(train=1)
    csv_test = pd.read_csv('test.csv').assign(train=0)
    csv_vehicles = pd.read_csv('vehicles.csv')
    # No ignore_index here, so each file keeps its own row labels and the
    # combined index can contain repeated values.
    csv = pd.concat([csv_train, csv_test], sort=True)
    return csv, csv_vehicles

def nans_ctr(csv) -> pd.Series:
    """Count the missing entries in every column of *csv*.

    Returns:
        A Series indexed by column name whose values are the number of
        missing (NaN/None) cells in that column.
    """
    missing_mask = csv.isna()
    return missing_mask.sum()

def unique_ctr(csv) -> pd.Series:
    """Count the distinct values in every column of *csv*.

    A missing value counts as one distinct value, which matches
    ``len(column.unique())``.

    Returns:
        A Series indexed by column name holding the number of distinct
        values in that column.
    """
    # dropna=False keeps NaN in the count, exactly like len(Series.unique()).
    return csv.nunique(dropna=False)

def val_types(csv) -> pd.Series:
    """Map every column of *csv* to the Python/NumPy type matching its dtype.

    Only float64, int64, int32, uint8, object and bool are recognised.  A
    column with any other dtype is reported on stdout and left out of the
    result.

    Returns:
        An object-dtype Series indexed by column name whose values are the
        matching type objects (e.g. ``np.float64``, ``object``).
    """
    # Checked in this order; the first type equal to the column dtype wins.
    known_types = (np.float64, np.int64, np.int32, np.uint8, object, bool)
    found = {}
    for col in csv.columns:
        dtype = csv[col].dtype
        match = next((t for t in known_types if dtype == t), None)
        if match is None:
            print(f"No common value type found in val_types() - {dtype}")
        else:
            found[col] = match
    # dtype=object keeps the type objects as-is and makes the empty case
    # well defined.
    return pd.Series(found, dtype=object)

def min_max_val(csv) -> "tuple[pd.Series, pd.Series]":
    """Compute the smallest and largest value of every non-object column.

    Object-dtype columns get ``None`` for both bounds.  The dtype is read
    from the column itself, so a column with a less common dtype (for
    example datetimes) still gets its bounds instead of failing a lookup.

    Returns:
        A ``(min_val, max_val)`` pair of object-dtype Series, both indexed
        by column name.
    """
    lows, highs = {}, {}
    for col in csv.columns:
        column = csv[col]
        if column.dtype != object:
            lows[col], highs[col] = column.min(), column.max()
        else:
            lows[col] = highs[col] = None
    # Object dtype lets numbers and None share one Series without coercion.
    return pd.Series(lows, dtype=object), pd.Series(highs, dtype=object)
            
def get_stats(csv):
    """Assemble a per-column summary table for *csv*.

    The result has one row per column of *csv* and the columns ``nans``,
    ``unique``, ``val_type``, ``min_val`` and ``max_val``, filled from
    nans_ctr, unique_ctr, val_types and min_max_val respectively.
    """
    summary = {
        'nans': nans_ctr(csv),
        'unique': unique_ctr(csv),
        'val_type': val_types(csv),
    }
    summary['min_val'], summary['max_val'] = min_max_val(csv)
    return pd.DataFrame(summary)
    
def bool_to_integer(csv) -> pd.DataFrame:
    """Convert every bool column of *csv* to integers, in place.

    Columns of any other dtype are left untouched.

    Returns:
        The same DataFrame object that was passed in.
    """
    # Pick the columns first, then rewrite them.
    bool_columns = [col for col in csv.columns if csv[col].dtype == bool]
    for col in bool_columns:
        csv[col] = csv[col].astype(int)
    return csv
    
def standarize_numerical_values(csv):
    """Centre and scale every float64 column of *csv*, in place.

    The ``train`` column and all non-float64 columns are skipped.  For each
    remaining column the scale is the standard deviation of the whole
    column, while the centre is the mean of only those values lying strictly
    between the column's 1st and 99th percentiles.

    Returns:
        The same DataFrame object that was passed in.
    """
    for name in csv.columns:
        if name == 'train' or csv[name].dtype != np.float64:
            continue
        values = csv[name]
        spread = values.std()
        lower, upper = values.quantile(0.01), values.quantile(0.99)
        # Only the centre is estimated from the trimmed values; the spread
        # above uses every value.
        inner = values[(values < upper) & (values > lower)]
        centre = inner.mean()
        csv[name] = (csv[name] - centre) / spread
    return csv

def check_rows(csv):
    """Print, for every row of *csv*, its position and its number of missing cells.

    One line is printed per row, formatted as ``<position> <count>``.

    Returns:
        The DataFrame that was passed in, unchanged.
    """
    # Count the NaNs of all rows in one vectorised pass rather than slicing
    # the frame once per row.
    nan_counts = csv.isna().values.sum(axis=1)
    for position, count in enumerate(nan_counts):
        print(position, count)
    return csv

def distribution_in_columns(csv):
    """Print the value frequencies of every column of *csv*, one column after another.

    Returns:
        The DataFrame that was passed in, unchanged.
    """
    for name in csv.columns:
        frequencies = csv[name].value_counts()
        print(frequencies)
    return csv
        
def plot_dist_y(csv):
    """Show a pie chart of how many rows fall into each ``Alignment`` category.

    Only the values 'good', 'bad' and 'neutral' of the ``Alignment`` column
    are counted; *csv* must contain that column.

    Returns:
        The DataFrame that was passed in, unchanged.
    """
    categories = ['good', 'bad', 'neutral']
    sizes = [len(csv[csv['Alignment'] == category]) for category in categories]
    plt.pie(sizes, labels=categories)
    plt.show()
    return csv
    
def factorize(col) -> pd.DataFrame:
    """One-hot encode an object-dtype column.

    Args:
        col: The Series to encode; its ``name`` is used as the prefix of the
            generated column names.

    Returns:
        A DataFrame with one indicator column per distinct value of *col*,
        named ``"<col.name> <value>"``.

    Raises:
        TypeError: If *col* does not have object dtype.
    """
    if col.dtype != object:
        raise TypeError("factorize non object column")
    # Only the Series is available here, so the indicator columns are
    # returned on their own; joining them back onto a frame is the caller's job.
    return pd.get_dummies(col, prefix=col.name, prefix_sep=" ")

In [70]:
# Load the combined accidents/test frame and the vehicles table, then build
# the per-column summary (NaN count, unique count, dtype, min, max).
csv, csv_vehicles = load_data()
stats = get_stats(csv)

In [19]:
# Display the per-column summary table.
stats

Unnamed: 0,nans,unique,val_type,min_val,max_val
1st_road_class,0,6,<class 'object'>,,
1st_road_number,0,4579,<class 'numpy.int64'>,0.0,9918.0
2nd_road_class,0,7,<class 'object'>,,
2nd_road_number,0,4197,<class 'numpy.float64'>,-1.0,9999.0
accident_id,0,266525,<class 'numpy.int64'>,300000.0,599998.0
carriageway_hazards,0,7,<class 'object'>,,
date,0,731,<class 'object'>,,
junction_control,0,6,<class 'object'>,,
junction_detail,0,10,<class 'object'>,,
latitude,0,245262,<class 'numpy.float64'>,31.48,60.4902


In [50]:
# List the distinct values found in the 1st_road_class column.
csv["1st_road_class"].unique()

array(['A', 'C', 'B', 'Unclassified', 'Motorway', 'A(M)'], dtype=object)

In [78]:
def factorize(csv, col_name) -> pd.DataFrame:
    """Replace an object-dtype column of *csv* with one-hot indicator columns.

    The first rows of the generated indicator columns are displayed as a
    preview.  *csv* itself is not modified; a new frame is returned.

    Args:
        csv: The DataFrame holding the column.
        col_name: Name of the column to encode; also used as the prefix of
            the generated column names.

    Returns:
        A new DataFrame without ``col_name`` and with one indicator column
        per distinct value appended, named ``"<col_name> <value>"``.

    Raises:
        TypeError: If the column does not have object dtype.
    """
    # A real exception is needed here: asserting a non-empty string always
    # passes, so it can never reject a column.
    if csv[col_name].dtype != object:
        raise TypeError("factorize non object column")
    dummy = pd.get_dummies(csv[col_name], prefix=col_name, prefix_sep=" ")
    display(dummy.head())
    return pd.concat([csv.drop(col_name, axis=1), dummy], axis=1)

In [79]:
# Try the one-hot encoding on 1st_road_class. The returned frame is bound to
# `_` and discarded, so `csv` itself is left as it was.
_ = factorize(csv, "1st_road_class")

# csv["1st_road_class"].name

Unnamed: 0,1st_road_class A,1st_road_class A(M),1st_road_class B,1st_road_class C,1st_road_class Motorway,1st_road_class Unclassified
0,1,0,0,0,0,0
1,1,0,0,0,0,0
2,1,0,0,0,0,0
3,1,0,0,0,0,0
4,1,0,0,0,0,0
