In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from joblib import dump
from sklearn.metrics import balanced_accuracy_score
import xgboost as xgb
import holidays
import h3
import numpy as np

target_field = "accident_type"


def get_holiday(row):
    date = row.date.strftime("%d-%m-%Y")
    if row.country == "France":
        return date in fr_holidays
    return date in uk_holidays


uk_holidays = holidays.UnitedKingdom()
fr_holidays = holidays.France()


def fix_coord_dot(x):
    x = str(x)
    if x.find(",") != -1:
        x = x.replace(",", ".")
    return float(x)


def fix_coord(row):
    latitude = row.latitude
    longitude = row.longitude
    while latitude > 90 or latitude < -90 or longitude > 180 or longitude < -180:
        latitude = latitude / 10
        longitude = longitude / 10
    row["latitude"] = latitude
    row["longitude"] = longitude
    return row


def get_h3_cluster(row, resolution):
    return h3.geo_to_h3(float(row.latitude), float(row.longitude), resolution)


In [2]:
# Load the train dataset
training_data = pd.read_csv("train.csv")
original_validation_data = pd.read_csv("submit.csv").set_index('trustii_id')
validation_data = original_validation_data.copy()

  exec(code_obj, self.user_global_ns, self.user_ns)
  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
columns_to_drop = [
    'accident_id',
    'year',
    'longitude',
    'latitude',
    'department',
    'address_nbr',
    'month',
    'day',
    'date'
]

In [4]:
def preprocessing(df, remove_rows=False):
    # Data Cleaning, pre-processing, feature engineering ...

    # not used
    # df["is_holiday"] = df.apply(get_holiday, axis=1)
    # df["night"] = df.light_condition.apply(lambda x: x in (3,4,5))
    # df['weekend'] = df['weekday'].apply(lambda x: x==6 or x==7)

    # In the column country, replace the value 'A' with 'France'
    df["country"].replace("A", "France", inplace=True)
    # In the column country, replace the value 'B' with 'UK'
    df["country"].replace("B", "UK", inplace=True)

    df["date"] = pd.to_datetime(dict(year=df["year"], month=df["month"], day=df["day"]))
    df["weekday"] = df["date"].dt.weekday

    df["speed_limitation"] = pd.to_numeric(df["speed_limitation"], errors="coerce")

    # In the lines where country is UK, multiply the speed_limitation by 1.6
    df["speed_limitation"][df["country"] == "UK"] = (
        df["speed_limitation"][df["country"] == "UK"] * 1.6
    )

    df["speed_limitation"].fillna(df["speed_limitation"].mean(), inplace=True)
    df["speed_limitation"] = df["speed_limitation"].astype(int)

    df["vehicule_motor_type"] = pd.to_numeric(
        df["vehicule_motor_type"], errors="coerce"
    )
    df["vehicule_motor_type"].fillna(
        df["vehicule_motor_type"].value_counts().index[0], inplace=True
    )
    df["vehicule_motor_type"] = df["vehicule_motor_type"].astype(int)

    # DELETE THIS PART COULD IMPROVE METRICS
    # ======================================
    # cleaning coordinates
    if remove_rows:
        df = df.query(
            "longitude == longitude and latitude == latitude and (latitude != 0 or longitude != 0)"
        )
    else:
        df["longitude"] = df["longitude"].replace(0, np.nan)
        df["latitude"] = df["latitude"].replace(0, np.nan)
        df = df.fillna(df.mode().iloc[0])
    extract = df[["longitude", "latitude"]]
    extract["latitude"] = extract.latitude.apply(fix_coord_dot)
    extract["longitude"] = extract.longitude.apply(fix_coord_dot)
    extract = extract.apply(fix_coord, axis=1)

    # create cluster 1-5km around accident
    extract["h3_7"] = extract.apply(get_h3_cluster, axis=1, args=(7,))
    extract["h3_8"] = extract.apply(get_h3_cluster, axis=1, args=(8,))
    df = df.merge(
        extract.drop(columns=["longitude", "latitude"]),
        left_index=True,
        right_index=True,
        how="left",
    )
    df["h3_7"] = df["h3_7"].astype("category")
    df["h3_7"] = df.h3_7.cat.codes
    df["h3_7"] = df["h3_7"].astype("int")
    df["h3_8"] = df["h3_8"].astype("category")
    df["h3_8"] = df.h3_8.cat.codes
    df["h3_8"] = df["h3_8"].astype("int")
    # ======================================    

    # One hot encoding
    df = pd.get_dummies(df, columns=["country", "person_type", "person_sex", "weekday"])

    # if exists, drop column person_sex_-1.0
    try:
        df = df.drop(["person_sex_-1.0"], axis=1)
    except:
        pass
    try:
        df = df.drop(["person_sex_9.0"], axis=1)
    except:
        pass
    try:
        df = df.drop(["person_type_4.0"], axis=1)
    except:
        pass

    df = df.drop(columns_to_drop, axis=1)

    return df


In [5]:
processed_training_df = preprocessing(training_data, remove_rows=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["speed_limitation"][df["country"] == "UK"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract["latitude"] = extract.latitude.apply(fix_coord_dot)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract["longitude"] = extract.longitude.apply(fix_coord_dot)


In [9]:
# Split into train and test
X = processed_training_df.drop(target_field, axis=1)
y = processed_training_df[target_field].values
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    stratify=y, 
                                                    test_size=0.25)

# Create xgboost model with scale_pos_weight= cause unbalanced target
xgb_model = xgb.XGBClassifier(
    random_state=125,
    scale_pos_weight=4,
)

# Fit the model
xgb_model.fit(X_train, y_train)

# Predict the test set
y_pred = xgb_model.predict(X_test)





In [10]:
# Evaluate the model
print(balanced_accuracy_score(y_test, y_pred))

0.6791701041141425


In [11]:
# submit
processed_validation_df = preprocessing(validation_data)
y_pred_valid = xgb_model.predict(processed_validation_df)
original_validation_data[target_field] = y_pred_valid # Make sure to keep target name here
original_validation_data.to_csv('submission.csv', index=True, encoding='UTF-8')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["speed_limitation"][df["country"] == "UK"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract["latitude"] = extract.latitude.apply(fix_coord_dot)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  extract["longitude"] = extract.longitude.apply(fix_coord_dot)


In [12]:
original_validation_data.accident_type.value_counts(normalize=True)

2    0.555944
1    0.444056
Name: accident_type, dtype: float64

In [22]:
original_validation_data

Unnamed: 0_level_0,accident_id,year,longitude,latitude,department,road_category,address_nbr,traffic_type,speed_limitation,intersection,pedestrian_localisation,light_condition,weather_conditions,road_condition,obstacle_hit_type,person_type,person_sex,vehicule_type,principal_maneuver_before_accident,vehicule_motor_type,day,month,country,accident_type
trustii_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
1019363,x0x000000001,2020.0,24384100,487053500,91,4.0,HENRI BARBUSSE (AVENUE),2.0,50.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,1.0,7.0,15.0,1.0,7.0,3.0,A,1
1019364,x0x000000001,2020.0,24384100,487053500,91,4.0,HENRI BARBUSSE (AVENUE),2.0,50.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,1.0,33.0,2.0,1.0,7.0,3.0,A,2
1019365,x0x000000001,2020.0,24384100,487053500,91,4.0,HENRI BARBUSSE (AVENUE),2.0,50.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,1.0,7.0,15.0,1.0,7.0,3.0,A,1
1019366,x0x000000001,2020.0,24384100,487053500,91,4.0,HENRI BARBUSSE (AVENUE),2.0,50.0,3.0,-1.0,1.0,1.0,1.0,0.0,1.0,1.0,33.0,2.0,1.0,7.0,3.0,A,2
1019367,x0x00000000x,2020.0,24100000,486900000,91,4.0,MOUSSEAUX(CHEMIN),2.0,50.0,9.0,-1.0,2.0,7.0,1.0,0.0,1.0,1.0,7.0,26.0,1.0,7.0,3.0,A,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2631697,x0x09910x7064,2020.0,-2.92632,56.473539,99,4.0,959,6.0,30.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,,,,8.0,12.0,B,2
2631698,x0x09910x9573,2020.0,-4.267565,55.802353,99,6.0,0,9.0,30.0,3.0,0.0,1.0,1.0,1.0,0.0,3.0,2.0,,,,13.0,11.0,B,2
2631699,x0x0991030x97,2020.0,-2.271903,57.186317,99,4.0,979,6.0,60.0,8.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,,,,15.0,4.0,B,2
2631700,x0x0991030900,2020.0,-3.968753,55.95094,99,6.0,0,6.0,30.0,3.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,,,,15.0,12.0,B,2


In [60]:
#Dump the mode
dump(xgb_model, "xgb_model_v2.joblib")

['xgb_model_v2.joblib']