In [28]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import geopandas as gpd

In [29]:
df = pd.read_csv('OdinData/odin-2018-2019-v1.0.0.csv')

In [4]:
df.set_index('verplid', inplace = True)
df.drop(['wopc', 'wogem', 'sted', 'gemgr', 'prov', 'opid'], axis = 1, inplace = True)

In [5]:
# PCs = gpd.read_file('../PublicGeoJsons/AmsPCs.json')[['Postcode4', 'geometry']]
# merged = df.merge(PCs, left_on = 'vertpc', right_on = 'Postcode4')
# df = gpd.GeoDataFrame(
#     merged,
#     geometry= merged.geometry,
#     crs='EPSG:4326'
# )

# merged2 = df[['aankpc']].merge(PCs, left_on = 'aankpc', right_on = 'Postcode4')
# aank = gpd.GeoDataFrame(
#     merged2,
#     geometry= merged2.geometry,
#     crs='EPSG:4326'
# )

# df['start_lon'] = aank.geometry.centroid.x
# df['start_lat'] = aank.geometry.centroid.y
# df['end_lon'] = df.geometry.centroid.x
# df['end_lat'] = df.geometry.centroid.y
#
# df.drop(['aankpc', 'vertpc', 'geometry', 'Postcode4'], axis =1, inplace = True)

In [31]:
df.drop('aanktijd', axis =1, inplace = True)
inf = pd.to_datetime(df['verttijd'],format= '%H:%M' )
df['seconds'] = inf.dt.second + 60*(inf.dt.hour)

seconds_in_day = 24*60*60
# df['sin_day'] = np.sin(2*np.pi*df.weekdag/7)
# df['cos_day'] = np.cos(2*np.pi*df.weekdag/7)
df['sin_time'] = np.sin(2*np.pi*df.seconds/seconds_in_day)
df['cos_time'] = np.cos(2*np.pi*df.seconds/seconds_in_day)

# df.drop(['weekdag', 'seconds', 'verttijd'], axis = 1, inplace = True)
df.drop([ 'seconds', 'verttijd'], axis = 1, inplace = True)

In [32]:
categorical_columns = pd.read_json('OdinData/odin-col-dict.json').columns
non_categorical_columns = [x for x in df.columns if x not in categorical_columns]
non_categorical_columns

['opid',
 'wopc',
 'verplid',
 'choice_dur',
 'choice_dist',
 'bike_dur',
 'bike_dist',
 'car_dur',
 'car_dist',
 'pt_dur',
 'pt_dist',
 'walk_dur',
 'walk_dist',
 'sin_time',
 'cos_time']

In [33]:
df.isna().sum()
# df.dropna(inplace = True)
# df.fillna(0)

opid                 0
hhpers               0
hhsam                0
hhlft1               0
hhlft2               0
hhlft3               0
hhlft4               0
wopc                 0
wogem                0
sted                 0
gemgr                0
prov                 0
geslacht             0
leeftijd             0
herkomst             0
betwerk              0
onbbez               0
opleiding            0
hhgestinkg           0
oprijbewijsau        0
hhauto               0
brandstofpa1         0
brandstofepa1        0
brandstofpa2         0
brandstofepa2        0
hhefiets             0
ovstkaart            0
weekdag              0
feestdag             0
verplid              0
doel                 0
kmotiefv             0
vertpc               0
aankpc               0
khvm                 0
choice_dur           0
choice_dist          0
bike_dur          5654
bike_dist         5654
car_dur          12917
car_dist         12917
pt_dur           30014
pt_dist          30014
walk_dur   

In [34]:
df[df.pt_dist.isna()][['vertpc', 'aankpc', 'walk_dist', 'bike_dist', 'car_dist', 'pt_dist']]
# df['A'].fillna(df['B'], inplace=True)

Unnamed: 0,vertpc,aankpc,walk_dist,bike_dist,car_dist,pt_dist
0,1191,1106,7375.85,7413.49,10380.46,
1,1441,1121,7600.31,7631.54,10385.84,
2,1969,1935,9922.41,10600.68,12790.36,
3,1935,1969,9922.41,10614.76,12522.10,
4,1975,1974,4890.14,4939.30,5739.89,
...,...,...,...,...,...,...
198226,1188,1185,3949.00,3951.00,,
198232,1951,1561,,13296.00,15185.00,
198252,8043,7938,,52820.00,58293.00,
198253,7083,7091,6489.00,6489.00,7036.00,


In [35]:
df['walk_dist'] = df['walk_dist'].fillna(df['bike_dist']) #High Confidence
df['walk_dur'] = df['walk_dur'].fillna(df['bike_dur']) #High Confidence


In [36]:
df['pt_dist'] = df['pt_dist'].fillna(df['bike_dist']) #This is QUESTIONABLE
df['car_dist'] = df['car_dist'].fillna(df['bike_dist']) #Medium ocnfidence
df['car_dur'] = df['car_dur'].fillna(df['bike_dur']) #Medium ocnfidence
df['pt_dur'] = df['pt_dur'].fillna(df['bike_dur']) #This is QUESTIONABLE

df = df.dropna()


In [37]:
cat_cols = list(set(categorical_columns).intersection(set(df.columns)))

In [38]:
df[cat_cols] = df[cat_cols].astype('object')

In [39]:
df.dtypes

opid               int64
hhpers            object
hhsam             object
hhlft1            object
hhlft2            object
hhlft3            object
hhlft4            object
wopc               int64
wogem             object
sted              object
gemgr             object
prov              object
geslacht          object
leeftijd          object
herkomst          object
betwerk           object
onbbez            object
opleiding         object
hhgestinkg        object
oprijbewijsau     object
hhauto            object
brandstofpa1      object
brandstofepa1     object
brandstofpa2      object
brandstofepa2     object
hhefiets          object
ovstkaart         object
weekdag           object
feestdag          object
verplid            int64
doel              object
kmotiefv          object
vertpc            object
aankpc            object
khvm              object
choice_dur         int64
choice_dist        int64
bike_dur         float64
bike_dist        float64
car_dur          float64


In [40]:
import json

with open('OdinData/odin-col-dict.json', "r") as json_file:
    choice_dict = json.load(json_file)
df.khvm = df.khvm.astype(str)
df = df.replace({"khvm": choice_dict['khvm']})
df.weekdag = df.weekdag.astype(str)
df = df.replace({"weekdag": choice_dict['weekdag']})

In [41]:
df['khvm']

0                    Bus/tram/metro
1                             Fiets
2         Personenauto - bestuurder
3         Personenauto - bestuurder
4         Personenauto - bestuurder
                    ...            
198267                       Overig
198268                        Fiets
198269                        Fiets
198270    Personenauto - bestuurder
198271    Personenauto - bestuurder
Name: khvm, Length: 192618, dtype: object

In [44]:
pcdata = pd.read_pickle('../PostcodeInfo/PCData')
pcdata.index.names = ['pc4']
pcdata = pcdata.reset_index()
#
df.aankpc = df.aankpc.astype(str)
df.vertpc = df.vertpc.astype(str)
# df = df.merge(pcdata.add_prefix('aank '), left_on='aankpc', right_on='aank pc4')
# df = df.merge(pcdata.add_prefix('vert '), left_on='vertpc', right_on='vert pc4')  #
# df = df.drop(['aank pc4', 'vert pc4'], axis=1)

In [50]:
df = df.merge(pcdata.add_prefix('aank '), left_on='aankpc', right_on='aank pc4')
df = df.merge(pcdata.add_prefix('vert '), left_on='vertpc', right_on='vert pc4')
df = df.drop(['aank pc4', 'vert pc4'], axis=1)
df.head()

Unnamed: 0,opid,hhpers,hhsam,hhlft1,hhlft2,hhlft3,hhlft4,wopc,wogem,sted,...,vert 2005 tot 2015,vert 2015 en later,vert Meergezins,vert Koopwoning,vert Huurwoning,vert Huurcoporatie,vert Niet bewoond,vert WOZ-waarde\nwoning,"vert Personen met WW, Bijstand en/of AO uitkering\nBeneden AOW-leeftijd",vert density
0,56026580953,3,6,0,0,2,1,1191,437,3,...,0.043935,0.030398,0.087046,0.001355,0.000323,0.118997,0.006961,0.000317,0.03926,0.078487
1,55902592361,4,3,1,1,0,2,1191,437,3,...,0.043935,0.030398,0.087046,0.001355,0.000323,0.118997,0.006961,0.000317,0.03926,0.078487
2,56175225767,5,3,0,1,2,2,1191,437,3,...,0.043935,0.030398,0.087046,0.001355,0.000323,0.118997,0.006961,0.000317,0.03926,0.078487
3,55995615859,3,3,0,0,0,3,1191,437,3,...,0.043935,0.030398,0.087046,0.001355,0.000323,0.118997,0.006961,0.000317,0.03926,0.078487
4,59095677859,4,3,2,0,0,2,1191,437,3,...,0.043935,0.030398,0.087046,0.001355,0.000323,0.118997,0.006961,0.000317,0.03926,0.078487


In [51]:
df.to_pickle('Odin2019All')

In [65]:
df = pd.read_pickle('Odin2019All')

In [66]:
#2000-2037: Haarlem
DH = list(range(2490, 2599+ 1))
Rdam = list(range(3011,3089+ 1))
Adam = [str(x) for x in list(range(1011, 1109 + 1)) + list(range(1381, 1384 +1))]
Eind = list(range(5611,5658+1)) + list(range(5660,5667+1))#Geldrop
Tilb = list(range(5011,5049+1))
Breda = list(range(4800-4839+1))
HaaBloZan = list(range(2000, 2061+1))
Ensc = list(range(7511, 7548+1))
Urban = DH + Adam + Rdam + Eind + Tilb + Breda + HaaBloZan+ Ensc
Urban = [str(PC) for PC in Urban]

In [67]:
df = df[(df['aankpc'].isin(Urban)) | (df['vertpc'].isin(Urban))]
df.head()
df.to_pickle('Odin2019UrbanOr')
len(df)

49894

In [68]:
df = df[(df['aankpc'].isin(Urban)) & (df['vertpc'].isin(Urban))]
df.head()
df.to_pickle('Odin2019UrbanAnd')
len(df)

26229

In [69]:
df = df[(df['aankpc'].isin(Adam)) | (df['vertpc'].isin(Adam))]
df.to_pickle('Odin2019Ams')
df.head()
len(df)

7926

In [26]:
# Encode categorical features as numerical labels
le = LabelEncoder()
for col in cat_cols:
    df[col] = le.fit_transform(df[col])

In [27]:
# Split the data into train and test sets
X = df.drop('khvm', axis=1)  # Drop the target variable from the features
y = df['khvm']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define the XGBoost classifier
clf = xgb.XGBClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Test the classifier
y_pred = clf.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

ValueError: With n_samples=0, test_size=0.2 and train_size=None, the resulting train set will be empty. Adjust any of the aforementioned parameters.