In [27]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import PoissonRegressor
import pickle

In [28]:
def prepprocess_data(data):
    columns = data.columns
    #keep only 1-5 first columns
    data.drop(columns[5:len(columns)+1], axis=1,inplace=True)
    #format month so that the column has only from 1-12
    data.loc[:,('MONAT')] = data.loc[:,('MONAT')].apply(lambda x: x[-2:])
    #remove row with Month feature cintaining 'summe'
    data = data.loc[data.loc[:,('MONAT')] != 'me']
    data.loc[:,('MONAT')] = data.loc[:,('MONAT')].apply(lambda x: int(x))
    #simplifying year feature
    data.loc[:,('JAHR')] = data.loc[:,('JAHR')].apply(lambda x: x-2000)
    #drop rows in year 2021
    data = data.loc[data.loc[:,('JAHR')] != 21]
    return data

In [29]:
path ='traffic_accidents.csv'
data = pd.read_csv(path)

In [30]:
#visualize original data
data.head()

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT,VORJAHRESWERT,VERAEND_VORMONAT_PROZENT,VERAEND_VORJAHRESMONAT_PROZENT,ZWOELF_MONATE_MITTELWERT
0,Alkoholunfälle,insgesamt,2021,202101,,28.0,,,
1,Alkoholunfälle,insgesamt,2021,202102,,40.0,,,
2,Alkoholunfälle,insgesamt,2021,202103,,27.0,,,
3,Alkoholunfälle,insgesamt,2021,202104,,26.0,,,
4,Alkoholunfälle,insgesamt,2021,202105,,40.0,,,


In [31]:
prep_data = prepprocess_data(data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)


In [32]:
#visualize preprocess data
prep_data.head()

Unnamed: 0,MONATSZAHL,AUSPRAEGUNG,JAHR,MONAT,WERT
13,Alkoholunfälle,insgesamt,20,1,28.0
14,Alkoholunfälle,insgesamt,20,2,40.0
15,Alkoholunfälle,insgesamt,20,3,27.0
16,Alkoholunfälle,insgesamt,20,4,26.0
17,Alkoholunfälle,insgesamt,20,5,40.0


In [33]:
#check for empty values
prep_data.isnull().values.any()

False

In [34]:
#getting target column
y = prep_data.iloc[:,4].to_numpy()

In [35]:
#encode categorical values;Category and Type
ohe = preprocessing.OneHotEncoder()
data_cat = prep_data.iloc[:,0:2]
ohe.fit(data_cat)
X_cat = ohe.transform(data_cat).toarray()
#convert month and year to array
X_my = prep_data.iloc[:,2:4].to_numpy()

In [36]:
#combine all features together
X = np.concatenate([X_cat,X_my],axis = 1)

In [37]:
#split train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [38]:
#fit the X_train with y_train with Poisson Regressor
model = PoissonRegressor()
model.fit(X_train,y_train)

PoissonRegressor()

In [39]:
#checking train score
model.score(X_train, y_train)

0.9726878296670647

In [40]:
#checking test score
model.score(X_test, y_test)

0.978565323103127

In [41]:
np.floor(model.predict(X_test))

array([ 822.,  861.,  453.,   99., 3349., 3423., 3807.,  828.,  445.,
        474.,   73.,  467.,  771.,    8.,  790.,  411.,    9.,    9.,
        471.,   65.,  105.,    8.,   77., 3855.,  390.,  418.,  409.,
        767.,  102.,  440.,  391.,   66., 3283., 3680.,  493.,   70.,
         70.,  393.,  457.,  834.,  872.,    9., 3590.,  785.,  413.,
       3444.,  450.,  433.,  496.,  414.,  860.,  472.,  396.,    8.,
         69.,  104.,  389.,  503.,  450., 3663.,  869.,    9.,  839.,
       3308.,  766.,    9.,  843.,  437., 3675.,    9.,   70.,    9.,
        840., 3370.,    9.,    9.,  441.,    9.,   66.,    9., 3375.,
       3412.,  480.,  109.,  105.,   10.,   71.,  107.,  768.,   68.,
        501.,  439.,  452.,  114., 3401.,   66.,   65., 3423.,  485.,
        420.,  450., 3720.,  809.,  502.,   68., 3428.,  495.,  789.,
        464., 3298.,  412.,   66.,    8., 3417.,  825.,    9.,  103.,
        427.,  499.,  443.,  501.,  450.,    9.,  417.,    9.,  468.,
        452.,  481.,

In [42]:
#save the model
filename = 'model.pkl'
pickle.dump(model, open(filename, 'wb'))

In [43]:
#save encoder
pickle.dump(ohe, open('encoder.pickle', 'wb'))