In [1]:
import sklearn as sk
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing

In [2]:
'''
Loading the dataset
'''

# Training split; it contains the `incidents` target that is encoded further down.
# NOTE(review): newer pandas versions treat the literal text "None" as missing by
# default — confirm the `incidents` labels survive parsing as strings.
dt = pd.read_csv('training_data.csv')
# Hold-out split: receives the same feature engineering and is scored at the end.
test = pd.read_csv('test_data.csv')

In [None]:
# Size of the training frame as (rows, columns).
print(dt.shape)

In [None]:
# Correlate numeric columns only: at this point the frame still holds text
# columns (e.g. city_name), which recent pandas versions refuse to correlate
# instead of silently skipping them.
corr_matrix = dt.select_dtypes(include='number').corr()
f, ax = plt.subplots(figsize=(12,10))
# Draw on the axes created above; the fixed [-1, 1] range keeps the colour
# scale symmetric around zero correlation.
sns.heatmap(corr_matrix, vmax=1, vmin=-1, square=True, annot=True, ax=ax)

In [None]:
# Pairwise scatter plots / distributions for a visual overview of the training frame.
sns.pairplot(dt)

In [None]:
# Column dtypes and non-null counts (printed), then the first rows (rendered as the cell output).
dt.info()
dt.head()

# Feature Engineering
Since we have a lot of categorical features, we need to do feature engineering so that the models can work with our dataset.

The first feature we are going to focus on is "affected_roads". This feature lists all the roads that were affected, but if we look closely we can see that most of the affected roads are repeated. Since knowing the name of a road doesn't give us much information, we are going to transform this feature into a numerical one by counting the number of distinct roads that were affected.

In [None]:
# Missing values per column in the test split.
test.isna().sum()

In [3]:
# Fill every missing value with 0 in both splits. `split_roads` below relies on
# this placeholder: a non-text `affected_roads` entry means "no road listed".
# `fillna` is used instead of `replace(np.NaN, ...)` because the `NaN` alias was
# removed from NumPy 2.0 (only `np.nan` remains).
dt = dt.fillna(0)
test = test.fillna(0)

In [None]:
# Test rows whose `affected_roads` equals the 0 placeholder written by the fill cell above.
test[test['affected_roads']==0]

In [None]:
# Missing values per column in the training split.
dt.isnull().sum()

In [4]:
def split_roads(x):
    """Count the distinct roads listed in a row's ``affected_roads`` field.

    Parameters
    ----------
    x : mapping-like (e.g. a DataFrame row)
        Must expose an ``'affected_roads'`` entry.

    Returns
    -------
    int or original value
        * text entry: number of distinct, non-empty comma-separated names;
        * missing entry (NaN / None / NA): 0;
        * any other non-text value (e.g. the 0 placeholder): returned unchanged.
    """
    roads = x['affected_roads']
    if isinstance(roads, str):
        # Empty fragments come from leading/trailing/double commas; the set
        # comprehension collapses repeated road names.
        return len({road for road in roads.split(",") if road})
    if pd.isna(roads):
        # Missing value that was not replaced by the 0 placeholder.
        return 0
    # Already numeric (plain or numpy integer): nothing to split.
    return roads


In [5]:
# Derive the distinct-road count row by row, identically for both splits.
for frame in (dt, test):
    frame['affected_roads_num'] = frame.apply(split_roads, axis=1)

In [None]:
# Number of distinct values in `avg_precipitation` (the column is dropped in the next cell).
dt['avg_precipitation'].nunique()

In [6]:
# Remove `avg_precipitation` from both splits so they keep the same columns.
dt.drop(columns=['avg_precipitation'], inplace=True)
test.drop(columns=['avg_precipitation'], inplace=True)

In [7]:
# The raw road list is superseded by `affected_roads_num`; drop it from both splits.
dt.drop(columns=['affected_roads'], inplace=True)
test.drop(columns=['affected_roads'], inplace=True)

The next feature we are going to focus on is the city name. Let's check how many distinct city names there are.

In [None]:
# Number of distinct city names in the training split.
dt['city_name'].nunique()

Since it only has one possible value we can drop this column because it does not add much value to our model.

In [8]:
# `city_name` is removed from both splits so train and test stay column-aligned.
dt.drop(columns=['city_name'], inplace=True)
test.drop(columns=['city_name'], inplace=True)

Let's evaluate now the magnitude of delay feature.

In [None]:
# Frequency of each `magnitude_of_delay` category in the training split.
dt['magnitude_of_delay'].value_counts()

We have 3 possible values, but the most frequent one is UNDEFINED; this can bias our model.


In [9]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: UNDEFINED, MODERATE, MAJOR map to 0.0, 1.0, 2.0.
lb = OrdinalEncoder(categories = [['UNDEFINED','MODERATE','MAJOR']])
# Fit on the training split only; the test split is transformed with the
# already-fitted encoder so both splits share exactly one mapping.
dt['magnitude_of_delay'] = lb.fit_transform(dt[['magnitude_of_delay']]).ravel()
test['magnitude_of_delay'] = lb.transform(test[['magnitude_of_delay']]).ravel()


The next feature we are going to transform is the date feature. Since it is stored as a categorical (text) column, we need to convert it to the datetime format.


In [10]:
# Parse the text timestamps (year-month-day hour:minute) into datetime64 values
# in both splits.
for frame in (dt, test):
    frame['record_date'] = pd.to_datetime(frame['record_date'], format='%Y-%m-%d %H:%M')


In [11]:
from sklearn.preprocessing import OrdinalEncoder

# Explicit category order: DARK, LOW_LIGHT, LIGHT map to 0.0, 1.0, 2.0.
lb_make=OrdinalEncoder(categories = [['DARK','LOW_LIGHT','LIGHT']])
# Fit on train only and reuse the fitted encoder for test, so the test split
# never influences the fitted state.
dt['luminosity'] = lb_make.fit_transform(dt[['luminosity']]).ravel()
test['luminosity'] = lb_make.transform(test[['luminosity']]).ravel()


In [None]:
# Frequency of each `avg_rain` category in the training split.
dt['avg_rain'].value_counts()

In [12]:
# Explicit category order, from the first listed label to the last, mapped to 0.0 .. 3.0.
avg_rain_lb = OrdinalEncoder(categories=[['Sem Chuva','chuva fraca','chuva moderada','chuva forte']])
# Fit on train only; test is transformed with the same fitted encoder.
dt['avg_rain'] = avg_rain_lb.fit_transform(dt[['avg_rain']]).ravel()
test['avg_rain'] = avg_rain_lb.transform(test[['avg_rain']]).ravel()

In [13]:
# Ordinal-encode the target labels in the listed order (None first, Very_High last).
# `target` is kept because the output section uses its inverse_transform to
# turn predictions back into text labels.
target = OrdinalEncoder(categories=[['None','Low','Medium','High','Very_High']])
dt['incidents'] = target.fit_transform(dt[['incidents']])


In [None]:
from sklearn.preprocessing import LabelEncoder
# NOTE(review): `incidents` is already ordinal-encoded by the previous cell, so this
# second encoding looks redundant — confirm it is still wanted.
lb_incidents = LabelEncoder()
dt['incidents'] = lb_incidents.fit_transform(dt['incidents'])
dt.head()

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Fit on the training split only. Re-fitting on test would overwrite
# `categories_`, so the train columns could end up with the wrong number of
# names. Categories unseen during fit are encoded as all zeros.
lb = OneHotEncoder(handle_unknown='ignore')
encoded_train = lb.fit_transform(dt[['magnitude_of_delay']])
encoded_test = lb.transform(test[['magnitude_of_delay']])

# Prefixed string labels keep every column name a string, whatever the
# category values are (text labels or the floats of an earlier ordinal encoding).
onehot_columns = [f'magnitude_of_delay_{category}' for category in lb.categories_[0]]
dt[onehot_columns] = encoded_train.toarray()
test[onehot_columns] = encoded_test.toarray()

# The original column is now fully represented by the indicator columns.
dt.drop('magnitude_of_delay',axis=1,inplace=True)
test.drop('magnitude_of_delay',axis=1,inplace=True)


In [14]:
# Expand the parsed timestamp into calendar features, identically for both splits.
for frame in (dt, test):
    stamps = frame['record_date'].dt
    frame['week_day'] = stamps.weekday
    frame['month'] = stamps.month
    frame['hour'] = stamps.hour
    frame['day'] = stamps.day


In [15]:
# The raw timestamp has been expanded into week_day / month / hour / day; drop it.
dt.drop(columns=['record_date'], inplace=True)
test.drop(columns=['record_date'], inplace=True)

In [16]:
# Feature matrix: every remaining column except the encoded target.
X_train = dt.drop(['incidents'],axis=1)
# Target kept as a one-column DataFrame; several fits below flatten it with .values.ravel().
y_train = dt['incidents'].to_frame()


## SVC


In [None]:
from sklearn.svm import SVC
# Seeded support-vector classifier with probability estimates turned off.
model = SVC(random_state=2022,probability=False)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# NOTE(review): this rebinds `model`, replacing the SVC created in the previous cell.
model = DecisionTreeClassifier(random_state=2022)
# NOTE(review): the fit is disabled, so `model` stays unfitted until something else trains it.
#model.fit(X_train,y_train)

In [None]:
# Train first: the fit in the cell above is commented out, and predicting with
# an unfitted estimator raises NotFittedError.
model.fit(X_train, y_train.values.ravel())
# Restrict to the training feature columns (same names, same order) so the
# cell still works after `test` has gained an `incidents` column.
predicts = model.predict(test[X_train.columns])
test['incidents'] = predicts
test.head()

In [None]:
# Grid for the RBF kernel: C and gamma each sweep powers of ten.
params_grid = {'C': [0.1, 1, 10, 100, 1000],'gamma': [1, 0.1, 0.01, 0.001, 0.0001], 'kernel' : ['rbf']}

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.svm import SVC

# Build the SVC here instead of reusing the global `model`: by this point
# `model` has been rebound to a DecisionTreeClassifier, which rejects the
# C / gamma / kernel parameters in `params_grid`.
svm_model = GridSearchCV(
    SVC(random_state=2022, probability=False),
    params_grid,
    cv=5,
    n_jobs=-1,
    refit=True,
    verbose=3,
)


In [None]:
# Run the grid search; the one-column target frame is flattened to 1-D first.
svm_model.fit(X_train,y_train.values.ravel())

In [None]:
# Best mean cross-validated score found by the SVC grid search.
svm_model.best_score_

## Random Forest

In [None]:
# Random-forest search space: only n_estimators, max_depth, min_samples_split and
# min_samples_leaf vary; the remaining entries are single-value lists.
param_grid = { 
    'n_estimators': [1600,1610],
    'max_features': ['sqrt'],
    'max_depth' : range(15,18),
    'criterion' :['entropy'],
    "min_samples_split": [2,3],
    "min_samples_leaf": [1,2],
    "class_weight": ["balanced_subsample"]
}

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier

# Seeded base estimator; the grid search clones it for every parameter combination.
random_model = RandomForestClassifier(random_state=4000)

# 10-fold cross-validated accuracy over `param_grid`, using all available cores.
randomForest = GridSearchCV(random_model, param_grid = param_grid, n_jobs = -1, cv = 10,verbose=3, scoring='accuracy')
randomForest.fit(X_train,y_train.values.ravel())

In [None]:
# A cell renders only its final expression, so return both values as a tuple;
# as two separate statements the score was silently discarded.
randomForest.best_score_, randomForest.best_params_


In [17]:
from sklearn.ensemble import RandomForestClassifier

# Final random forest with its hyper-parameters fixed explicitly.
bestModel = RandomForestClassifier(
    n_estimators=1610,
    criterion='entropy',
    max_depth=15,
    min_samples_split=2,
    min_samples_leaf=1,
    max_features='sqrt',
    class_weight='balanced_subsample',
    random_state=4000,
)
# The target is flattened from a one-column frame to a 1-D array before fitting.
bestModel.fit(X_train, y_train.values.ravel())

In [18]:
# Predict on the training feature columns only (same names, same order). This
# keeps the cell re-runnable: once `test` holds an `incidents` column, passing
# the whole frame would hand the model an extra, unknown feature.
predicts = bestModel.predict(test[X_train.columns])
test['incidents'] = predicts

In [None]:
from sklearn.model_selection import GridSearchCV, ParameterGrid
from sklearn.tree import DecisionTreeClassifier

# Decision-tree search space.
param_dict={
    "criterion":['gini','entropy'],
    "max_depth":range(1,20),
    # An integer min_samples_split must be at least 2; a value of 1 makes every
    # fit that uses it fail.
    "min_samples_split":range(2,20),
    "min_samples_leaf":range(1,20)
}
# Build the estimator explicitly rather than relying on whatever the global
# `model` currently refers to (it is rebound several times in this notebook).
gs = GridSearchCV(
    DecisionTreeClassifier(random_state=2022),
    param_grid=param_dict,
    n_jobs=-1,
    cv=10,
    scoring='accuracy',
    verbose=1,
)
# Flatten the one-column target frame to 1-D, as the other fits do.
gs.fit(X_train, y_train.values.ravel())


In [None]:
# A cell renders only its final expression, so return both values as a tuple;
# as two separate statements the parameters were silently discarded.
gs.best_params_, gs.best_estimator_

In [None]:
# Decision tree with hand-picked hyper-parameters, trained on the full training set.
best_model = DecisionTreeClassifier(
    criterion='entropy',
    max_depth=13,
    min_samples_split=8,
    min_samples_leaf=3,
    random_state=2022,
)
best_model.fit(X_train, y_train)

In [None]:
# Predict on the training feature columns only (same names, same order), so an
# `incidents` column already present in `test` is not passed in as a feature.
predicts = best_model.predict(test[X_train.columns])
test['incidents'] = predicts

In [None]:
# Column dtypes and non-null counts of the engineered training frame.
dt.info()

# MLP

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense,Dropout,BatchNormalization
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV,KFold
from sklearn.preprocessing import MinMaxScaler




In [None]:
# List the compute devices TensorFlow can see.
# NOTE(review): `tensorflow.python.*` is a private namespace — confirm it is available
# in the installed version.
from tensorflow.python.client import device_lib 
print(device_lib.list_local_devices())

In [None]:
# Number of physical GPUs visible to TensorFlow.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))


In [None]:
import tensorflow as tf

# TensorFlow 2 way of enabling on-demand GPU memory allocation. The TF1-style
# `tf.ConfigProto` / `tf.Session` objects do not exist in the TF2 API that the
# rest of this notebook uses (`tf.config.list_physical_devices`, `tf.optimizers`).
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as error:
    # Memory growth can only be configured before the GPUs are initialised
    # (for example by an earlier device-listing call); report instead of aborting.
    print(error)


In [None]:
# Learn the [0, 1] scaling from the training features only.
scaler_X = MinMaxScaler(feature_range=(0,1)).fit(X_train)
X_scaled = pd.DataFrame(scaler_X.transform(X_train[X_train.columns]),columns=X_train.columns)

# Scale the test split with the SAME fitted scaler: a scaler fitted on test
# would map identical raw values to different scaled values than the model saw
# in training. Only the training feature columns are used, so an `incidents`
# column already added to `test` is ignored.
# `scaler_test` stays available as an alias for the later inverse_transform call.
scaler_test = scaler_X
test_scaled = pd.DataFrame(scaler_X.transform(test[X_train.columns]),columns=X_train.columns)


In [None]:
from sklearn.model_selection import train_test_split

# Keep the split under its own names. Overwriting `X_train` / `y_train` left the
# later `fit(X_scaled, y_train, ...)` calls pairing the full feature matrix with
# a 75 % target and made this cell shrink the data further on every re-run.
X_fit, X_val, y_fit, y_val = train_test_split(X_scaled, y_train, test_size=0.25, random_state=2022)

In [None]:
def build_model(activation = 'relu', learning_rate = 0.01, input_dim = 12, n_classes = 5):
    """Build and compile a small fully connected softmax classifier.

    Parameters
    ----------
    activation : str
        Activation used by the two hidden layers.
    learning_rate : float
        Learning rate handed to the SGD optimizer.
    input_dim : int
        Number of input features; must equal the number of columns of the
        matrix passed to ``fit``. Defaults to the previously hard-coded 12.
    n_classes : int
        Width of the softmax output layer. Defaults to the previously
        hard-coded 5.

    Returns
    -------
    A compiled ``Sequential`` model with layers 12 -> 20 -> ``n_classes``.
    """
    model = Sequential([
        Dense(12, input_dim=input_dim, activation=activation),
        Dense(20, activation=activation),
        Dense(n_classes, activation='softmax'),
    ])

    # The sparse loss takes integer class ids rather than one-hot vectors.
    model.compile(
        loss=tf.keras.losses.SparseCategoricalCrossentropy(),
        optimizer=tf.optimizers.SGD(learning_rate),
        metrics=['accuracy', 'sparse_categorical_accuracy'],
    )
    return model

In [None]:
# Search space forwarded to build_model: one activation, four learning rates.
tuning_dict = {
    'activation' : ['relu'],
    'learning_rate' : [1,0.1,0.01,0.001]
}

In [None]:
# Seeded, shuffled 10-fold split for the grid search.
kf = KFold(n_splits=10, shuffle=True, random_state=2022)
# NOTE(review): this rebinds `model` again (it previously held sklearn estimators).
model = KerasClassifier(build_fn=build_model, epochs = 200, batch_size=64)
# NOTE(review): n_jobs=-1 requests parallel fits — confirm this works with the Keras
# wrapper (and any GPU) in this environment.
grid_search = GridSearchCV(estimator = model,
                           param_grid= tuning_dict,
                           cv = kf,
                           scoring='accuracy',
                           refit = True,
                           verbose = 1,n_jobs=-1)
# `validation_split` is an extra fit parameter; presumably it is forwarded to the
# underlying Keras fit — confirm. X_scaled and y_train must have the same number of rows.
grid_search.fit(X_scaled, y_train,validation_split=0.25)

In [None]:
# Best mean cross-validated accuracy and the parameter combination that produced it.
print(grid_search.best_score_)
print(grid_search.best_params_)

In [None]:
# Estimator refitted with the best parameters (the search was created with refit=True).
# NOTE(review): `best_model` is also the name used for the decision tree earlier in the notebook.
best_model = grid_search.best_estimator_

In [None]:
# Train the selected estimator on the full scaled training set for 20 epochs.
best_model.fit(X_scaled,y_train,epochs=20)

In [None]:
from livelossplot import PlotLossesKerasTF

# Train for 100 epochs, holding out 40 % of the rows for validation and plotting
# the loss curves live through the callback.
# NOTE(review): presumably each fit call on the wrapper rebuilds the network rather
# than continuing from the previous cell — confirm.
best_model.fit(X_scaled,y_train,validation_split=0.40,epochs=100,callbacks=[PlotLossesKerasTF()],verbose=1)


In [None]:
# Class predictions of the selected network for the scaled test features.
predictions = best_model.predict(test_scaled)


In [None]:
# Raw prediction array, for a quick visual check.
print(predictions)

In [None]:
# Undo the min-max scaling and keep the feature names: without `columns=` the
# restored frame gets anonymous integer column labels (0, 1, 2, ...).
y_test_unscaled = pd.DataFrame(scaler_test.inverse_transform(test_scaled), columns=test_scaled.columns)

In [None]:
# Attach the network's predictions to the unscaled test features.
y_test_unscaled['incidents'] = predictions

In [None]:
# Plain-text dump of the unscaled test frame, including the predictions.
print(y_test_unscaled)

In [None]:
# From here on `test` refers to the unscaled frame carrying the MLP predictions,
# which is what the output section below reads.
test = y_test_unscaled

# Building the output file


In [19]:
# Map the numeric class predictions back to their text labels with the `target`
# encoder that was fitted on the training labels.
# NOTE(review): this overwrites the numeric column in place, so the cell does not
# look re-runnable without regenerating the predictions — confirm.
test['incidents'] =  target.inverse_transform(test[['incidents']])

In [None]:
# Column dtypes and non-null counts of the frame about to be exported.
test.info()

In [None]:
# First rows of the frame about to be exported.
test.head()

In [20]:
# Build the submission in its own frame instead of mutating `test`. The
# previous version shifted the index and materialised it as an `index` column
# in place, so a second run shifted the ids again and then failed on the
# duplicate `index` column.
submission = pd.DataFrame({
    'RowId': test.index + 1,                  # 1-based row ids
    'Incidents': test['incidents'].to_numpy(),
})
# `to_csv` returns None when it is given a path, so the result is not assigned.
submission.to_csv(r'finalt.csv', index=False)

In [None]:
# Last rows of `test`, as a final visual check after the export.
test.tail()