In [12]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve, auc
from sklearn.model_selection import GridSearchCV
from datetime import datetime

#We have previosly split the data into 2 files, named train.csv and test.csv, to avoid data leakage.
    
df = pd.read_csv('train.csv',  encoding = 'L1')
df.set_index ('ticket_id', inplace=True)

#print(df.columns)

#First, we'll discard all NaN in the Compliance column, and drop the columns that have almost no info other than NaN
    
df = df[df['compliance'].notnull()]    
df=df[[ 'fine_amount', 'admin_fee', 'state_fee', 'late_fee', 'discount_amount', 'compliance']]# 'ticket_issued_date', 'hearing_date', were removed because they didn't add much  


#df['violation_street_name'] = df['violation_street_name'].str.lower()
#df['city'] = df['city'].str.lower()
#df['state'] = df['state'].str.lower()
#df['violation_code'] = df['violation_code'].str.lower()
#df['ticket_issued_date'] = pd.to_datetime(df['ticket_issued_date']).timestamp()
#df['hearing_date'] =  df['hearing_date'].apply(lambda x: x.timestamp())
    
#now we clear the NaN values    
X=df.iloc[:,:-1]
X=X.dropna()

y=df['compliance']
    
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 0)


#Given the features of the data I'll try to use a random forest classifier, and we'll set the AUC of the ROC to measure it's performance.

Rforest_clf = RandomForestClassifier(random_state = 0).fit(X_train, y_train)
y_score = Rforest_clf.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('roc_auc:  ',roc_auc)

roc_auc:   0.5509464105313632


In [2]:
Rforest_clf = RandomForestClassifier(random_state = 0)
grid_values={'n_estimators': [40, 50, 60, 70, 80, 90, 100, 500, 1000]}    
grid_clf_auc = GridSearchCV(Rforest_clf, param_grid = grid_values , n_jobs=-1, scoring = 'roc_auc')
grid_clf_auc.fit(X_train, y_train)
y_decision_scores_auc = grid_clf_auc.predict(X_test)    
    
print('Test set AUC: ', roc_auc_score(y_test, y_decision_scores_auc))
print('Grid best parameter (max. AUC): ', grid_clf_auc.best_params_)
print('Grid best score (AUC): ', grid_clf_auc.best_score_)


#Results#    
#Test set AUC:  0.5504547085031839
#Grid best parameter (max. AUC):  {'n_estimators': 1000}
#Grid best score (AUC):  0.7582558031398563

Test set AUC:  0.5504547085031839
Grid best parameter (max. AUC):  {'n_estimators': 1000}
Grid best score (AUC):  0.7582558031398563


In [3]:
Rforest_clf = RandomForestClassifier(n_estimators = 1000).fit(X_train, y_train)
y_score = Rforest_clf.predict(X_test)
fpr, tpr, _ = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
print('roc_auc:  ',roc_auc)


roc_auc:   0.5509464105313632
