### Import the libraries

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LinearRegression

### Load the data

In [4]:
# Load the raw inputs: the ticket records (train/test) plus two lookup tables,
# ticket_id -> address and address -> (lat, lon), which are merged in later cells.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed value types.
# NOTE(review): the original run printed a warning on load, presumably pandas'
# mixed-type DtypeWarning, which is what this option addresses — confirm.
train = pd.read_csv('train.csv', encoding="ISO-8859-1", low_memory=False)
test = pd.read_csv('test.csv', low_memory=False)
address = pd.read_csv('addresses.csv')
latlons = pd.read_csv('latlons.csv')

  interactivity=interactivity, compiler=compiler, result=result)


### Clean the data

#### Sneak peek into the data

In [5]:
# Preview the first five rows of the raw training tickets.
train.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,...,clean_up_cost,judgment_amount,payment_amount,balance_due,payment_date,payment_status,collection_status,grafitti_status,compliance_detail,compliance
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,,3.0,S. WICKER,CHICAGO,...,0.0,305.0,0.0,305.0,,NO PAYMENT APPLIED,,,non-compliant by no payment,0.0
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,,2959.0,Martin Luther King,Detroit,...,0.0,855.0,780.0,75.0,2005-06-02 00:00:00,PAID IN FULL,,,compliant by late payment within 1 month,1.0
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449.0,LONGFELLOW,,23658.0,P.O. BOX,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441.0,LONGFELLOW,,5.0,ST. CLAIR,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449.0,CHURCHILL,,7449.0,CHURCHILL,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,


In [6]:
# Preview the first five rows of the raw test tickets.
test.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,...,violation_description,disposition,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,grafitti_status
0,284932,Department of Public Works,"Granberry, Aisha B","FLUELLEN, JOHN A",10041.0,ROSEBERRY,,141,ROSEBERRY,DETROIT,...,Failure to secure City or Private solid waste ...,Responsible by Default,200.0,20.0,10.0,20.0,0.0,0.0,250.0,
1,285362,Department of Public Works,"Lusk, Gertrina","WHIGHAM, THELMA",18520.0,EVERGREEN,,19136,GLASTONBURY,DETROIT,...,Allowing bulk solid waste to lie or accumulate...,Responsible by Default,1000.0,20.0,10.0,100.0,0.0,0.0,1130.0,
2,285361,Department of Public Works,"Lusk, Gertrina","WHIGHAM, THELMA",18520.0,EVERGREEN,,19136,GLASTONBURY,DETROIT,...,Improper placement of Courville container betw...,Responsible by Default,100.0,20.0,10.0,10.0,0.0,0.0,140.0,
3,285338,Department of Public Works,"Talbert, Reginald","HARABEDIEN, POPKIN",1835.0,CENTRAL,,2246,NELSON,WOODHAVEN,...,Allowing bulk solid waste to lie or accumulate...,Responsible by Default,200.0,20.0,10.0,20.0,0.0,0.0,250.0,
4,285346,Department of Public Works,"Talbert, Reginald","CORBELL, STANLEY",1700.0,CENTRAL,,3435,MUNGER,LIVONIA,...,Violation of time limit for approved container...,Responsible by Default,100.0,20.0,10.0,10.0,0.0,0.0,140.0,


In [7]:
# Preview the ticket_id -> address lookup table.
address.head()

Unnamed: 0,ticket_id,address
0,22056,"2900 tyler, Detroit MI"
1,27586,"4311 central, Detroit MI"
2,22062,"1449 longfellow, Detroit MI"
3,22084,"1441 longfellow, Detroit MI"
4,22093,"2449 churchill, Detroit MI"


In [8]:
# Preview the address -> (lat, lon) lookup table.
latlons.head()

Unnamed: 0,address,lat,lon
0,"4300 rosa parks blvd, Detroit MI 48208",42.346169,-83.079962
1,"14512 sussex, Detroit MI",42.394657,-83.194265
2,"3456 garland, Detroit MI",42.373779,-82.986228
3,"5787 wayburn, Detroit MI",42.403342,-82.957805
4,"5766 haverhill, Detroit MI",42.407255,-82.946295


In [9]:
# List the distinct values in the country column before filtering on it.
set(train['country'])

{'Aust', 'Cana', 'Egyp', 'Germ', 'USA'}

In [10]:
# Keep only the training tickets whose country is 'USA'.
train = train.loc[train['country'] == 'USA']

In [11]:
# Re-inspect train after restricting it to USA rows.
train.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,...,clean_up_cost,judgment_amount,payment_amount,balance_due,payment_date,payment_status,collection_status,grafitti_status,compliance_detail,compliance
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,,3.0,S. WICKER,CHICAGO,...,0.0,305.0,0.0,305.0,,NO PAYMENT APPLIED,,,non-compliant by no payment,0.0
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,,2959.0,Martin Luther King,Detroit,...,0.0,855.0,780.0,75.0,2005-06-02 00:00:00,PAID IN FULL,,,compliant by late payment within 1 month,1.0
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449.0,LONGFELLOW,,23658.0,P.O. BOX,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441.0,LONGFELLOW,,5.0,ST. CLAIR,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449.0,CHURCHILL,,7449.0,CHURCHILL,DETROIT,...,0.0,0.0,0.0,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,


In [12]:
# Apply the same country filter to the test tickets.
test = test.loc[test['country'] == "USA"]

In [13]:
# Re-inspect test after restricting it to USA rows.
test.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,...,violation_description,disposition,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,grafitti_status
0,284932,Department of Public Works,"Granberry, Aisha B","FLUELLEN, JOHN A",10041.0,ROSEBERRY,,141,ROSEBERRY,DETROIT,...,Failure to secure City or Private solid waste ...,Responsible by Default,200.0,20.0,10.0,20.0,0.0,0.0,250.0,
1,285362,Department of Public Works,"Lusk, Gertrina","WHIGHAM, THELMA",18520.0,EVERGREEN,,19136,GLASTONBURY,DETROIT,...,Allowing bulk solid waste to lie or accumulate...,Responsible by Default,1000.0,20.0,10.0,100.0,0.0,0.0,1130.0,
2,285361,Department of Public Works,"Lusk, Gertrina","WHIGHAM, THELMA",18520.0,EVERGREEN,,19136,GLASTONBURY,DETROIT,...,Improper placement of Courville container betw...,Responsible by Default,100.0,20.0,10.0,10.0,0.0,0.0,140.0,
3,285338,Department of Public Works,"Talbert, Reginald","HARABEDIEN, POPKIN",1835.0,CENTRAL,,2246,NELSON,WOODHAVEN,...,Allowing bulk solid waste to lie or accumulate...,Responsible by Default,200.0,20.0,10.0,20.0,0.0,0.0,250.0,
4,285346,Department of Public Works,"Talbert, Reginald","CORBELL, STANLEY",1700.0,CENTRAL,,3435,MUNGER,LIVONIA,...,Violation of time limit for approved container...,Responsible by Default,100.0,20.0,10.0,10.0,0.0,0.0,140.0,


#### Merging data

In [14]:
# Attach coordinates to every ticket address. This is an inner join on the
# address string, so addresses without a match in latlons are dropped.
add_with_latlon = address.merge(latlons, how='inner', on='address')


In [15]:
# Preview the joined ticket_id / address / lat / lon table.
add_with_latlon.head()

Unnamed: 0,ticket_id,address,lat,lon
0,22056,"2900 tyler, Detroit MI",42.390729,-83.124268
1,77242,"2900 tyler, Detroit MI",42.390729,-83.124268
2,77243,"2900 tyler, Detroit MI",42.390729,-83.124268
3,103945,"2900 tyler, Detroit MI",42.390729,-83.124268
4,138219,"2900 tyler, Detroit MI",42.390729,-83.124268


In [16]:
# Add address, lat and lon to each training ticket (inner join on ticket_id).
train = train.merge(add_with_latlon, how='inner', on='ticket_id')

In [17]:
# Add address, lat and lon to each test ticket (inner join on ticket_id).
test = test.merge(add_with_latlon, how='inner', on='ticket_id')

In [18]:
# Row/column count of train after the merge.
train.shape

(250293, 37)

In [19]:
# Row/column count of test after the merge.
test.shape

(61001, 30)

In [20]:
# Confirm that address, lat and lon now appear as the last columns of train.
train.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,violation_zip_code,mailing_address_str_number,mailing_address_str_name,city,...,balance_due,payment_date,payment_status,collection_status,grafitti_status,compliance_detail,compliance,address,lat,lon
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,,3.0,S. WICKER,CHICAGO,...,305.0,,NO PAYMENT APPLIED,,,non-compliant by no payment,0.0,"2900 tyler, Detroit MI",42.390729,-83.124268
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,,2959.0,Martin Luther King,Detroit,...,75.0,2005-06-02 00:00:00,PAID IN FULL,,,compliant by late payment within 1 month,1.0,"4311 central, Detroit MI",42.326937,-83.135118
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449.0,LONGFELLOW,,23658.0,P.O. BOX,DETROIT,...,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,,"1449 longfellow, Detroit MI",42.380516,-83.096069
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441.0,LONGFELLOW,,5.0,ST. CLAIR,DETROIT,...,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,,"1441 longfellow, Detroit MI",42.38057,-83.095919
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449.0,CHURCHILL,,7449.0,CHURCHILL,DETROIT,...,0.0,,NO PAYMENT APPLIED,,,not responsible by disposition,,"2449 churchill, Detroit MI",42.145257,-83.208233


In [21]:
# Percentage of missing values in each column of train.
train.isna().mean()*100

ticket_id                       0.000000
agency_name                     0.000000
inspector_name                  0.000000
violator_name                   0.013584
violation_street_number         0.000000
violation_street_name           0.000000
violation_zip_code            100.000000
mailing_address_str_number      1.438314
mailing_address_str_name        0.001598
city                            0.000000
state                           0.035159
zip_code                        0.000400
non_us_str_code               100.000000
country                         0.000000
ticket_issued_date              0.000000
hearing_date                    4.990551
violation_code                  0.000000
violation_description           0.000000
disposition                     0.000000
fine_amount                     0.000400
admin_fee                       0.000000
state_fee                       0.000000
late_fee                        0.000000
discount_amount                 0.000000
clean_up_cost   

##### Drop all columns with a considerable share of NA values (>50%)

In [22]:
# Remove the columns flagged as mostly missing (for example, violation_zip_code
# and non_us_str_code are 100% NA in the summary above).
train = train.drop(
    columns=[
        'violation_zip_code',
        'non_us_str_code',
        'payment_date',
        'collection_status',
        'grafitti_status',
    ]
)

In [23]:
# Re-inspect train after dropping the mostly-missing columns.
train.head()

Unnamed: 0,ticket_id,agency_name,inspector_name,violator_name,violation_street_number,violation_street_name,mailing_address_str_number,mailing_address_str_name,city,state,...,clean_up_cost,judgment_amount,payment_amount,balance_due,payment_status,compliance_detail,compliance,address,lat,lon
0,22056,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","INVESTMENT INC., MIDWEST MORTGAGE",2900.0,TYLER,3.0,S. WICKER,CHICAGO,IL,...,0.0,305.0,0.0,305.0,NO PAYMENT APPLIED,non-compliant by no payment,0.0,"2900 tyler, Detroit MI",42.390729,-83.124268
1,27586,"Buildings, Safety Engineering & Env Department","Williams, Darrin","Michigan, Covenant House",4311.0,CENTRAL,2959.0,Martin Luther King,Detroit,MI,...,0.0,855.0,780.0,75.0,PAID IN FULL,compliant by late payment within 1 month,1.0,"4311 central, Detroit MI",42.326937,-83.135118
2,22062,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","SANDERS, DERRON",1449.0,LONGFELLOW,23658.0,P.O. BOX,DETROIT,MI,...,0.0,0.0,0.0,0.0,NO PAYMENT APPLIED,not responsible by disposition,,"1449 longfellow, Detroit MI",42.380516,-83.096069
3,22084,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","MOROSI, MIKE",1441.0,LONGFELLOW,5.0,ST. CLAIR,DETROIT,MI,...,0.0,0.0,0.0,0.0,NO PAYMENT APPLIED,not responsible by disposition,,"1441 longfellow, Detroit MI",42.38057,-83.095919
4,22093,"Buildings, Safety Engineering & Env Department","Sims, Martinzie","NATHANIEL, NEAL",2449.0,CHURCHILL,7449.0,CHURCHILL,DETROIT,MI,...,0.0,0.0,0.0,0.0,NO PAYMENT APPLIED,not responsible by disposition,,"2449 churchill, Detroit MI",42.145257,-83.208233


##### Remove all columns from train and test which are irrelevant

In [24]:
# List the columns still present in train.
train.columns

Index(['ticket_id', 'agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'mailing_address_str_number', 'mailing_address_str_name', 'city',
       'state', 'zip_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'payment_amount', 'balance_due',
       'payment_status', 'compliance_detail', 'compliance', 'address', 'lat',
       'lon'],
      dtype='object')

In [25]:
# List the columns present in test, for comparison with train.
test.columns

Index(['ticket_id', 'agency_name', 'inspector_name', 'violator_name',
       'violation_street_number', 'violation_street_name',
       'violation_zip_code', 'mailing_address_str_number',
       'mailing_address_str_name', 'city', 'state', 'zip_code',
       'non_us_str_code', 'country', 'ticket_issued_date', 'hearing_date',
       'violation_code', 'violation_description', 'disposition', 'fine_amount',
       'admin_fee', 'state_fee', 'late_fee', 'discount_amount',
       'clean_up_cost', 'judgment_amount', 'grafitti_status', 'address', 'lat',
       'lon'],
      dtype='object')

In [26]:
# Keep only ticket_id, the violation/disposition codes, the fee amounts, the
# coordinates and the label. The payment/compliance-detail fields removed here
# do not exist in test (see test.columns above).
# NOTE(review): only train is modified in this cell; test keeps these columns.
train = train.drop(
    columns=[
        # who issued / received the ticket, and where
        'agency_name', 'inspector_name', 'violator_name',
        'violation_street_number', 'violation_street_name',
        'mailing_address_str_number', 'mailing_address_str_name',
        'city', 'state', 'zip_code', 'country',
        # payment outcome fields
        'payment_amount', 'balance_due', 'payment_status', 'compliance_detail',
        # dates and free text
        'hearing_date', 'ticket_issued_date', 'violation_description', 'address',
    ]
)

In [27]:
# Check whether any missing value remains in train.
train.isnull().values.any()

True

In [28]:
# Total number of missing cells across all columns of train.
train.isnull().sum().sum()

90431

In [29]:
# Drop every row that has at least one missing value. This also removes the
# tickets whose compliance label is NaN.
train = train.dropna(axis=0, how='any')

In [30]:
# Inspect the reduced, NA-free training frame.
train.head()

Unnamed: 0,ticket_id,violation_code,disposition,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance,lat,lon
0,22056,9-1-36(a),Responsible by Default,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,42.390729,-83.124268
1,27586,61-63.0600,Responsible by Determination,750.0,20.0,10.0,75.0,0.0,0.0,855.0,1.0,42.326937,-83.135118
5,22046,9-1-36(a),Responsible by Default,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,42.145257,-83.208233
6,18738,61-63.0500,Responsible by Default,750.0,20.0,10.0,75.0,0.0,0.0,855.0,0.0,42.433466,-83.023493
7,18735,61-63.0100,Responsible by Default,100.0,20.0,10.0,10.0,0.0,0.0,140.0,0.0,42.388641,-83.037858


In [31]:
    # One-hot encode the two categorical columns in both frames, dropping the
    # first level of each.
    # NOTE(review): train and test are encoded independently, so the resulting
    # dummy columns may differ between them — confirm they are aligned before
    # predicting on test.
    string_features = ['disposition', 'violation_code']
    train, test = (
        pd.get_dummies(frame, columns=string_features, drop_first=True)
        for frame in (train, test)
    )

In [32]:
# Keep only rows whose label is exactly 0 or 1.
train = train[train['compliance'].isin([0, 1])]

In [33]:
# Verify that the label column now holds only the two class values.
set(train['compliance'])

{0.0, 1.0}

In [34]:
# Inspect the encoded training frame (numeric columns plus dummy columns).
train.head()

Unnamed: 0,ticket_id,fine_amount,admin_fee,state_fee,late_fee,discount_amount,clean_up_cost,judgment_amount,compliance,lat,...,violation_code_9-1-81(a),violation_code_9-1-81(b),violation_code_9-1-81(e),violation_code_9-1-82(b),violation_code_9-1-82(d) - (Buildin,violation_code_9-1-82(d) - (Dwellin,violation_code_9-1-82(d) - (Structu,violation_code_9-1-83 - (Building 5,violation_code_9-1-83 - (Dwelling),violation_code_9-1-83 - (Structures
0,22056,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,42.390729,...,0,0,0,0,0,0,0,0,0,0
1,27586,750.0,20.0,10.0,75.0,0.0,0.0,855.0,1.0,42.326937,...,0,0,0,0,0,0,0,0,0,0
5,22046,250.0,20.0,10.0,25.0,0.0,0.0,305.0,0.0,42.145257,...,0,0,0,0,0,0,0,0,0,0
6,18738,750.0,20.0,10.0,75.0,0.0,0.0,855.0,0.0,42.433466,...,0,0,0,0,0,0,0,0,0,0
7,18735,100.0,20.0,10.0,10.0,0.0,0.0,140.0,0.0,42.388641,...,0,0,0,0,0,0,0,0,0,0


In [35]:
# Feature matrix: every column of train except the label.
# NOTE(review): this still includes ticket_id — confirm that an identifier is
# intended to be used as a feature.
X = train.drop(columns=['compliance'])

In [36]:
# Target vector: the 0/1 compliance label.
y = train.loc[:, 'compliance']

In [37]:
# Hold out part of the labelled data for evaluation (default split proportions).
# random_state is pinned so that the split, and therefore the AUC scores
# computed from it, are the same on every rerun; the random forest in this
# notebook is already seeded with 0.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

In [38]:
# Baseline model: ordinary least squares fitted on the 0/1 label. Its raw
# predictions are kept as scores for the ROC AUC computation.
linear = LinearRegression()
linear.fit(X_train, y_train)

prob = linear.predict(X_test)

In [39]:
# Show the linear model's predicted scores for the held-out rows.
prob

array([0.0340746 , 0.03495869, 0.03723484, ..., 0.03427438, 0.01857447,
       0.1881931 ])

In [40]:
# ROC AUC of the linear model's scores on the held-out split.
roc_auc_score(y_test,prob)

0.7986139318597673

In [41]:
# Second model: a depth-limited random forest regressor with a fixed seed,
# fitted on the same split; predictions are again kept as scores.
reg = RandomForestRegressor(random_state=0, max_depth=6)
reg.fit(X_train, y_train)

prob2 = reg.predict(X_test)

In [42]:
# Show the random forest's predicted scores for the held-out rows.
prob2

array([0.05373968, 0.029088  , 0.03165829, ..., 0.01533792, 0.01511638,
       0.91914439])

In [43]:
# ROC AUC of the random forest's scores on the held-out split.
roc_auc_score(y_test, prob2)

0.8199895788179881

In [44]:
# Number of held-out rows that were scored.
prob2.shape 

(39967,)