In [None]:
import pandas as pd
import numpy as np

In [None]:
def blight_model():
    """Train a compliance model on blight tickets and score the unlabeled tickets.

    Reads ``train.csv``, ``test.csv``, ``addresses.csv`` and ``latlons.csv``
    from the current working directory, joins latitude/longitude onto every
    ticket, integer-encodes the categorical columns, grid-searches a random
    forest on ROC AUC, prints the search results and returns the predicted
    probability of ``compliance == 1`` for each ticket in ``test.csv``.

    Returns
    -------
    pandas.DataFrame
        A single column named ``'probs. of fine payment'`` indexed by
        ``ticket_id``, one row per ticket in ``test.csv``.
    """
    import pandas as pd
    from sklearn.preprocessing import LabelEncoder
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.model_selection import train_test_split, GridSearchCV
    from sklearn.metrics import roc_auc_score

    target = 'compliance'
    categorical_columns = ['disposition', 'violation_code']
    geo_columns = ['lat', 'lon']
    dropped_columns = ['agency_name', 'inspector_name', 'violator_name', 'violation_street_number',
                       'violation_street_name', 'violation_zip_code', 'violation_description',
                       'admin_fee', 'state_fee', 'grafitti_status', 'ticket_issued_date',
                       'hearing_date', 'country', 'address',
                       # mailing related columns
                       'mailing_address_str_number', 'mailing_address_str_name', 'city', 'state',
                       'zip_code', 'non_us_str_code',
                       # non-existent columns in test data
                       'payment_date', 'payment_status', 'collection_status', 'balance_due',
                       'payment_amount', 'compliance_detail']

    # Use the appropriate directory whenever it is necessary
    train_df = pd.read_csv('train.csv', encoding = "ISO-8859-1")
    test_df = pd.read_csv('test.csv')
    address_df = pd.read_csv('addresses.csv')
    latlons_df = pd.read_csv('latlons.csv')

    # ticket_id -> address -> (lat, lon); built once and shared by both frames.
    geo_df = pd.merge(address_df, latlons_df, on = 'address')

    # Keep only labelled (0 / 1) tickets with a US mailing country for training.
    labelled_us = train_df[target].isin([0, 1]) & (train_df['country'] == 'USA')
    train_df = pd.merge(train_df[labelled_us], geo_df, on = 'ticket_id')
    train_df = train_df.drop(dropped_columns, axis = 1)
    # The label is cast to int so the classifier's classes are exactly 0 and 1.
    train_df[target] = train_df[target].astype(int)

    # Left join: every ticket in test.csv must receive a prediction, even when
    # its address cannot be geocoded (lat/lon are imputed below in that case).
    test_df = pd.merge(test_df, geo_df, on = 'ticket_id', how = 'left')

    # Encode each categorical column with ONE encoder fitted on the union of
    # train and test values. Re-fitting on the test column alone would assign
    # different integers to the same category in the two frames.
    for column in categorical_columns:
        encoder = LabelEncoder()
        encoder.fit(pd.concat([train_df[column], test_df[column]]))
        train_df[column] = encoder.transform(train_df[column])
        test_df[column] = encoder.transform(test_df[column])

    # Impute missing coordinates with the training mean in both frames so the
    # test features are prepared using training statistics only.
    for column in geo_columns:
        train_mean = train_df[column].mean()
        train_df[column] = train_df[column].fillna(train_mean)
        test_df[column] = test_df[column].fillna(train_mean)

    # NOTE(review): every non-target column is used as a feature, which
    # includes ticket_id -- confirm that feeding the identifier to the model
    # is intended.
    feature_columns = [column for column in train_df.columns if column != target]
    test_df = test_df[feature_columns]

    X_train, X_holdout, y_train, y_holdout = train_test_split(
        train_df[feature_columns], train_df[target], random_state = 0)

    # A seeded classifier keeps runs reproducible and exposes predict_proba,
    # which is what a probability output and the 'roc_auc' scorer call for.
    forest = RandomForestClassifier(random_state = 0)
    grid_params = {'n_estimators': [10, 100], 'max_depth': [None, 30]}
    grid_cv = GridSearchCV(forest, param_grid = grid_params, scoring = 'roc_auc')
    grid_cv.fit(X_train, y_train)
    print('Grid best param: ', grid_cv.best_params_)
    print('Grid best score (max. AUC): ', grid_cv.best_score_)
    # Score the rows that were held out of the grid search entirely.
    holdout_probs = grid_cv.predict_proba(X_holdout)[:, 1]
    print('Held-out AUC: ', roc_auc_score(y_holdout, holdout_probs))

    # Column 1 of predict_proba is the probability of class 1 (compliant).
    prediction_df = pd.DataFrame({'probs. of fine payment': grid_cv.predict_proba(test_df)[:, 1]},
                                 index = test_df['ticket_id'])

    return prediction_df

In [None]:
# Run the full pipeline; as the last expression in the cell, the returned
# prediction DataFrame is rendered as the cell's output.
blight_model()