The first part is done by Haoquan Fang.

# Import Libraries

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
import json
from math import radians, cos, sin, asin, sqrt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold as KF, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

# Data Preprocessing

In [2]:
def get_locations(lots_dir='/kaggle/input/inrix-hack-dataset/lots'):
    """Collect one entrance point per parking lot from the JSON files in a directory.

    Parameters
    ----------
    lots_dir : str, optional
        Directory holding the parking-lot responses. Defaults to the Kaggle
        input path the notebook has always used, so existing calls with no
        argument behave as before.

    Returns
    -------
    pandas.DataFrame
        Columns ``long`` and ``lat`` (``pepPt[0]`` and ``pepPt[1]``), one row
        per lot for which an entrance was selected.

    Notes
    -----
    * Only file names ending in ``json`` are parsed.
    * A lot with exactly one entry in ``peps`` uses that entry; otherwise the
      first entry flagged ``pepPrimary`` is used. Lots with several entries
      and no primary one (or no entries at all) contribute no row.
    * File names are processed in sorted order so the row order does not
      depend on the file system's listing order.
    """
    locations = []

    for file_name in sorted(os.listdir(lots_dir)):
        if not file_name.endswith('json'):
            continue

        with open(os.path.join(lots_dir, file_name), 'r') as file:
            # Load JSON data into a Python variable
            data = json.load(file)

        for lot in data['result']:
            peps = lot['peps']
            if len(peps) == 1:
                locations.append(peps[0]['pepPt'])
            else:
                primary = next((pep for pep in peps if pep['pepPrimary']), None)
                if primary is not None:
                    locations.append(primary['pepPt'])

    return pd.DataFrame(locations, columns=['long', 'lat'])

## Get Parking Lots Features

In [3]:
def get_one_lots(data):
    """Turn one parsed parking-lots response into a feature table.

    Parameters
    ----------
    data : dict
        Parsed JSON whose ``'result'`` entry is a list of lots. Each lot is
        read for ``id``, ``occupancy`` (``pct``, ``probability``,
        ``available``), ``distance``, ``costIndex``, ``reviews`` and ``peps``.

    Returns
    -------
    pandas.DataFrame
        One row per lot, indexed by the lot's position in ``data['result']``,
        with columns ``id, pct, probability, available, distance, price,
        stars, long, lat``.

    Notes
    -----
    * ``price`` is ``costIndex``, or ``-1`` when it is ``None``.
    * ``stars`` is the mean of the review stars, or ``-1`` without reviews.
    * ``long``/``lat`` come from the single entrance, else from the first
      entrance flagged ``pepPrimary``. If no entrance is flagged primary the
      first listed entrance is used, and a lot without any entrance gets NaN
      coordinates. Previously both cases produced a short row and the row
      assignment raised.
    * The occupancy ``rank`` and ``bucket`` fields are intentionally not
      extracted.
    """
    columns = ['id', 'pct', 'probability', 'available', 'distance',
               'price', 'stars', 'long', 'lat']
    df = pd.DataFrame({column: [] for column in columns})

    for i, lot in enumerate(data['result']):
        occupancy = lot['occupancy']
        row = [
            lot['id'],
            occupancy['pct'],
            occupancy['probability'],
            occupancy['available'],
            lot['distance'],
        ]

        # Missing price is encoded as -1.
        row.append(-1 if lot['costIndex'] is None else lot['costIndex'])

        # Missing reviews are encoded as -1; otherwise use the average rating.
        reviews = lot['reviews']
        if len(reviews) == 0:
            row.append(-1)
        else:
            row.append(sum(review['stars'] for review in reviews) / len(reviews))

        peps = lot['peps']
        if len(peps) == 1:
            point = peps[0]['pepPt']
        else:
            point = next((pep['pepPt'] for pep in peps if pep['pepPrimary']), None)
            if point is None:
                # No primary entrance: keep the row complete instead of failing.
                point = peps[0]['pepPt'] if peps else [float('nan'), float('nan')]
        row.append(point[0])
        row.append(point[1])

        df.loc[i] = row

    return df

In [4]:
def get_all_lots(lots_dir='/kaggle/input/inrix-hack-dataset/lots'):
    """Build the de-duplicated parking-lot feature table from a directory of responses.

    Parameters
    ----------
    lots_dir : str, optional
        Directory whose files are each parsed as JSON and passed to
        ``get_one_lots``. Defaults to the Kaggle input path used so far.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, pct, probability, available, distance, price, stars,
        long, lat``; fully identical rows are removed and the index is
        renumbered from 0.

    Notes
    -----
    * Every file in the directory is read, whatever its name.
    * Files are read in sorted name order so the row order is reproducible.
    * The per-file tables are concatenated once instead of growing the frame
      inside the loop, which re-copied all earlier rows on every file.
    """
    # Empty template first, so the result has the expected columns even when
    # the directory contains no files.
    template = pd.DataFrame({
        'id': [],
        'pct': [],
        'probability': [],
        'available': [],
        'distance': [],
        'price': [],
        'stars': [],
        'long': [],
        'lat': [],
    })
    frames = [template]

    for file_name in sorted(os.listdir(lots_dir)):
        with open(os.path.join(lots_dir, file_name), 'r') as file:
            # Load JSON data into a Python variable
            data = json.load(file)
        frames.append(get_one_lots(data))

    df = pd.concat(frames, ignore_index=True)
    return df.drop_duplicates().reset_index(drop=True)

In [5]:
# Build the parking-lot feature table from every file in the lots directory
# and display it.
lots = get_all_lots()
lots

Unnamed: 0,id,pct,probability,available,distance,price,stars,long,lat
0,22110.0,41.0,83.0,61.0,843.0,3.0,-1.0,-122.419643,37.771486
1,27080.0,53.0,71.0,15.0,848.0,1.0,-1.0,-122.422446,37.772059
2,113004.0,40.0,84.0,34.0,905.0,3.0,-1.0,-122.420145,37.771240
3,28924.0,36.0,87.0,569.0,946.0,3.0,1.0,-122.415603,37.771280
4,401837.0,42.0,82.0,14.0,916.0,0.0,-1.0,-122.419293,37.787776
...,...,...,...,...,...,...,...,...,...
3512,27441.0,35.0,87.0,7.0,936.0,4.0,-1.0,-122.410793,37.792225
3513,84299.0,50.0,75.0,30.0,879.0,-1.0,-1.0,-122.411830,37.792512
3514,28169.0,39.0,84.0,6.0,875.0,-1.0,-1.0,-122.382103,37.738597
3515,27348.0,45.0,79.0,22.0,399.0,1.0,-1.0,-122.385939,37.744220


In [6]:
# Save the parking-lot feature table to parking.csv (index column omitted).
lots.to_csv('parking.csv', index=False)

## Get Safety Alerts Features

In [7]:
def get_one_incidents(data):
    """Flatten one parsed incidents response into a table.

    ``data['result']['incidents']`` is walked in order and each incident
    becomes one row, indexed by its position, with the columns ``id``,
    ``type``, ``severity``, ``long`` and ``lat``. The coordinates are the
    first two entries of the incident's ``geometry.coordinates``.
    """
    fields = ('id', 'type', 'severity', 'long', 'lat')
    table = pd.DataFrame({field: [] for field in fields})

    for position, incident in enumerate(data['result']['incidents']):
        record = [incident[key] for key in ('id', 'type', 'severity')]
        point = incident['geometry']['coordinates']
        record.extend((point[0], point[1]))
        table.loc[position] = record

    return table

In [8]:
def get_all_incidents(incidents_dir='/kaggle/input/inrix-hack-dataset/incidents'):
    """Build the de-duplicated incident table from a directory of responses.

    Parameters
    ----------
    incidents_dir : str, optional
        Directory whose files are each parsed as JSON and passed to
        ``get_one_incidents``. Defaults to the Kaggle input path used so far.

    Returns
    -------
    pandas.DataFrame
        Columns ``id, type, severity, long, lat``; fully identical rows are
        removed and the index is renumbered from 0.

    Notes
    -----
    * Every file in the directory is read, whatever its name.
    * Files are read in sorted name order so the row order is reproducible.
    * The per-file tables are concatenated once instead of growing the frame
      inside the loop, which re-copied all earlier rows on every file.
    """
    # Empty template first, so the result has the expected columns even when
    # the directory contains no files.
    template = pd.DataFrame({
        'id': [],
        'type': [],
        'severity': [],
        'long': [],
        'lat': [],
    })
    frames = [template]

    for file_name in sorted(os.listdir(incidents_dir)):
        with open(os.path.join(incidents_dir, file_name), 'r') as file:
            # Load JSON data into a Python variable
            data = json.load(file)
        frames.append(get_one_incidents(data))

    df = pd.concat(frames, ignore_index=True)
    return df.drop_duplicates().reset_index(drop=True)

In [9]:
# Build the incident table from every file in the incidents directory and
# display it.
incidents = get_all_incidents()
incidents

Unnamed: 0,id,type,severity,long,lat
0,151490618.0,1,0,-120.872314,35.793649
1,153229881.0,1,1,-117.301650,37.045113
2,155128866.0,1,2,-122.456690,38.553080
3,159703362.0,1,2,-121.104400,38.938570
4,162192277.0,1,1,-119.564920,36.137970
...,...,...,...,...,...
892,183131859.0,3,2,-122.250760,37.813941
893,183131901.0,3,2,-121.901932,37.371972
894,183132128.0,3,1,-122.228586,37.773332
895,183132129.0,3,2,-121.979902,37.517277


## Combination

In [10]:
def earth_distance(lat1, lat2, lon1, lon2):
    """Great-circle distance in kilometres between two points (haversine formula).

    Note the argument order: both latitudes first, then both longitudes.

    Parameters
    ----------
    lat1, lat2 : float
        Latitudes of the first and second point, in degrees.
    lon1, lon2 : float
        Longitudes of the first and second point, in degrees.

    Returns
    -------
    float
        Distance along a sphere of radius 6371 km.
    """
    # The math module contains a function named
    # radians which converts from degrees to radians.
    lon1, lon2, lat1, lat2 = map(radians, (lon1, lon2, lat1, lat2))

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat / 2)**2 + cos(lat1) * cos(lat2) * sin(dlon / 2)**2

    # Rounding can push `a` a hair above 1 for (near-)antipodal points, which
    # would make asin raise a math domain error; clamp it to the valid range.
    c = 2 * asin(sqrt(min(1.0, a)))

    # Radius of earth in kilometers. Use 3956 for miles
    r = 6371

    # calculate the result
    return c * r

In [11]:
def add_safety(lots, incidents, radius_km=0.5):
    """Add per-category incident severity totals around every parking lot.

    For each lot, the severities of all incidents lying within ``radius_km``
    (haversine distance on a sphere of radius 6371 km, the same formula as
    ``earth_distance``) are summed per incident type and stored in four new
    integer columns.

    Parameters
    ----------
    lots : pandas.DataFrame
        Must contain ``lat`` and ``long`` columns in degrees. Not modified.
    incidents : pandas.DataFrame
        Must contain ``lat``, ``long``, ``type`` and ``severity`` columns.
    radius_km : float, optional
        Search radius in kilometres; defaults to the 0.5 km used so far.

    Returns
    -------
    pandas.DataFrame
        Copy of ``lots`` with the extra columns ``construction``, ``events``,
        ``congestion`` and ``hazards`` (incident types 1-4 respectively).

    Notes
    -----
    * Distances are computed for all lot/incident pairs at once with NumPy
      instead of a nested Python loop, so an intermediate array of shape
      ``(len(lots), len(incidents))`` is held in memory.
    * Rows are matched by position, so ``lots`` need not have a 0..n-1 index.
    * Incidents whose type is not 1-4 are ignored.
    """
    type_dict = {1: 'construction', 2: 'events', 3: 'congestion', 4: 'hazards'}
    df = lots.copy()

    # Column vectors for lots, row vectors for incidents -> pairwise broadcast.
    lot_lat = np.radians(lots['lat'].to_numpy(dtype=float))[:, None]
    lot_lon = np.radians(lots['long'].to_numpy(dtype=float))[:, None]
    inc_lat = np.radians(incidents['lat'].to_numpy(dtype=float))[None, :]
    inc_lon = np.radians(incidents['long'].to_numpy(dtype=float))[None, :]

    # Haversine formula
    a = (np.sin((inc_lat - lot_lat) / 2) ** 2
         + np.cos(lot_lat) * np.cos(inc_lat) * np.sin((inc_lon - lot_lon) / 2) ** 2)
    # Clip guards arcsin against rounding slightly outside [0, 1].
    distance_km = 2 * 6371 * np.arcsin(np.sqrt(np.clip(a, 0.0, 1.0)))
    within = (distance_km <= radius_km).astype(int)

    incident_type = incidents['type'].to_numpy().astype(int)
    severity = incidents['severity'].to_numpy().astype(int)

    for code, column in type_dict.items():
        selected = incident_type == code
        # (lots x selected incidents) @ (selected severities) -> total per lot
        df[column] = within[:, selected] @ severity[selected]

    return df

In [12]:
# Attach the per-category incident severity totals to every parking lot and
# display the combined table.
safety = add_safety(lots, incidents)
safety

Unnamed: 0,id,pct,probability,available,distance,price,stars,long,lat,construction,events,congestion,hazards
0,22110.0,41.0,83.0,61.0,843.0,3.0,-1.0,-122.419643,37.771486,2,0,5,0
1,27080.0,53.0,71.0,15.0,848.0,1.0,-1.0,-122.422446,37.772059,0,0,5,0
2,113004.0,40.0,84.0,34.0,905.0,3.0,-1.0,-122.420145,37.771240,2,0,5,0
3,28924.0,36.0,87.0,569.0,946.0,3.0,1.0,-122.415603,37.771280,6,0,0,0
4,401837.0,42.0,82.0,14.0,916.0,0.0,-1.0,-122.419293,37.787776,10,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
3512,27441.0,35.0,87.0,7.0,936.0,4.0,-1.0,-122.410793,37.792225,4,0,0,0
3513,84299.0,50.0,75.0,30.0,879.0,-1.0,-1.0,-122.411830,37.792512,0,0,0,0
3514,28169.0,39.0,84.0,6.0,875.0,-1.0,-1.0,-122.382103,37.738597,0,0,0,0
3515,27348.0,45.0,79.0,22.0,399.0,1.0,-1.0,-122.385939,37.744220,0,0,0,0


In [13]:
# Keep only the model features: the identifier and the coordinates are
# dropped. `errors='ignore'` makes the cell safe to re-run, since `safety` is
# reassigned and the columns are already gone the second time.
safety = safety.drop(columns=['id','long','lat'], errors='ignore')
safety

Unnamed: 0,pct,probability,available,distance,price,stars,construction,events,congestion,hazards
0,41.0,83.0,61.0,843.0,3.0,-1.0,2,0,5,0
1,53.0,71.0,15.0,848.0,1.0,-1.0,0,0,5,0
2,40.0,84.0,34.0,905.0,3.0,-1.0,2,0,5,0
3,36.0,87.0,569.0,946.0,3.0,1.0,6,0,0,0
4,42.0,82.0,14.0,916.0,0.0,-1.0,10,0,0,0
...,...,...,...,...,...,...,...,...,...,...
3512,35.0,87.0,7.0,936.0,4.0,-1.0,4,0,0,0
3513,50.0,75.0,30.0,879.0,-1.0,-1.0,0,0,0,0
3514,39.0,84.0,6.0,875.0,-1.0,-1.0,0,0,0,0
3515,45.0,79.0,22.0,399.0,1.0,-1.0,0,0,0,0


In [14]:
# Save the feature table (lot features plus incident totals) to
# parking_safe.csv, without the index column.
safety.to_csv('parking_safe.csv', index=False)

## Read GPT-Labeled Dataset

In [15]:
# Load the labelled table from the input dataset. As displayed below, it has
# the same feature columns as the table saved above plus a `rating` column.
parking = pd.read_csv('/kaggle/input/inrix-hack-dataset/parking_safe_labeled.csv')
parking

Unnamed: 0,pct,probability,available,distance,price,stars,construction,events,congestion,hazards,rating
0,41.0,83.0,61.0,843.0,3.000000,-0.011444,2,0,5,0,1
1,53.0,71.0,15.0,848.0,1.000000,-0.011444,0,0,5,0,1
2,40.0,84.0,34.0,905.0,3.000000,-0.011444,2,0,5,0,1
3,36.0,87.0,569.0,946.0,3.000000,1.000000,6,0,0,0,3
4,42.0,82.0,14.0,916.0,0.000000,-0.011444,10,0,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...
3512,35.0,87.0,7.0,936.0,4.000000,-0.011444,4,0,0,0,1
3513,50.0,75.0,30.0,879.0,1.779073,-0.011444,0,0,0,0,2
3514,39.0,84.0,6.0,875.0,1.779073,-0.011444,0,0,0,0,2
3515,45.0,79.0,22.0,399.0,1.000000,-0.011444,0,0,0,0,5


In [16]:
# Summary statistics of the rating label: range, mean and standard deviation
# (the last two rounded to two decimals).
print('\n'.join([
    f"the min rating is: {parking['rating'].min()!s}",
    f"the max rating is: {parking['rating'].max()!s}",
    f"the mean of rating is: {round(parking['rating'].mean(), 2)!s}",
    f"the sd of rating is: {round(parking['rating'].std(), 2)!s}",
]))

the min rating is: 1
the max rating is: 5
the mean of rating is: 3.0
the sd of rating is: 1.41


## Scale Features

In [17]:
def scale_data(data):
    """Standardise the ten feature columns and carry the rating label over.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the feature columns ``pct, probability, available,
        distance, price, stars, construction, events, congestion, hazards``
        and the label column ``rating``. Not modified.

    Returns
    -------
    data_scaled : pandas.DataFrame
        Scaled features (same column order) plus the untouched ``rating``
        column, indexed 0..n-1.
    scale : numpy.ndarray
        The fitted scaler's ``scale_`` attribute, one value per feature.
    mean : numpy.ndarray
        The fitted scaler's ``mean_`` attribute, one value per feature.
    """
    feature_columns = ['pct', 'probability', 'available', 'distance', 'price',
                       'stars', 'construction', 'events', 'congestion', 'hazards']

    std_slc = StandardScaler()
    preprocess = std_slc.fit_transform(data[feature_columns])
    data_scaled = pd.DataFrame(preprocess, columns=feature_columns)

    # Copy the label by position. Assigning the Series itself would align on
    # index labels and yield NaN/misplaced ratings whenever `data` does not
    # carry a default 0..n-1 index (e.g. after filtering rows).
    data_scaled['rating'] = data['rating'].to_numpy()

    return data_scaled, std_slc.scale_, std_slc.mean_

In [18]:
# Standardise the labelled table; keep the scaler's per-feature scale (`std`)
# and mean (`mean`) so that new inputs can be transformed the same way.
parking_scaled, std, mean = scale_data(parking)
parking_scaled

Unnamed: 0,pct,probability,available,distance,price,stars,construction,events,congestion,hazards,rating
0,0.404868,-0.271315,-0.151143,0.828375,0.864196,-0.506208,-0.618060,0.0,1.712472,0.0,1
1,2.477784,-2.877270,-0.464425,0.849317,-1.079892,-0.506208,-0.785450,0.0,1.712472,0.0,1
2,0.232125,-0.054152,-0.335026,1.088055,0.864196,-0.506208,-0.618060,0.0,1.712472,0.0,1
3,-0.458847,0.597337,3.308578,1.259779,0.864196,0.206451,-0.283282,0.0,-0.305069,0.0,3
4,0.577611,-0.488478,-0.471236,1.134127,-2.051936,-0.506208,0.051497,0.0,-0.305069,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...
3512,-0.631590,0.597337,-0.518909,1.217895,1.836240,-0.506208,-0.450671,0.0,-0.305069,0.0,1
3513,1.959555,-2.008618,-0.362268,0.979157,-0.322599,-0.506208,-0.785450,0.0,-0.305069,0.0,2
3514,0.059382,-0.054152,-0.525720,0.962404,-0.322599,-0.506208,-0.785450,0.0,-0.305069,0.0,2
3515,1.095840,-1.139967,-0.416752,-1.031266,-1.079892,-0.506208,-0.785450,0.0,-0.305069,0.0,5


In [19]:
# Save the scaled, labelled table to parking_safe_labeled_scaled.csv
# (index column omitted).
parking_scaled.to_csv('parking_safe_labeled_scaled.csv', index=False)

In [20]:
# Per-feature scale returned by scale_data (the scaler's `scale_`), in the
# order of the feature columns.
std

array([  5.78894742,   4.60483703, 146.83262808, 238.75573511,
         1.02875985,   1.41925462,  11.94819279,   1.        ,
         2.478264  ,   1.        ])

In [21]:
# Per-feature mean returned by scale_data (the scaler's `mean_`), in the
# order of the feature columns.
mean

array([ 38.65624111,  84.24936025,  83.19277794, 645.22064259,
         2.11094987,   0.70699405,   9.38470287,   0.        ,
         0.75604208,   0.        ])

In [22]:
# Manual check of the transformation: standardise a vector of ones by hand
# using the fitted mean and scale (the list is broadcast against the arrays).
temp = [1,1,1,1,1,1,1,1,1,1]
(temp - mean) / std

array([ -6.50485112, -18.07867678,  -0.55977189,  -2.69824154,
        -1.07989232,   0.20645059,  -0.7017549 ,   1.        ,
         0.09843904,   1.        ])

In [23]:
def process_input(data, mean=None, std=None):
    """Standardise one raw feature vector into a single-row DataFrame.

    Parameters
    ----------
    data : sequence of 10 numbers
        Raw feature values. The built-in constants match the scale and mean
        printed for the fitted scaler earlier in the notebook, so the values
        must follow the same feature order (``pct, probability, available,
        distance, price, stars, construction, events, congestion, hazards``).
    mean, std : array-like, optional
        Per-feature mean and scale to use instead of the built-in constants,
        e.g. the arrays returned by ``scale_data`` after re-fitting.

    Returns
    -------
    pandas.DataFrame
        One row holding ``(data - mean) / std``; for a plain list input the
        columns are labelled 0..9.

    Raises
    ------
    ValueError
        If ``mean`` and ``std`` differ in shape, or ``data`` does not have
        exactly one value per feature. Without this check a scalar or
        one-element input would be silently broadcast to every feature.
    """
    if std is None:
        std = [  5.78894742,   4.60483703, 146.83262808, 238.75573511,
                 1.02875985,   1.41925462,  11.94819279,   1.        ,
                 2.478264  ,   1.        ]
    if mean is None:
        mean = [ 38.65624111,  84.24936025,  83.19277794, 645.22064259,
                 2.11094987,   0.70699405,   9.38470287,   0.        ,
                 0.75604208,   0.        ]
    std = np.asarray(std, dtype=float)
    mean = np.asarray(mean, dtype=float)

    if mean.shape != std.shape:
        raise ValueError(
            f"mean and std must have the same shape, got {mean.shape} and {std.shape}")
    if np.shape(data) != mean.shape:
        raise ValueError(
            f"expected input of shape {mean.shape}, got {np.shape(data)}")

    data = (data - mean) / std
    df = pd.DataFrame([data])
    return df

In [24]:
# Example: standardise one raw feature vector (ten values; presumably in the
# same order as the scaled feature columns - confirm against scale_data).
process_input([50,70,5,300,5,4,3,0,2,0])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,1.959555,-3.094433,-0.53253,-1.445916,2.808284,2.320236,-0.534366,0.0,0.501947,0.0


The following is done by Yuekai Xu

# Training and Validation

In [25]:
# Load the scaled, labelled table. Note that it is read from the input
# dataset under /kaggle/input, not from the copy written to the working
# directory earlier in this notebook.
data = pd.read_csv('/kaggle/input/inrix-hack-dataset/parking_safe_labeled_scaled.csv')

In [26]:
# Separate the ten feature columns (X) from the rating target (y) and show
# the target.
X = data[['pct', 'probability', 'available', 'distance', 
                 'price', 'stars','construction', 'events', 'congestion', 'hazards']]
y = data['rating']
y

0       1
1       1
2       1
3       3
4       4
       ..
3512    1
3513    2
3514    2
3515    5
3516    2
Name: rating, Length: 3517, dtype: int64

In [27]:
# Hold out 20% of the rows as a test set; X and y become the training part.
# The split is taken from `data` (using X's columns) rather than from X/y
# themselves, so re-running this cell does not split the training part again.
X, X_test, y, y_test = train_test_split(data[X.columns], data['rating'], test_size=0.2, random_state=42)

In [28]:
# 10-fold cross-validation splitter; rows are shuffled with a fixed seed so
# the folds are the same on every run.
kf = KF(n_splits = 10, shuffle=True, random_state=42)

In [29]:
# Compare four regressors with 10-fold cross-validation on the training part
# (X, y) and report the mean squared error per fold and on average.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42)
}

for name, model in models.items():
    mse_list = []
    for count, (train_index, val_index) in enumerate(kf.split(X), start=1):
        # Fold-local names: the validation fold must not overwrite the
        # hold-out X_test / y_test produced by train_test_split.
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)
        mse_list.append(mse)
        print(f"{name} Mean Squared Error: {mse}, KF: {count}")

    # Aggregate performance
    avg_mse = sum(mse_list) / len(mse_list)
    print(f"{name} Average Mean Squared Error: {avg_mse}\n")

Linear Regression Mean Squared Error: 0.25602241685658605, KF: 1
Linear Regression Mean Squared Error: 0.3034685811552478, KF: 2
Linear Regression Mean Squared Error: 0.23144467408834699, KF: 3
Linear Regression Mean Squared Error: 0.27120821373009324, KF: 4
Linear Regression Mean Squared Error: 0.3113689208964049, KF: 5
Linear Regression Mean Squared Error: 0.27276259263395813, KF: 6
Linear Regression Mean Squared Error: 0.2666251848547833, KF: 7
Linear Regression Mean Squared Error: 0.3412158335717064, KF: 8
Linear Regression Mean Squared Error: 0.29475909804191786, KF: 9
Linear Regression Mean Squared Error: 0.3268518575238732, KF: 10
Linear Regression Average Mean Squared Error: 0.28757273733529176

Random Forest Mean Squared Error: 0.10620709219858157, KF: 1
Random Forest Mean Squared Error: 0.09762021276595745, KF: 2
Random Forest Mean Squared Error: 0.09248120567375887, KF: 3
Random Forest Mean Squared Error: 0.10131103202846975, KF: 4
Random Forest Mean Squared Error: 0.0950395

In [30]:
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

# After the comparison loop, `model` is bound to the last entry of `models`
# (the decision tree, fitted on a single fold), so it must not be saved under
# the random-forest file name. Pick the random forest explicitly and refit it
# on the whole training part before persisting it.
model_filename = 'randomForest.joblib'
random_forest = models["Random Forest"]
random_forest.fit(X, y)
dump(random_forest, model_filename)

['randomForest.joblib']