In [75]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for root, _subdirs, files in os.walk('/kaggle/input'):
    for fname in files:
        full_path = os.path.join(root, fname)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/parking/parking_safe_labeled_scaled.csv


In [76]:
from sklearn.model_selection import KFold as KF, train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error

In [77]:
# Read the parking CSV from the Kaggle input directory into a DataFrame.
data = pd.read_csv(
    filepath_or_buffer='/kaggle/input/parking/parking_safe_labeled_scaled.csv',
)

In [78]:
# Columns used as model inputs; 'rating' is the value the models are asked to predict.
feature_columns = [
    'pct', 'probability', 'available', 'distance', 'price',
    'stars', 'construction', 'events', 'congestion', 'hazards',
]
X = data[feature_columns]
y = data['rating']
# Last expression of the cell: display the target series.
y

0       1
1       1
2       1
3       3
4       4
       ..
3512    1
3513    2
3514    2
3515    5
3516    2
Name: rating, Length: 3517, dtype: int64

In [79]:
# Show the dimensions of the feature frame as (n_rows, n_columns).
X.shape

(3517, 10)

In [80]:
# Hold out 20% of the rows as a final test set; X / y are rebound to the
# remaining 80% and are what the cross-validation below iterates over.
#
# The inputs are re-derived from the untouched `data` frame (same columns as X,
# same target column as y) instead of from X / y themselves. Because this cell
# rebinds X and y, splitting "X, y" directly would carve another 20% off an
# already reduced training set every time the cell is re-run. On a first run
# the result is identical to splitting X and y.
X, X_test, y, y_test = train_test_split(
    data[list(X.columns)],
    data[y.name],
    test_size=0.2,
    random_state=42,
)

In [81]:
# Shared 10-fold splitter; shuffling with a fixed seed makes the folds repeatable.
kf = KF(
    n_splits=10,
    shuffle=True,
    random_state=42,
)

In [82]:
# Candidate regressors, each scored with the shared K-fold splitter `kf`
# on the training portion (X, y).
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "Decision Tree": DecisionTreeRegressor(random_state=42)
}

for name, model in models.items():
    mse_list = []
    # Fold-local validation names (X_val / y_val) are used on purpose: binding
    # the fold slices to X_test / y_test would overwrite the hold-out split
    # created by train_test_split and make it unrecoverable.
    for count, (train_index, val_index) in enumerate(kf.split(X), start=1):
        X_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # fit() retrains the estimator from scratch on this fold's training rows.
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

        mse = mean_squared_error(y_val, y_pred)
        mse_list.append(mse)
        print(f"{name} Mean Squared Error: {mse}, KF: {count}")

    # Aggregate performance
    avg_mse = sum(mse_list) / len(mse_list)
    print(f"{name} Average Mean Squared Error: {avg_mse}\n")


Linear Regression Mean Squared Error: 0.25602241685658605, KF: 1
Linear Regression Mean Squared Error: 0.3034685811552478, KF: 2
Linear Regression Mean Squared Error: 0.23144467408834699, KF: 3
Linear Regression Mean Squared Error: 0.27120821373009324, KF: 4
Linear Regression Mean Squared Error: 0.3113689208964049, KF: 5
Linear Regression Mean Squared Error: 0.27276259263395813, KF: 6
Linear Regression Mean Squared Error: 0.2666251848547833, KF: 7
Linear Regression Mean Squared Error: 0.3412158335717064, KF: 8
Linear Regression Mean Squared Error: 0.29475909804191786, KF: 9
Linear Regression Mean Squared Error: 0.3268518575238732, KF: 10
Linear Regression Average Mean Squared Error: 0.28757273733529176

Random Forest Mean Squared Error: 0.10620709219858157, KF: 1
Random Forest Mean Squared Error: 0.09762021276595745, KF: 2
Random Forest Mean Squared Error: 0.09248120567375887, KF: 3
Random Forest Mean Squared Error: 0.10131103202846975, KF: 4
Random Forest Mean Squared Error: 0.0950395

In [83]:
from sklearn.ensemble import RandomForestRegressor
from joblib import dump

model_filename = 'randomForest.joblib'

# After the comparison loop, `model` is still bound to the last entry of
# `models` (the Decision Tree), fitted only on the final fold's training rows.
# Select the Random Forest explicitly and refit it on the whole training split
# so the saved file contains the estimator its name promises.
model = models["Random Forest"]
model.fit(X, y)

# dump() returns the list of file names written; as the last expression it is
# shown as the cell output.
dump(model, model_filename)

['randomForest.joblib']

In [84]:
def process_input(data):
    """Standardize one raw feature vector and return it as a one-row DataFrame.

    Each value is transformed as ``(value - mean) / std`` using the constants
    hard-coded below.

    Parameters
    ----------
    data : sequence of 10 numbers
        One observation. The order is assumed to match the training feature
        columns (pct, probability, available, distance, price, stars,
        construction, events, congestion, hazards) -- TODO confirm that the
        mean/std constants were computed in this same order.

    Returns
    -------
    pandas.DataFrame
        A single row of standardized values whose columns carry the feature
        names, so the frame lines up with estimators fitted on the named
        training columns.

    Raises
    ------
    ValueError
        If ``data`` does not contain exactly ten values.
    """
    feature_names = ['pct', 'probability', 'available', 'distance', 'price',
                     'stars', 'construction', 'events', 'congestion', 'hazards']
    std = np.array([  5.78894742,   4.60483703, 146.83262808, 238.75573511,
         1.02875985,   1.41925462,  11.94819279,   1.        ,
         2.478264  ,   1.        ])
    mean = np.array([ 38.65624111,  84.24936025,  83.19277794, 645.22064259,
         2.11094987,   0.70699405,   9.38470287,   0.        ,
         0.75604208,   0.        ])

    values = np.asarray(data, dtype=float)
    # Reject anything that is not a flat vector of the expected length; without
    # this check a scalar or length-1 input would silently broadcast to 10 values.
    if values.shape != mean.shape:
        raise ValueError(
            f"expected {mean.shape[0]} feature values, got input of shape {values.shape}"
        )

    scaled = (values - mean) / std
    return pd.DataFrame([scaled], columns=feature_names)

In [85]:
# One example observation, standardized by process_input before prediction.
raw_example = [50, 70, 5, 300, 5, 4, 3, 0, 2, 0]
df = process_input(raw_example)
# Last expression of the cell: the prediction array returned by `model`.
model.predict(df)



array([5.])