In [3]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/summer-school-2022-task-2/train_data.csv
/kaggle/input/summer-school-2022-task-2/test_data.csv
/kaggle/input/summer-school-2022-task-2/sample_submssions.csv


In [4]:
train = pd.read_csv('/kaggle/input/summer-school-2022-task-2/train_data.csv')
test = pd.read_csv('/kaggle/input/summer-school-2022-task-2/test_data.csv')

In [5]:
def reduce_mem_usage(df):
    """ iterate through all the columns of a dataframe and modify the data type
        to reduce memory usage.        
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    
    return df

In [6]:
train = reduce_mem_usage(train)
test = reduce_mem_usage(test)

Memory usage of dataframe is 25.48 MB
Memory usage after optimization is: 6.18 MB
Decreased by 75.7%
Memory usage of dataframe is 5.97 MB
Memory usage after optimization is: 1.36 MB
Decreased by 77.3%


In [7]:
train = train.sample(50000, random_state = 1)

In [8]:
!pip install pycaret --ignore-installed llvmlite

Collecting pycaret
  Downloading pycaret-2.3.10-py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m320.2/320.2 kB[0m [31m500.2 kB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting llvmlite
  Downloading llvmlite-0.39.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m34.6/34.6 MB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting plotly>=4.4.1
  Downloading plotly-5.10.0-py2.py3-none-any.whl (15.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.2/15.2 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting scikit-learn==0.23.2
  Downloading scikit_learn-0.23.2-cp37-cp37m-manylinux1_x86_64.whl (6.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m15.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hCollecting mlxtend>=0.17.0
  Downloadi

In [10]:
from pycaret.regression import *

In [11]:
train.columns

Index(['id', 'seller', 'offer_type', 'price', 'abtest', 'vehicle_type',
       'year_of_registration', 'gearbox', 'power', 'model', 'kilometer',
       'month_of_registration', 'fuel_type', 'brand', 'not_repaired_damage',
       'postal_code'],
      dtype='object')

In [12]:
print(train.shape)

(50000, 16)


In [13]:
reg = setup(data = train, train_size = 0.9, target = 'price',
              session_id = 100, 
              normalize = True,
              transformation = True, 
              remove_multicollinearity = True,
              multicollinearity_threshold = 0.85,
              remove_outliers = True,
              handle_unknown_categorical = True,
              ignore_low_variance = True,
              create_clusters = True,
              feature_selection = True,
              feature_selection_threshold = 0.7,
              log_experiment = True,
              fold = 5,
              n_jobs = -1,
              use_gpu = True,
              silent = True,
              ignore_features = ['id', 'postal_code'])

Unnamed: 0,Description,Value
0,session_id,100
1,Target,price
2,Original Data,"(50000, 16)"
3,Missing Values,False
4,Numeric Features,4
5,Categorical Features,9
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(42750, 204)"


2022/08/21 05:48:10 INFO mlflow.tracking.fluent: Experiment with name 'reg-default-name' does not exist. Creating a new experiment.


PicklingError: Can't pickle <class 'pyod.models.pca.PCA'>: it's not found as pyod.models.pca.PCA

In [14]:
best = create_model('huber')

Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,5863.9139,23405549539.3562,152988.7236,0.0019,1.3532,8.5056
1,3480.7588,56291918.7656,7502.7941,0.347,1.3411,6.0089
2,3386.6987,39986944.746,6323.5231,0.408,1.3523,8.7457
3,15009.1308,1146212521258.9363,1070613.152,-0.0001,1.3649,6.4503
4,3669.5954,129864250.9105,11395.7997,0.2129,1.3423,5.7551
Mean,6282.0195,233968842782.5429,249764.7985,0.1939,1.3508,7.0931
Std,4458.5197,456211330045.069,414229.8736,0.1698,0.0086,1.2732


In [15]:
print(best)

HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
               tol=1e-05, warm_start=False)


In [16]:
evaluate_model(best)

Unnamed: 0,Parameters
alpha,0.0001
epsilon,1.35
fit_intercept,True
max_iter,100
tol,1e-05
warm_start,False


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [17]:
predict_model(best)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Huber Regressor,3604.7613,114556684.3609,10703.1156,0.2409,1.3495,3.7425


Unnamed: 0,model_tiguan,model_vectra,model_octavia,data_cluster_10,model_micra,brand_seat,model_q5,model_4_reihe,model_accord,model_twingo,...,brand_lancia,model_911,model_a3,model_lupo,model_c2,brand_saab,model_m_reihe,model_galaxy,price,Label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,19900,9701.617199
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,16500,14554.289870
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2999,2732.732961
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1600,895.351797
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,23700,8446.318234
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4000,6516.325147
4996,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,7800,5663.874050
4997,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,550,1204.052411
4998,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1750,197.039166


In [18]:
finalize_model(best) # trains the model on entire train set including val set

HuberRegressor(alpha=0.0001, epsilon=1.35, fit_intercept=True, max_iter=100,
               tol=1e-05, warm_start=False)

In [21]:
save_model(best, 'best_model_pycaret_2_huber')

PicklingError: Can't pickle <class 'pyod.models.pca.PCA'>: it's not found as pyod.models.pca.PCA

In [22]:
save_config('my_config_pycaret1_car_dataset')

PicklingError: Can't pickle <class 'pyod.models.pca.PCA'>: it's not found as pyod.models.pca.PCA

In [23]:
preds = predict_model(best,data = test)

In [24]:
preds

Unnamed: 0,id,seller,offer_type,abtest,vehicle_type,year_of_registration,gearbox,power,model,kilometer,month_of_registration,fuel_type,brand,not_repaired_damage,postal_code,Label
0,0,privat,Angebot,control,kleinwagen,2000,manuell,75,fiesta,150000,3,benzin,ford,nein,54294,1235.712824
1,1,privat,Angebot,test,kombi,1995,automatik,0,e_klasse,150000,3,benzin,mercedes_benz,ja,21220,-2540.848579
2,2,privat,Angebot,control,kleinwagen,1998,manuell,54,corsa,125000,8,benzin,opel,nein,40233,2006.835421
3,3,privat,Angebot,control,bus,2009,manuell,163,sprinter,125000,4,diesel,mercedes_benz,nein,93179,7511.743467
4,4,privat,Angebot,test,kombi,2007,manuell,87,logan,150000,5,benzin,dacia,nein,56862,1867.074459
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52187,52187,privat,Angebot,test,limousine,2009,manuell,101,focus,60000,12,benzin,ford,nein,30625,9166.648209
52188,52188,privat,Angebot,test,limousine,1991,manuell,75,e_klasse,150000,9,diesel,mercedes_benz,nein,26723,1199.688779
52189,52189,privat,Angebot,test,cabrio,2000,automatik,54,fortwo,150000,5,benzin,smart,nein,56729,185.006339
52190,52190,privat,Angebot,test,limousine,2014,automatik,230,golf,30000,8,benzin,volkswagen,nein,31303,17843.386499


In [25]:
submission = pd.DataFrame()
submission['id'] = test['id']
submission['price'] = preds['Label']
submission.to_csv("submission.csv", index = False)