In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import scipy.stats as stats
import pyforest
from sklearn.preprocessing import (
    StandardScaler,
    PolynomialFeatures,
    OneHotEncoder,
    StandardScaler,
    PowerTransformer,
    MinMaxScaler,
    LabelEncoder,
    RobustScaler,
)
from sklearn.model_selection import (
    RepeatedStratifiedKFold,
    KFold,
    cross_val_predict,
    train_test_split,
    GridSearchCV,
    cross_val_score,
    cross_validate,
)
from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet
from sklearn.metrics import (
    plot_confusion_matrix,
    r2_score,
    mean_absolute_error,
    mean_squared_error,
    classification_report,
    confusion_matrix,
    accuracy_score,
    classification_report,
)
from sklearn.metrics import (
    make_scorer,
    precision_score,
    precision_recall_curve,
    plot_precision_recall_curve,
    plot_roc_curve,
    roc_auc_score,
    roc_curve,
    f1_score,
    accuracy_score,
    recall_score,
)
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
from sklearn.ensemble import (
    RandomForestRegressor,
    RandomForestClassifier,
    GradientBoostingRegressor,
    ExtraTreesRegressor,
    AdaBoostClassifier,
)
from sklearn.feature_selection import (
    SelectKBest,
    SelectPercentile,
    f_classif,
    f_regression,
    mutual_info_regression,
)
from xgboost import XGBRegressor, XGBClassifier
from xgboost import plot_importance
from sklearn.pipeline import Pipeline
from sklearn.tree import plot_tree
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# importing plotly and cufflinks in offline mode
import cufflinks as cf
import plotly.offline

cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

import warnings

warnings.filterwarnings("ignore")
warnings.warn("this will not show")
plt.rcParams["figure.figsize"] = (10, 6)
pd.set_option("max_colwidth", 200)

# pd.set_option('display.max_rows', 100) # if you wish to see more rows rather than default, just uncomment this line.
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)
pd.set_option("display.float_format", lambda x: "%.3f" % x)

import colorama
from colorama import Fore, Style  # maakes strings colored

# !pip3 install termcolor
from termcolor import colored

In [2]:
df = pd.read_csv("boat_data.csv", encoding="UTF-8")

In [3]:
df.head(20)

Unnamed: 0,Price,Boat Type,Manufacturer,Type,Year Built,Length,Width,Material,Location,Number of views last 7 days
0,CHF 3337,Motor Yacht,Rigiflex power boats,new boat from stock,2017,4.0,1.9,,Switzerland Â» Lake Geneva Â» VÃ©senaz,226
1,EUR 3490,Center console boat,Terhi power boats,new boat from stock,2020,4.0,1.5,Thermoplastic,Germany Â» BÃ¶nningstedt,75
2,CHF 3770,Sport Boat,Marine power boats,new boat from stock,0,3.69,1.42,Aluminium,Switzerland Â» Lake of Zurich Â» StÃ¤fa ZH,124
3,DKK 25900,Sport Boat,Pioner power boats,new boat from stock,2020,3.0,1.0,,Denmark Â» Svendborg,64
4,EUR 3399,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,Germany Â» Bayern Â» MÃ¼nchen,58
5,CHF 3650,Sport Boat,Linder power boats,new boat from stock,0,4.03,1.56,Aluminium,Switzerland Â» Lake Constance Â» Uttwil,132
6,CHF 3600,Catamaran,,"Used boat,Unleaded",1999,6.2,2.38,Aluminium,Switzerland Â» Neuenburgersee Â» Yvonand,474
7,DKK 24800,Sport Boat,,Used boat,0,3.0,,,Denmark Â» Svendborg,134
8,EUR 3333,Fishing Boat,Crescent power boats,new boat from stock,2019,3.64,1.37,,Germany Â» Bayern Â» Boote+service Oberbayern,45
9,EUR 3300,Pontoon Boat,Whaly power boats,new boat from stock,2018,4.35,1.73,,Italy Â» Dormelletto,180


In [4]:
df.shape

(9888, 10)

In [5]:
df.isna().sum()

Price                             0
Boat Type                         0
Manufacturer                   1338
Type                              6
Year Built                        0
Length                            9
Width                            56
Material                       1749
Location                         36
Number of views last 7 days       0
dtype: int64

In [6]:
df[["Manufacturer", "Material", "Location", "Type"]] = df[
    ["Manufacturer", "Material", "Location", "Type"]
].fillna("Other")
df[["Width", "Length"]] = df[["Width", "Length"]].fillna(0)

In [7]:
df["Currency"] = df["Price"].str[:3]

In [8]:
df["Currency"].value_counts()

EUR    8430
CHF     980
Â£      298
DKK     180
Name: Currency, dtype: int64

In [9]:
df["Price"] = df["Price"].str[3:]
df["Price"] = df["Price"].astype(np.int32)

In [10]:
currency_exchange = {"EUR": 1, "CHF": 0.96, "Â£ ": 1.17, "DKK": 0.13}

In [11]:
df["EUR Price"] = df.apply(
    lambda x: x["Price"] * currency_exchange.get(x["Currency"]), axis=1
)

In [12]:
df.dtypes

Price                            int32
Boat Type                       object
Manufacturer                    object
Type                            object
Year Built                       int64
Length                         float64
Width                          float64
Material                        object
Location                        object
Number of views last 7 days      int64
Currency                        object
EUR Price                      float64
dtype: object

In [13]:
df.head(30)

Unnamed: 0,Price,Boat Type,Manufacturer,Type,Year Built,Length,Width,Material,Location,Number of views last 7 days,Currency,EUR Price
0,3337,Motor Yacht,Rigiflex power boats,new boat from stock,2017,4.0,1.9,Other,Switzerland Â» Lake Geneva Â» VÃ©senaz,226,CHF,3203.52
1,3490,Center console boat,Terhi power boats,new boat from stock,2020,4.0,1.5,Thermoplastic,Germany Â» BÃ¶nningstedt,75,EUR,3490.0
2,3770,Sport Boat,Marine power boats,new boat from stock,0,3.69,1.42,Aluminium,Switzerland Â» Lake of Zurich Â» StÃ¤fa ZH,124,CHF,3619.2
3,25900,Sport Boat,Pioner power boats,new boat from stock,2020,3.0,1.0,Other,Denmark Â» Svendborg,64,DKK,3367.0
4,3399,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,Germany Â» Bayern Â» MÃ¼nchen,58,EUR,3399.0
5,3650,Sport Boat,Linder power boats,new boat from stock,0,4.03,1.56,Aluminium,Switzerland Â» Lake Constance Â» Uttwil,132,CHF,3504.0
6,3600,Catamaran,Other,"Used boat,Unleaded",1999,6.2,2.38,Aluminium,Switzerland Â» Neuenburgersee Â» Yvonand,474,CHF,3456.0
7,24800,Sport Boat,Other,Used boat,0,3.0,0.0,Other,Denmark Â» Svendborg,134,DKK,3224.0
8,3333,Fishing Boat,Crescent power boats,new boat from stock,2019,3.64,1.37,Other,Germany Â» Bayern Â» Boote+service Oberbayern,45,EUR,3333.0
9,3300,Pontoon Boat,Whaly power boats,new boat from stock,2018,4.35,1.73,Other,Italy Â» Dormelletto,180,EUR,3300.0


In [14]:
df["Country"] = df["Location"].str.split(" ", 1, expand=True)[0]

In [15]:
df.head()

Unnamed: 0,Price,Boat Type,Manufacturer,Type,Year Built,Length,Width,Material,Location,Number of views last 7 days,Currency,EUR Price,Country
0,3337,Motor Yacht,Rigiflex power boats,new boat from stock,2017,4.0,1.9,Other,Switzerland Â» Lake Geneva Â» VÃ©senaz,226,CHF,3203.52,Switzerland
1,3490,Center console boat,Terhi power boats,new boat from stock,2020,4.0,1.5,Thermoplastic,Germany Â» BÃ¶nningstedt,75,EUR,3490.0,Germany
2,3770,Sport Boat,Marine power boats,new boat from stock,0,3.69,1.42,Aluminium,Switzerland Â» Lake of Zurich Â» StÃ¤fa ZH,124,CHF,3619.2,Switzerland
3,25900,Sport Boat,Pioner power boats,new boat from stock,2020,3.0,1.0,Other,Denmark Â» Svendborg,64,DKK,3367.0,Denmark
4,3399,Fishing Boat,Linder power boats,new boat from stock,2019,3.55,1.46,Aluminium,Germany Â» Bayern Â» MÃ¼nchen,58,EUR,3399.0,Germany


In [16]:
df.describe()

Unnamed: 0,Price,Year Built,Length,Width,Number of views last 7 days,EUR Price
count,9888.0,9888.0,9888.0,9888.0,9888.0,9888.0
mean,320137.342,1893.193,11.559,3.5,149.161,302592.41
std,1007482.237,460.202,6.01,1.245,151.82,971361.018
min,3300.0,0.0,0.0,0.0,13.0,3203.52
25%,44000.0,1996.0,7.46,2.54,70.0,42900.0
50%,95000.0,2007.0,10.26,3.32,108.0,91942.0
75%,255000.0,2017.0,13.92,4.25,172.0,247568.28
max,31000000.0,2021.0,100.0,25.16,3263.0,31000000.0


In [17]:
df["Type"].value_counts()

Used boat,Diesel                4140
Used boat,Unleaded              1686
Used boat                       1462
new boat from stock,Unleaded    1107
new boat from stock              665
new boat from stock,Diesel       291
new boat on order,Unleaded       150
Display Model,Unleaded            75
new boat on order                 61
new boat on order,Diesel          61
Diesel                            57
Used boat,Electric                27
Unleaded                          22
Display Model,Diesel              19
new boat from stock,Electric      18
Display Model                     18
Used boat,Gas                     10
Other                              6
Display Model,Electric             6
new boat from stock,Gas            2
new boat from stock,Hybrid         1
Used boat,Hybrid                   1
Display Model,Gas                  1
Electric                           1
Used boat,Propane                  1
Name: Type, dtype: int64

In [18]:
type_set = set()

In [19]:
for cell in df["Type"]:
    for split in cell.split(","):
        type_set.add(split)

In [20]:
type_set

{'Diesel',
 'Display Model',
 'Electric',
 'Gas',
 'Hybrid',
 'Other',
 'Propane',
 'Unleaded',
 'Used boat',
 'new boat from stock',
 'new boat on order'}

In [21]:
condition_set = {'Display Model','Used boat','new boat from stock','new boat on order'}
fuel_type_set = {'Diesel','Electric','Gas','Hybrid','Propane','Unleaded'}

In [22]:
new_arr = []
for cell in df["Type"]:
    condition_type = fuel_type = "Other"
    for split in cell.split(","):
        if split in condition_set:
            condition_type = split
        elif split in fuel_type_set:
            fuel_type = split
        new_arr.append([condition_type, fuel_type])

In [23]:
df[["Condition","Fuel Type"]] = pd.DataFrame(new_arr)

In [24]:
df

Unnamed: 0,Price,Boat Type,Manufacturer,Type,Year Built,Length,Width,Material,Location,Number of views last 7 days,Currency,EUR Price,Country,Condition,Fuel Type
0,3337,Motor Yacht,Rigiflex power boats,new boat from stock,2017,4.000,1.900,Other,Switzerland Â» Lake Geneva Â» VÃ©senaz,226,CHF,3203.520,Switzerland,new boat from stock,Other
1,3490,Center console boat,Terhi power boats,new boat from stock,2020,4.000,1.500,Thermoplastic,Germany Â» BÃ¶nningstedt,75,EUR,3490.000,Germany,new boat from stock,Other
2,3770,Sport Boat,Marine power boats,new boat from stock,0,3.690,1.420,Aluminium,Switzerland Â» Lake of Zurich Â» StÃ¤fa ZH,124,CHF,3619.200,Switzerland,new boat from stock,Other
3,25900,Sport Boat,Pioner power boats,new boat from stock,2020,3.000,1.000,Other,Denmark Â» Svendborg,64,DKK,3367.000,Denmark,new boat from stock,Other
4,3399,Fishing Boat,Linder power boats,new boat from stock,2019,3.550,1.460,Aluminium,Germany Â» Bayern Â» MÃ¼nchen,58,EUR,3399.000,Germany,new boat from stock,Other
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9883,4900,Sport Boat,Sea Ray power boats,"Used boat,Unleaded",1987,6.300,2.440,Other,Switzerland Â» Lago Maggiore Â» Riazzino,1116,CHF,4704.000,Switzerland,new boat on order,Diesel
9884,4516,Sport Boat,Other,new boat from stock,0,4.170,1.680,GRP,Germany Â» Hamburg Â» HAMBURG,94,EUR,4516.000,Germany,Used boat,Other
9885,4499,Sport Boat,BlueCraft power boats,"new boat from stock,Unleaded",2020,4.400,1.800,GRP,Germany Â» Nordrhein-Westfalen Â» Wesel,354,EUR,4499.000,Germany,Used boat,Diesel
9886,4300,Pontoon Boat,Whaly power boats,new boat from stock,2018,4.370,1.890,Other,Italy Â» Dormelletto,266,EUR,4300.000,Italy,new boat from stock,Other
