In [24]:
import keras.models
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split as split
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from keras.models import Model
from keras.layers import Dense, Dropout, Lambda, BatchNormalization
from keras.utils.version_utils import callbacks
from matplotlib import pyplot as plt
from keras.callbacks import EarlyStopping
from keras import Sequential

# --- Load -------------------------------------------------------------------
df = pd.read_csv('preprocessed_v.csv')
df = df.drop(columns='Unnamed: 0')
# Missing values become the literal string 'null' so that they are treated as
# an ordinary category by the encoders below.
df = df.fillna('null')

# --- Price segments (classification target) ---------------------------------
# 1: (0, 10000]   2: (10000, 20000]   3: > 20000
# Rows matching none of the conditions get np.select's default value, 0.
prices = [
    (df['price'] > 0) & (df['price'] <= 10000),
    (df['price'] > 10000) & (df['price'] <= 20000),
    df['price'] > 20000,
]
segments = [1, 2, 3]
df['segments'] = np.select(prices, segments)

# Work on a copy so `df` keeps the un-encoded values.
# (No dropna() here: after the fillna above there is nothing left to drop.)
encoded_data = df.copy()

# --- Numeric features --------------------------------------------------------
# scalers[0] -> 'year', scalers[1] -> 'odometer'; the fitted scalers are kept
# so the transformation can be reused or inverted later.
# NOTE(review): the scalers are fitted on all rows, i.e. before the train/test
# split below, so test-set statistics leak into the scaling.
scalers = [StandardScaler(), StandardScaler()]
for scaler, column in zip(scalers, ('year', 'odometer')):
    encoded_data[column] = scaler.fit_transform(encoded_data[column].values.reshape(-1, 1)).ravel()

# --- Categorical features ----------------------------------------------------
cols = ['manufacturer', 'fuel', 'condition', 'drive', 'cylinders', 'transmission', 'type', 'paint_color', 'model']

# Everything except 'model' is one-hot encoded; 'model' is label-encoded below.
one_hot_cols = [c for c in cols if c != 'model']
# NOTE(review): dummy_na=True is kept from the original call; because NaN was
# already replaced by 'null', the extra *_nan columns it adds are all zeros.
encoded_data = pd.get_dummies(encoded_data, dummy_na=True, columns=one_hot_cols)
encoders = []

encoded_data['model'] = LabelEncoder().fit_transform(encoded_data['model'])
df_for_1_model = encoded_data.copy()

# --- Train/test splits -------------------------------------------------------
# Classification task: predict the price segment (price itself is excluded).
X_train_classify, X_test_classify, y_train_classify, y_test_classify = split(
    encoded_data.drop(columns=['price', 'segments']), encoded_data['segments'],
    train_size=0.8, random_state=4222)

# Regression task: predict the price; 'segments' stays in as a feature.
X_train_segmented, X_test_segmented, y_train_segmented, y_test_segmented = split(
    encoded_data.drop(columns=['price']), encoded_data['price'],
    train_size=0.8, random_state=4222)

# Later cells write classifier predictions positionally into X_test_segmented,
# which is only valid when both splits selected exactly the same rows.
assert X_train_classify.index.equals(X_train_segmented.index), "train splits are not row-aligned"
assert X_test_classify.index.equals(X_test_segmented.index), "test splits are not row-aligned"

y_test_segmented.shape

(24488,)

In [25]:
# Compare three classifiers on the price-segment labels.
# r2 / rmse / mae are regression-style scores applied to the integer segment
# labels, so they reflect how far a predicted segment is from the true one.
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeClassifier


def _rounded_scores(y_true, y_hat):
    """Return (r2, rmse, mae) for the given predictions, each rounded to 2 decimals."""
    return (
        round(metrics.r2_score(y_true, y_hat), 2),
        round(np.sqrt(metrics.mean_squared_error(y_true, y_hat)), 2),
        round(metrics.mean_absolute_error(y_true, y_hat), 2),
    )


# Rows are collected first and the frame is built once at the end
# (DataFrame.append no longer exists in pandas >= 2.0).
score_labels = []
score_records = []

# random_state matches the seed used for the train/test split so that the
# table is reproducible on Restart & Run All.
model = DecisionTreeClassifier(criterion='entropy', random_state=4222)
model.fit(X_train_classify, y_train_classify)
y_pred_cl = model.predict(X_test_classify)
r2, rmse, mae = _rounded_scores(y_test_classify, y_pred_cl)
score_labels.append('Decision Tree (entropy) Classification')
score_records.append({'r2': r2, 'rmse': rmse, 'mae': mae})

model = DecisionTreeClassifier(criterion='gini', random_state=4222)
model.fit(X_train_classify, y_train_classify)
y_pred_cl = model.predict(X_test_classify)
r2, rmse, mae = _rounded_scores(y_test_classify, y_pred_cl)
score_labels.append('Decision Tree (gini) Classification')
score_records.append({'r2': r2, 'rmse': rmse, 'mae': mae})

# The random-forest predictions are kept under their own name (`y_pred`);
# `y_pred_cl` still holds the gini decision-tree predictions.
model = RandomForestClassifier(random_state=4222)
model.fit(X_train_classify, y_train_classify)
y_pred = model.predict(X_test_classify)
r2, rmse, mae = _rounded_scores(y_test_classify, y_pred)
score_labels.append('Random Forest Classification')
score_records.append({'r2': r2, 'rmse': rmse, 'mae': mae})

accuracy_df = pd.DataFrame(score_records, index=score_labels, columns=['r2', 'rmse', 'mae'])

# Side-by-side comparison of the true segments with the model's predictions.
# NOTE(review): this uses y_pred_cl (gini decision tree), not the random forest.
real_vs_predicted_segments = pd.DataFrame({"y": y_test_classify, "y_pred": y_pred_cl})
real_vs_predicted_segments

Unnamed: 0,y,y_pred
111913,1,1
113030,3,3
51949,1,1
4159,3,3
65491,3,3
...,...,...
122252,3,3
73909,1,1
119140,2,2
20371,3,3


In [26]:
# Display the score table (r2 / rmse / mae, one row per segment classifier).
accuracy_df

Unnamed: 0,r2,rmse,mae
Decision Tree (entropy) Classification,0.84,0.34,0.11
Decision Tree (gini) Classification,0.83,0.34,0.11
Random Forest Classification,0.88,0.29,0.08


In [29]:
# Price regression with the *predicted* segment as a feature, for several
# forest sizes.
from sklearn.ensemble import RandomForestRegressor

# Hidden-state guard: `y_pred` must still hold classifier segment labels here.
# If it was overwritten with price predictions by another cell, fail loudly
# instead of silently writing prices into the 'segments' feature.
assert set(np.unique(y_pred)) <= set(np.unique(y_train_classify)), (
    "y_pred does not contain segment labels; re-run the classification cell first")

# The regressor is trained on the true segments but evaluated on the
# classifier's predictions. assign() rebinds the name instead of mutating the
# frame in place.
X_test_segmented = X_test_segmented.assign(segments=y_pred)

# Rows are collected and the frame is built once after the loop
# (DataFrame.append no longer exists in pandas >= 2.0).
forest_labels = []
forest_records = []
for i in range(1, 51, 10):  # n_estimators = 1, 11, 21, 31, 41
    rf = RandomForestRegressor(n_estimators=i, criterion="squared_error", random_state=4222)
    rf.fit(X_train_segmented, y_train_segmented)
    # Separate name so the classifier labels in `y_pred` are not clobbered and
    # the cell can be re-run safely.
    y_pred_price = rf.predict(X_test_segmented)
    r2 = round(metrics.r2_score(y_test_segmented, y_pred_price), 2)
    rmse = round(np.sqrt(metrics.mean_squared_error(y_test_segmented, y_pred_price)), 2)
    mae = round(metrics.mean_absolute_error(y_test_segmented, y_pred_price), 2)
    forest_labels.append('Random Forest REG n = ' + str(i))
    forest_records.append({'r2': r2, 'rmse': rmse, 'mae': mae})

# After the loop `rf` is the last forest that was fitted (n_estimators=41).
random_forest_segmented = pd.DataFrame(forest_records, index=forest_labels, columns=['r2', 'rmse', 'mae'])
random_forest_segmented

Unnamed: 0,r2,rmse,mae
Random Forest REG n = 1,0.89,4053.11,1945.43
Random Forest REG n = 11,0.92,3423.37,1652.88
Random Forest REG n = 21,0.92,3396.1,1634.05
Random Forest REG n = 31,0.92,3388.33,1621.92
Random Forest REG n = 41,0.92,3382.52,1621.78


In [30]:
# Two-stage pipeline: a decision tree predicts the price segment, then a
# random forest predicts the price using that predicted segment as a feature.
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeClassifier

# Rows are collected and the frame is built once at the end
# (DataFrame.append no longer exists in pandas >= 2.0).
combined_labels = []
combined_records = []

# --- Stage 1: segment classifier --------------------------------------------
dt = DecisionTreeClassifier(criterion="gini", max_depth=41, random_state=4222)
dt.fit(X_train_classify, y_train_classify)
y_pred_test = dt.predict(X_test_classify)
r2 = round(metrics.r2_score(y_test_classify, y_pred_test), 2)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test_classify, y_pred_test)), 2)
mae = round(metrics.mean_absolute_error(y_test_classify, y_pred_test), 2)
combined_labels.append('Decision Tree Class')
combined_records.append({'r2': r2, 'rmse': rmse, 'mae': mae})

# --- Stage 2: price regressor -----------------------------------------------
# Replace the segment feature of the test rows with the tree's predictions.
X_test_segmented = X_test_segmented.assign(segments=y_pred_test)

# Fit the regressor here instead of relying on whatever `rf` was left over
# from another cell; 41 trees matches the largest forest tried previously.
rf = RandomForestRegressor(n_estimators=41, criterion="squared_error", random_state=4222)
rf.fit(X_train_segmented, y_train_segmented)

# From here on `y_pred` holds price predictions, not segment labels.
y_pred = rf.predict(X_test_segmented)
r2 = round(metrics.r2_score(y_test_segmented, y_pred), 2)
rmse = round(np.sqrt(metrics.mean_squared_error(y_test_segmented, y_pred)), 2)
mae = round(metrics.mean_absolute_error(y_test_segmented, y_pred), 2)
combined_labels.append('Random Forest REG')
combined_records.append({'r2': r2, 'rmse': rmse, 'mae': mae})

combined_res = pd.DataFrame(combined_records, index=combined_labels, columns=['r2', 'rmse', 'mae'])
combined_res

Unnamed: 0,r2,rmse,mae
Decision Tree Class,0.83,0.34,0.11
Random Forest REG,0.92,3397.3,1628.24
