In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Preprocessing
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, StratifiedKFold
import ydata_profiling as pp

# Modele
from sklearn.linear_model import LinearRegression, SGDRegressor, RidgeCV
from sklearn.svm import SVR, LinearSVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor 
from sklearn.ensemble import BaggingRegressor, AdaBoostRegressor, VotingRegressor 
from sklearn.neural_network import MLPRegressor
from sklearn import linear_model
from sklearn.tree import DecisionTreeRegressor
from mlxtend.feature_selection import SequentialFeatureSelector
   
from sklearn.linear_model import Lasso
 
from sklearn.preprocessing import PolynomialFeatures
import sklearn.model_selection
from sklearn.model_selection import cross_val_predict as cvp
from sklearn import metrics
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import xgboost as xgb
import lightgbm as lgb
import seaborn as sns

# Optymalizacja modeli
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe, space_eval

import warnings
warnings.filterwarnings("ignore")

In [None]:
# Share of rows held out as the validation part in the later train_test_split calls.
valid_part = 0.3
# Raw listings, read from the notebook's working directory.
train0 = pd.read_csv('vehicles.csv')
# Preview of the first 50 rows.
train0.head(50)

In [None]:
# Columns that are not used as model features; they are removed before any further processing.
drop_columns = ['url', 'region_url', 'region', 'manufacturer', 'model', 'lat', 'long', 'posting_date', 'state', 'county', 'size', 'description', 'title_status', 'VIN', 'image_url']
train0 = train0.drop(columns = drop_columns)
# Print the remaining columns with their non-null counts and dtypes.
train0.info()

In [None]:
train0.info()

In [None]:
# Per-column summary of train0 (name, non-null count, dtype), mirroring train0.info().
column_summary = {
    "Column": train0.columns,
    "Non-Null Count": train0.count(),
    "Dtype": train0.dtypes,
}
info_df = pd.DataFrame(column_summary).reset_index(drop=True)

# Draw the summary as a table on an otherwise empty figure.
fig, ax = plt.subplots(figsize=(10, 5))
for axis_mode in ('tight', 'off'):
    ax.axis(axis_mode)
ax.table(
    cellText=info_df.values,
    colLabels=info_df.columns,
    cellLoc='center',
    loc='center',
)

plt.show()

In [None]:
# Keep only the rows that have no missing value in any remaining column.
train0 = train0.dropna()
# Not the last expression of the cell, so this preview is not displayed.
train0.head(5)
train0.info()

In [None]:
train0['drive'].unique()

In [None]:
# Label-encode every column whose dtype is not one of the numeric dtypes listed below.
numerics = ['int8', 'int16', 'int32', 'int64', 'float16', 'float32', 'float64']
features = train0.columns.values.tolist()
categorical_columns = [name for name in features if train0[name].dtype not in numerics]

for col in categorical_columns:
    if col not in train0.columns:
        continue
    # Values are cast to str before the encoder sees them.
    as_text = list(train0[col].astype(str).values)
    le = LabelEncoder()
    train0[col] = le.fit_transform(as_text)

In [None]:
train0['drive'].unique()
train0.head(10)
train0.info()

In [None]:
# Store the year as an integer offset from 1900 (e.g. 2015 -> 115).
train0['year'] = (train0['year']-1900).astype(int)
# Cast the odometer readings to integers.
train0['odometer'] = train0['odometer'].astype(int)
train0.head(10)

In [None]:
train0.info()

In [None]:
train0['price'].value_counts()

In [None]:
# Keep listings priced strictly between 1000 and 40000 whose year offset is above 110
# (the year column holds year - 1900 at this point).
keep_rows = (
    (train0['price'] > 1000)
    & (train0['price'] < 40000)
    & (train0['year'] > 110)
)
train0 = train0[keep_rows]
# Bin the odometer into steps of 5000.
# NOTE(review): re-running this cell bins the already binned values again.
train0['odometer'] = train0['odometer'] // 5000

In [None]:
train0.corr()

In [None]:
train0.describe()

In [None]:
# Histogram of the price column (30 bins) with a KDE overlay.
plt.figure(figsize=(10, 6))
sns.histplot(train0['price'], bins=30, kde=True)
plt.title('Rozkład cen samochodów i ich częstotliwości')
plt.xlabel('Cena (1000-40000)')
plt.ylabel('Częstotliwość')
plt.show()

# Same histogram with a logarithmic y-axis.
plt.figure(figsize=(10, 6))
sns.histplot(train0['price'], bins=30, kde=True, log_scale=(False, True))  # False for x-axis, True for y-axis
plt.title('Log-Scaled Distribution of Vehicle Prices')
plt.xlabel('Price')
plt.ylabel('Log-Frequency')
plt.show()


# Boxplot of vehicle year (stored as an offset from 1900 by an earlier cell)
plt.figure(figsize=(10, 6))
sns.boxplot(x=train0['year'])
plt.title('Boxplot of Vehicle Years')
plt.xlabel('Year')
plt.show()

# Scatter plot of price vs odometer (odometer is binned in steps of 5000 by the filter cell)
plt.figure(figsize=(10, 6))
sns.scatterplot(x='odometer', y='price', data=train0)
plt.title('Price vs. Odometer Readings')
plt.xlabel('Odometer')
plt.ylabel('Price')
plt.show()

# Correlation heatmap restricted to price, year and odometer
plt.figure(figsize=(10, 8))
sns.heatmap(train0[['price', 'year', 'odometer']].corr(), annot=True, fmt=".2f", cmap='coolwarm', linewidths=.5, cbar_kws={"shrink": .8})
plt.title('Correlation Heatmap')
plt.show()

In [None]:
pp.ProfileReport(train0)

In [None]:
# Separate the prediction target (price) from the feature columns.
target_name = 'price'
train_target0 = train0[target_name]
train0 = train0.drop([target_name], axis=1)

In [None]:
# Hold out 20% of the rows as test0 / test_target0.
# NOTE(review): test0 / test_target0 are not referenced again in the visible notebook.
train0, test0, train_target0, test_target0 = train_test_split(train0, train_target0, test_size=0.2, random_state=0)

In [None]:
# Unscaled features/target, captured before the StandardScaler cell rebinds train0.
# The split below (trainb/testb/targetb/target_testb) feeds the boosting models.
train0b = train0
train_target0b = train_target0
trainb, testb, targetb, target_testb = train_test_split(train0b, train_target0b, test_size=valid_part, random_state=0)

In [None]:
# Standardise every feature column with StandardScaler.
# The original row index is passed through so the scaled frame stays label-aligned with
# train_target0; rebuilding the frame without it would renumber the rows 0..n-1 while
# the target keeps its original labels.
# NOTE(review): the scaler is fitted before the train/validation split, so validation rows
# contribute to the scaling statistics; test0 is not transformed here.
scaler = StandardScaler()
train0 = pd.DataFrame(scaler.fit_transform(train0), columns = train0.columns, index = train0.index)

In [None]:
train0.head(3)

In [None]:
len(train0)

In [None]:
# Validation split of the scaled features. It uses the same test_size and random_state as
# the unscaled trainb/testb split, so both splits should select the same rows.
train, test, target, target_test = train_test_split(train0, train_target0, test_size=valid_part, random_state=0)

In [None]:
# Score collectors filled by acc_model / acc_boosting_model: one entry per model for
# R2, relative error (d) and RMSE, each on the train and the validation part.
acc_train_r2, acc_test_r2 = [], []
acc_train_d, acc_test_d = [], []
acc_train_rmse, acc_test_rmse = [], []

In [None]:
def acc_d(y_meas, y_pred):
    """Return the relative error sum(|y_meas - y_pred|) / sum(|y_meas|).

    This is the same quantity as ``MAE * n / sum(|y_meas|)``, computed with
    vectorised numpy operations so it also accepts plain lists and does not
    iterate over the data in Python.

    Parameters
    ----------
    y_meas : array-like of numbers
        Measured (true) values.
    y_pred : array-like of numbers
        Predicted values, same length as ``y_meas``.

    Returns
    -------
    float
        The relative error; ``inf`` or ``nan`` when every measured value is 0.

    Raises
    ------
    ValueError
        If the inputs are empty or their lengths differ.
    """
    measured = np.asarray(y_meas, dtype=float).ravel()
    predicted = np.asarray(y_pred, dtype=float).ravel()
    if measured.size == 0:
        raise ValueError("acc_d needs at least one sample")
    if measured.shape != predicted.shape:
        # Without this check numpy broadcasting could silently accept a length-1 input.
        raise ValueError(
            "y_meas and y_pred differ in length: %d != %d" % (measured.size, predicted.size)
        )
    absolute_error = np.abs(measured - predicted).sum()
    scale = np.abs(measured).sum()
    return float(absolute_error / scale)

def acc_rmse(y_meas, y_pred):
    """Return the root mean squared error between measured and predicted values."""
    mse = mean_squared_error(y_meas, y_pred)
    return mse ** 0.5

In [None]:
def acc_boosting_model(num,model,train,test,num_iteration=0,y_train=None,y_test=None):
    """Score a fitted boosting model and record the results in slot ``num``.

    Predicts on ``train`` and ``test``, prints R2, relative error (``acc_d``) and
    RMSE (``acc_rmse``) for both parts and writes them into the module-level
    ``acc_*`` lists. Every score is multiplied by 100 and rounded to 2 decimals.

    Parameters
    ----------
    num : int
        Position of this model in the ``acc_*`` lists.
    model : object
        Fitted model exposing ``predict``.
    train, test : array-like
        Features of the training and validation parts.
    num_iteration : int or None, optional
        When positive it is forwarded to ``model.predict`` as ``num_iteration``;
        0 or None calls ``predict`` without it.
    y_train, y_test : pandas.Series, optional
        True targets for ``train`` / ``test``. They default to the notebook
        globals ``targetb`` / ``target_testb``.
    """
    global acc_train_r2, acc_test_r2, acc_train_d, acc_test_d, acc_train_rmse, acc_test_rmse

    if y_train is None:
        y_train = targetb
    if y_test is None:
        y_test = target_testb

    def _store(results, value):
        # Pad skipped slots with NaN and overwrite slot ``num``. Unlike list.insert this
        # keeps every model at its own index when a slot is skipped or a cell is re-run.
        results.extend([np.nan] * (num + 1 - len(results)))
        results[num] = value

    # Forward num_iteration only when a positive value was requested (None is tolerated).
    predict_kwargs = {'num_iteration': num_iteration} if num_iteration and num_iteration > 0 else {}
    ytrain = model.predict(train, **predict_kwargs)
    ytest = model.predict(test, **predict_kwargs)

    print('target = ', y_train[:5].values)
    print('ytrain = ', ytrain[:5])

    acc_train_r2_num = round(r2_score(y_train, ytrain) * 100, 2)
    print('acc(r2_score) dla train =', acc_train_r2_num)
    _store(acc_train_r2, acc_train_r2_num)

    acc_train_d_num = round(acc_d(y_train, ytrain) * 100, 2)
    print('acc(relative error) dla train =', acc_train_d_num)
    _store(acc_train_d, acc_train_d_num)

    # NOTE(review): RMSE is in target units; the x100 factor mirrors the percentage
    # metrics and is kept so existing tables and plots stay comparable.
    acc_train_rmse_num = round(acc_rmse(y_train, ytrain) * 100, 2)
    print('acc(rmse) dla train =', acc_train_rmse_num)
    _store(acc_train_rmse, acc_train_rmse_num)

    print('target_test =', y_test[:5].values)
    print('ytest =', ytest[:5])

    acc_test_r2_num = round(r2_score(y_test, ytest) * 100, 2)
    print('acc(r2_score) dla test =', acc_test_r2_num)
    _store(acc_test_r2, acc_test_r2_num)

    acc_test_d_num = round(acc_d(y_test, ytest) * 100, 2)
    print('acc(relative error) dla test =', acc_test_d_num)
    _store(acc_test_d, acc_test_d_num)

    acc_test_rmse_num = round(acc_rmse(y_test, ytest) * 100, 2)
    print('acc(rmse) dla test =', acc_test_rmse_num)
    _store(acc_test_rmse, acc_test_rmse_num)

In [None]:
def acc_model(num,model,train,test,y_train=None,y_test=None):
    """Score a fitted regressor and record the results in slot ``num``.

    Predicts on ``train`` and ``test``, prints R2, relative error (``acc_d``) and
    RMSE (``acc_rmse``) for both parts and writes them into the module-level
    ``acc_*`` lists. Every score is multiplied by 100 and rounded to 2 decimals.

    Parameters
    ----------
    num : int
        Position of this model in the ``acc_*`` lists.
    model : object
        Fitted model exposing ``predict``.
    train, test : array-like
        Features of the training and validation parts.
    y_train, y_test : pandas.Series, optional
        True targets for ``train`` / ``test``. They default to the notebook
        globals ``target`` / ``target_test``.
    """
    global acc_train_r2, acc_test_r2, acc_train_d, acc_test_d, acc_train_rmse, acc_test_rmse

    if y_train is None:
        y_train = target
    if y_test is None:
        y_test = target_test

    def _store(results, value):
        # Pad skipped slots with NaN and overwrite slot ``num``. Unlike list.insert this
        # keeps every model at its own index when a slot is skipped or a cell is re-run.
        results.extend([np.nan] * (num + 1 - len(results)))
        results[num] = value

    ytrain = model.predict(train)
    ytest = model.predict(test)

    print('target = ', y_train[:5].values)
    print('ytrain = ', ytrain[:5])

    acc_train_r2_num = round(r2_score(y_train, ytrain) * 100, 2)
    print('acc(r2_score) dla train =', acc_train_r2_num)
    _store(acc_train_r2, acc_train_r2_num)

    acc_train_d_num = round(acc_d(y_train, ytrain) * 100, 2)
    print('acc(relative error) dla train =', acc_train_d_num)
    _store(acc_train_d, acc_train_d_num)

    # NOTE(review): RMSE is in target units; the x100 factor mirrors the percentage
    # metrics and is kept so existing tables and plots stay comparable.
    acc_train_rmse_num = round(acc_rmse(y_train, ytrain) * 100, 2)
    print('acc(rmse) dla train =', acc_train_rmse_num)
    _store(acc_train_rmse, acc_train_rmse_num)

    print('target_test =', y_test[:5].values)
    print('ytest =', ytest[:5])

    acc_test_r2_num = round(r2_score(y_test, ytest) * 100, 2)
    print('acc(r2_score) dla test =', acc_test_r2_num)
    _store(acc_test_r2, acc_test_r2_num)

    acc_test_d_num = round(acc_d(y_test, ytest) * 100, 2)
    print('acc(relative error) dla test =', acc_test_d_num)
    _store(acc_test_d, acc_test_d_num)

    acc_test_rmse_num = round(acc_rmse(y_test, ytest) * 100, 2)
    print('acc(rmse) dla test =', acc_test_rmse_num)
    _store(acc_test_rmse, acc_test_rmse_num)

## MODELE

In [None]:
# Baseline: ordinary least-squares regression on the scaled features (score slot 0).
linreg = LinearRegression()
linreg.fit(train, target)
acc_model(0,linreg,train,test)

In [None]:
# Lasso regression with alpha=0.1 (score slot 1).
lasso = Lasso(alpha=0.1)
lasso.fit(train, target)
acc_model(1,lasso,train,test)

In [None]:
# Support vector regression with default settings (score slot 2).
svr = SVR()
svr.fit(train, target)
acc_model(2,svr,train,test)

In [None]:
# Linear support vector regression with default settings (score slot 3).
linear_svr = LinearSVR()
linear_svr.fit(train, target)
acc_model(3,linear_svr,train,test)

In [None]:
# Grid search over the width of a single hidden layer (2..19 units); every other
# hyper-parameter is pinned to one value. Scores go to slot 4.
mlp = MLPRegressor()
param_grid = {
    'hidden_layer_sizes': list(range(2, 20)),
    'activation': ['relu'],
    'solver': ['adam'],
    'learning_rate': ['constant'],
    'learning_rate_init': [0.01],
    'power_t': [0.5],
    'alpha': [0.0001],
    'max_iter': [1000],
    'early_stopping': [True],
    'warm_start': [False],
}
mlp_GS = GridSearchCV(
    estimator=mlp,
    param_grid=param_grid,
    cv=10,
    verbose=True,
    pre_dispatch='2*n_jobs',
    n_jobs=-1,
)
mlp_GS.fit(train, target)
acc_model(4, mlp_GS, train, test)

In [None]:
# Linear model fitted with stochastic gradient descent, default settings (score slot 5).
# NOTE(review): no random_state is passed to the estimators in these cells, so scores may
# differ between runs.
sgd = SGDRegressor()
sgd.fit(train, target)
acc_model(5,sgd,train,test)

In [None]:
# Single decision tree with default settings (score slot 6).
decision_tree = DecisionTreeRegressor()
decision_tree.fit(train, target)
acc_model(6,decision_tree,train,test)

In [None]:
# Random forest with default settings (score slot 7).
random_forest = RandomForestRegressor()
random_forest.fit(train, target)
acc_model(7,random_forest,train,test)

In [None]:
# XGBoost regressor tuned with a 5-fold grid search (4 x 2 x 2 x 1 = 16 combinations),
# fitted on the unscaled split trainb/targetb. Scores go to slot 8.
xgb_clf = xgb.XGBRegressor(objective='reg:squarederror')
parameters = {'n_estimators': [60, 100, 120, 140], 
              'learning_rate': [0.01, 0.1],
              'max_depth': [5, 7],
              'reg_lambda': [0.5]}
xgb_reg = GridSearchCV(estimator=xgb_clf, param_grid=parameters, cv=5, n_jobs=-1).fit(trainb, targetb)
print("Best score: %0.3f" % xgb_reg.best_score_)
print("Best parameters set:", xgb_reg.best_params_)
# The fitted GridSearchCV object itself is scored on the unscaled train/validation parts.
acc_boosting_model(8,xgb_reg,trainb,testb)

In [None]:
# Split the unscaled training part once more (80/20) and wrap both halves as LightGBM datasets.
# NOTE(review): train_set / valid_set are only consumed by the commented-out lgb.train cell.
Xtrain, Xval, Ztrain, Zval = train_test_split(trainb, targetb, test_size=0.2, random_state=0)
train_set = lgb.Dataset(Xtrain, Ztrain)
valid_set = lgb.Dataset(Xval, Zval)

In [None]:
# params = {
#         'boosting_type':'gbdt',
#         'objective': 'regression',
#         'num_leaves': 31,
#         'learning_rate': 0.01,
#         'max_depth': -1,
#         'subsample': 0.8,
#         'bagging_fraction' : 1,
#         'max_bin' : 5000 ,
#         'bagging_freq': 20,
#         'colsample_bytree': 0.6,
#         'metric': 'rmse',
#         'min_split_gain': 0.5,
#         'min_child_weight': 1,
#         'min_child_samples': 10,
#         'scale_pos_weight':1,
#         'zero_as_missing': False,
#         'seed':0,        
#     }
# modelL = lgb.train(params, train_set = train_set, num_boost_round=10000, valid_sets=valid_set)

In [None]:
# acc_boosting_model(9,modelL,trainb,testb,modelL.best_iteration)

In [None]:
# Feature-importance plot for the LightGBM booster `modelL`.
# The cell that trains `modelL` is currently commented out, so the plot is guarded:
# without the guard a fresh "Run All" stops here with a NameError.
if 'modelL' in globals():
    fig = plt.figure(figsize = (5,5))
    axes = fig.add_subplot(111)
    lgb.plot_importance(modelL, ax = axes, height = 0.5)
    plt.show()
    plt.close()
else:
    print('LightGBM model (modelL) has not been trained - skipping the feature importance plot')

In [None]:
def hyperopt_gb_score(params):
    """Hyperopt objective for GradientBoostingRegressor.

    Builds a GradientBoostingRegressor from ``params``, evaluates it with 10-fold
    cross-validation on the notebook globals ``train`` / ``target`` and prints the
    mean score together with the parameters.

    Parameters
    ----------
    params : dict
        Keyword arguments for ``GradientBoostingRegressor``.

    Returns
    -------
    float
        The negated mean cross-validation score. ``fmin`` minimises the objective,
        so the score has to be negated for the search to favour better models.
    """
    clf = GradientBoostingRegressor(**params)
    current_score = cross_val_score(clf, train, target, cv=10).mean()
    print(current_score, params)
    # Higher score must map to a lower loss; returning the raw score would make
    # fmin converge on the worst configuration.
    return -current_score
 
# Search space: number of trees (100..999) and maximum tree depth (2..9).
space_gb = {
            'n_estimators': hp.choice('n_estimators', range(100, 1000)),
            'max_depth': hp.choice('max_depth', np.arange(2, 10, dtype=int))            
        }
 
# Run 10 TPE evaluations; fmin minimises the value returned by the objective.
best = fmin(fn=hyperopt_gb_score, space=space_gb, algo=tpe.suggest, max_evals=10)
print('best:')
print(best)

In [None]:
# Map the fmin result back onto the search space to get the actual hyper-parameter values
# (hp.choice selections are reported by fmin as indices).
params = space_eval(space_gb, best)
params

In [None]:
# Refit gradient boosting with the selected hyper-parameters and record its scores (slot 10).
gradient_boosting = GradientBoostingRegressor(**params)
gradient_boosting.fit(train, target)
acc_model(10,gradient_boosting,train,test)

In [None]:
# Ridge regression with built-in 5-fold cross-validation (score slot 11).
ridge = RidgeCV(cv=5)
ridge.fit(train, target)
acc_model(11,ridge,train,test)

In [None]:
# Bagging ensemble with default settings (score slot 12).
bagging = BaggingRegressor()
bagging.fit(train, target)
acc_model(12,bagging,train,test)

In [None]:
# Extra-trees ensemble with default settings (score slot 13).
etr = ExtraTreesRegressor()
etr.fit(train, target)
acc_model(13,etr,train,test)

In [None]:
# AdaBoost ensemble with default settings (score slot 14).
Ada_Boost = AdaBoostRegressor()
Ada_Boost.fit(train, target)
acc_model(14,Ada_Boost,train,test)

In [None]:
# Voting ensemble built from the three linear models defined in earlier cells (score slot 15).
Voting_Reg = VotingRegressor(estimators=[('lin', linreg), ('ridge', ridge), ('sgd', sgd)])
Voting_Reg.fit(train, target)
acc_model(15,Voting_Reg,train,test)

In [None]:
# NOTE(review): this is the only import outside the first cell of the notebook.
from sklearn.linear_model import ElasticNet
# Elastic net with alpha=0.1 and an equal L1/L2 mix (score slot 16).
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(train, target)
acc_model(16,elastic,train,test)

## POROWNANIE

In [None]:
# Comparison table: one row per model, in the slot order used by acc_model / acc_boosting_model.
model_names = ['Linear Regression', 'Lasso', 'Support Vector Machines', 'Linear SVR',
               'MLPRegressor', 'Stochastic Gradient Decent',
               'Decision Tree Regressor', 'Random Forest',  'XGB', 'LGBM',
               'GradientBoostingRegressor', 'RidgeRegressor', 'BaggingRegressor', 'ExtraTreesRegressor',
               'AdaBoostRegressor', 'VotingRegressor', 'ElasticNet']

metric_columns = {
    'r2_train': acc_train_r2,
    'r2_test': acc_test_r2,
    'd_train': acc_train_d,
    'd_test': acc_test_d,
    'rmse_train': acc_train_rmse,
    'rmse_test': acc_test_rmse,
}

# The LightGBM cell (slot 9) may be disabled. The score lists are then one entry short and
# pd.DataFrame would fail on unequal column lengths, so the LGBM row is left out.
if 'modelL' not in globals() and len(acc_test_r2) == len(model_names) - 1:
    model_names.remove('LGBM')

# Fail with a readable message when a model cell was skipped or run more than once.
metric_lengths = {name: len(values) for name, values in metric_columns.items()}
if set(metric_lengths.values()) != {len(model_names)}:
    raise ValueError(
        'Expected %d scores per metric, got %s - re-run the model cells in order'
        % (len(model_names), metric_lengths)
    )

models = pd.DataFrame({'Model': model_names, **metric_columns})

In [None]:
pd.options.display.float_format = '{:,.2f}'.format

In [None]:
print('Dokladnosc przewidywania R2 - r2_test')
models.sort_values(by=['r2_test', 'r2_train'], ascending=False)

In [None]:
print('Dokladnosc przewidywania relative error - d_test')
models.sort_values(by=['d_test', 'd_train'], ascending=True)

In [None]:
# Models ranked by validation RMSE (lowest first); ties are broken by the train RMSE.
# The heading previously contained a typo ("przewidowyania").
print('Dokladnosc przewidywania RMSE - rmse_test')
models.sort_values(by=['rmse_test', 'rmse_train'], ascending=True)

In [None]:
# R2 per model on the train and validation parts.
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
for column in ('r2_train', 'r2_test'):
    plt.plot(xx, models[column], label = column)
plt.legend()
plt.title('R2-dla modeli')
plt.xlabel('Modele')
plt.ylabel('R2, %')
plt.xticks(xx, rotation='vertical')
# Dedicated file name: every comparison plot used to be written to 'graph.png',
# so this figure was overwritten by the later ones.
plt.savefig('graph_r2.png')
plt.show()

In [None]:
# Relative error per model on the train and validation parts.
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
for column in ('d_train', 'd_test'):
    plt.plot(xx, models[column], label = column)
plt.legend()
plt.title('Relative error dla modeli')
plt.xlabel('Modele')
plt.ylabel('Relative error, %')
plt.xticks(xx, rotation='vertical')
# Dedicated file name: every comparison plot used to be written to 'graph.png',
# so this figure was overwritten by the RMSE plot.
plt.savefig('graph_relative_error.png')
plt.show()

In [None]:
# RMSE per model on the train and validation parts (values as recorded by the acc_* helpers).
plt.figure(figsize=[25,6])
xx = models['Model']
plt.tick_params(labelsize=14)
for column in ('rmse_train', 'rmse_test'):
    plt.plot(xx, models[column], label = column)
plt.legend()
plt.title('RMSE dla modeli')
plt.xlabel('Modele')
plt.ylabel('RMSE, %')
plt.xticks(xx, rotation='vertical')
plt.savefig('graph.png')
plt.show()