In [0]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

In [0]:
def load_data(train_path='train.csv', test_path='test.csv'):
  """Read the training and test CSV files into DataFrames.

  Args:
    train_path: Path of the training CSV; defaults to 'train.csv'.
    test_path: Path of the test CSV; defaults to 'test.csv'.

  Returns:
    Tuple ``(train_df, test_df)`` of pandas DataFrames.
  """
  train_df = pd.read_csv(train_path)
  test_df = pd.read_csv(test_path)
  return train_df, test_df

# 1.Load Data

In [0]:
# Load the train and test CSVs into two DataFrames.
train_df, test_df = load_data()

In [0]:
# Stack the train and test rows into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# row labels of train_df and test_df are kept and can repeat.
carkaidee = pd.concat([train_df, test_df], axis=0, ignore_index=True)

In [0]:
# (rows, columns) of the combined frame.
carkaidee.shape

## Data wrangling on the combined train and test data

In [0]:
# Number of missing values in each column.
carkaidee.isnull().sum()

In [0]:
# Column dtypes and non-null counts.
carkaidee.info()

## Convert non-numeric mileage values to NaN

In [0]:
# Parse mileage as numbers; anything that cannot be parsed becomes NaN.
mileage_numeric = pd.to_numeric(carkaidee['mileage'], errors='coerce')
carkaidee['mileage'] = mileage_numeric
carkaidee.info()

In [0]:
# Summary statistics from describe(), rounded to 3 decimal places.
carkaidee.describe().round(3)

## Check for outliers in price and mileage

In [0]:
# Box plots of the frame's numeric columns, to eyeball outliers.
carkaidee.boxplot()

In [0]:
# Drop the 'desc', 'ad_id' and 'timestamp' columns.
unused_columns = ['desc', 'ad_id', 'timestamp']
carkaidee = carkaidee.drop(columns=unused_columns)

In [0]:
# Box plots restricted to price, mileage and year.
carkaidee.boxplot(column=['price', 'mileage', 'year'])

In [0]:
# Keep only rows whose price lies strictly between the 0.1st and 96th
# percentiles. Rows with a missing price fail both comparisons and are
# dropped as well.
price_low = carkaidee['price'].quantile(0.001)
price_high = carkaidee['price'].quantile(0.96)
# .copy() makes the filtered frame independent of the one it was sliced
# from, so later column assignments do not trigger SettingWithCopyWarning.
carkaidee = carkaidee[(carkaidee['price'] > price_low) & (carkaidee['price'] < price_high)].copy()

In [0]:
# Box plots of the numeric columns again, after the price filter.
carkaidee.boxplot()

### Set mileage values over 300,000 km to missing

In [0]:
# Treat mileage above 300,000 km as invalid and mark it missing.
# Series.mask returns a new column with NaN wherever the condition holds;
# assigning it back avoids chained indexing (frame.col[mask] = value), which
# can write to a temporary copy and leave the frame unchanged.
carkaidee['mileage'] = carkaidee['mileage'].mask(carkaidee['mileage'] > 300000)

In [0]:
# Box plots once more, after the mileage clean-up step.
carkaidee.boxplot()

In [0]:
# Violin plot of the year column.
sns.violinplot(x= 'year', orient = 'v', data= carkaidee )

In [0]:
# Treat model years before 1990 as invalid and mark them missing.
# Series.mask returns a new column with NaN wherever the condition holds;
# assigning it back avoids chained indexing (frame.col[mask] = value), which
# can write to a temporary copy and leave the frame unchanged.
carkaidee['year'] = carkaidee['year'].mask(carkaidee['year'] < 1990)

## fill missing data in carkaidee

In [0]:
def clean_data(df):
  """Fill missing values in every column with that column's mode.

  The frame is modified in place and also returned; callers that want to
  keep the original should pass a copy.

  Args:
    df: DataFrame whose missing values should be imputed.

  Returns:
    The same DataFrame object, with missing values replaced by the
    per-column mode. Columns that contain no non-null value have no mode
    and are left unchanged.
  """
  for col in df.columns:
    modes = df[col].mode()
    # mode() ignores NaN by default, so an all-NaN column yields an empty
    # Series; indexing its first element would raise.
    if modes.empty:
      continue
    df[col] = df[col].fillna(modes.iloc[0])
  return df

In [0]:
# Impute on a copy so carkaidee itself keeps its missing values.
clean = clean_data(carkaidee.copy())

# Convert columns to categorical dtype

In [0]:
# Store car_type, transmission and fuel as pandas categoricals.
for column in ('car_type', 'transmission', 'fuel'):
  clean[column] = clean[column].astype('category')

In [0]:
# Box plot of mileage after imputation.
clean.boxplot('mileage')

In [0]:
# Show 5 randomly sampled rows of the cleaned frame.
display(clean.sample(5))

# Add car_age feature

In [0]:
# Age of the car in years, measured from a fixed reference year.
# The year is a named constant so it can be updated in one place.
REFERENCE_YEAR = 2020
clean['car_age'] = REFERENCE_YEAR - clean['year']

In [0]:
# Box plot of car_age.
clean.boxplot('car_age')

In [0]:
# Violin plot of car_age.
sns.violinplot(x= 'car_age', orient = 'v', data= clean )

In [0]:
# Violin plot of mileage.
sns.violinplot(x= 'mileage', orient = 'v', data= clean )

In [0]:
# List rows with car_age above 50, oldest first.
older_than_50 = clean[clean['car_age'] > 50]
older_than_50.sort_values('car_age', ascending=False)

In [0]:
# Column dtypes and non-null counts after cleaning.
clean.info()

# 3. Extract features

In [0]:
# First rows of the cleaned frame.
clean.head()

In [0]:
# One-hot encode the listed columns and append the indicator columns.
categorical_cols = ['location', 'brand', 'model', 'color', 'fuel', 'transmission', 'car_type']
dummy = pd.get_dummies(clean[categorical_cols])
clean1 = pd.concat([clean.copy(), dummy], axis=1)
# Keep only numeric and boolean columns, which drops the original
# text/category columns. select_dtypes is public API and replaces the private
# DataFrame._get_numeric_data(); 'bool' is listed explicitly because
# get_dummies may produce boolean indicators, which 'number' alone excludes.
clean1 = clean1.select_dtypes(include=['number', 'bool'])

In [0]:
# First rows of the encoded, numeric-only frame.
clean1.head()

In [0]:
# Column names of the encoded frame.
clean1.columns

In [0]:
# Inspect the indicator column for location "กรุงเทพมหานคร" (Bangkok).
clean1.location_กรุงเทพมหานคร

In [0]:
def split_data_price(df):
  """Separate the 'price' target from the remaining feature columns.

  Args:
    df: DataFrame that contains a 'price' column.

  Returns:
    Tuple ``(X, y)``: ``X`` is ``df`` without the 'price' column and ``y``
    is the 'price' column. ``df`` itself is not modified.
  """
  target = df['price']
  features = df.drop(columns='price')
  return features, target

In [0]:
# Split a copy of clean1 into the feature frame X and the price target y.
X, y = split_data_price(clean1.copy())

In [0]:
# Shape of the feature frame, then its first rows.
print(X.shape)
X.head()

In [0]:
# Shape of the target, then its first values. This cell inspects y, so it
# reports y.shape rather than repeating X.shape.
print(y.shape)
y.head()

## create train and test set

In [0]:
# Hold out 20% of the rows for testing; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size= 0.2, random_state = 42)

# 4.Train model (Regression)

In [0]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [0]:
# Fix the seed so the fitted forest, and every score computed from it, is
# reproducible across runs (same value as the train/test split seed).
forest = RandomForestRegressor(random_state=42)
forest.fit(X_train, y_train)

# Evaluation (Regression) with mean squared error (MSE)

In [0]:
# Predict the target for the held-out rows.
y_pred = forest.predict(X_test)

In [0]:
# score() evaluated on the held-out set, reported here as R2.
print('R2 :', forest.score(X_test, y_test))

In [0]:
# Mean squared error between the held-out targets and the predictions.
forest_mse = mean_squared_error(y_test, y_pred)
print('Random forest model MSE  :', forest_mse)

## Root mean squared error (RMSE)

In [0]:
import math
# RMSE is the square root of the MSE, so it is in the same units as the
# target. The printed label previously read 'RSME'.
print('RMSE :', math.sqrt(forest_mse))