# 📚Loading libraries📚

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_log_error
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the competition files: the training data, the test data, and the
# submission template that is filled with predictions at the end.
train = pd.read_csv('../input/machine-hack/train.csv')
test = pd.read_csv('../input/machine-hack/test.csv')
sample_submission = pd.read_csv('../input/machine-hack/submission.csv')

In [None]:
# Preview the first rows of the training data.
train.head()

In [None]:
# Report the (rows, columns) shape of each dataset.
print(f'Train shape: {train.shape}')
print(f'Test shape: {test.shape}')

In [None]:
# Summary statistics from describe(), transposed so each described column is a row.
train.describe().T

In [None]:
# Column-level overview of the training frame (dtypes and non-null counts).
train.info()

# EDA📊

In [None]:
# Two side-by-side views of the target 'Price':
# a probability plot on the left and a histogram on the right.
fig = plt.figure(figsize=(15, 8))

prob_ax = fig.add_subplot(1, 2, 1)
stats.probplot(train['Price'], plot=prob_ax)
prob_ax.set_title('Price prob distribution')

hist_ax = fig.add_subplot(1, 2, 2)
hist_ax.hist(train['Price'])

plt.show()

There is one outlier in the data

In [None]:
# Correlation heatmap of the raw training data.
# corr() is given numeric columns only: `train` still holds string columns at
# this point, and pandas >= 2.0 raises on them instead of silently skipping them.
f, ax = plt.subplots(figsize=(8, 8))
numeric_corr = train.select_dtypes(np.number).corr()
sns.heatmap(numeric_corr, annot=True, linewidths=0.5, fmt='0.3f', ax=ax)
plt.show()

# Pre-processing ⚙🛠

In [None]:
# Keep only rows with 10 < Price < 600000.
# reset_index() is called without drop=True on purpose here: the old row labels
# are kept as the 'index' column (first call) and the 'level_0' column (second
# call). Both are removed by name when the feature matrix X is built.
below_cap = train['Price'] < 600000
train = train.loc[below_cap].reset_index()

above_floor = train['Price'] > 10
train = train.loc[above_floor].reset_index()

In [None]:
def prepare_df(df, levy_median = 781):
    """Clean the raw car-listing columns and return a model-ready frame.

    Steps performed:
      * add a 0/1 ``Turbo`` flag for rows whose ``Engine volume`` text
        contains ' Turbo';
      * turn ``Leather interior`` into 1 for 'Yes' and 0 otherwise;
      * convert ``Levy`` to float, treating '-' as missing and filling the
        gaps with ``levy_median``;
      * strip 'km' from ``Mileage`` and ' Turbo' from ``Engine volume`` and
        convert both to float;
      * drop ``Manufacturer``, ``Model`` and ``Doors``.

    Args:
        df: DataFrame holding the columns named above.
        levy_median: Value used to fill missing ``Levy`` entries.

    Returns:
        A new DataFrame without the three dropped columns.

    Note:
        The column conversions are assigned on ``df`` itself, so the caller's
        frame is modified too; only the final drop produces a new object.
    """
    # Raw engine strings, read once before the column is overwritten below.
    engine_text = df['Engine volume']

    df['Turbo'] = engine_text.str.contains(' Turbo').astype(int)
    df['Leather interior'] = df['Leather interior'].eq('Yes').astype(int)

    levy_numeric = df['Levy'].replace({'-': np.nan}).astype(float)
    df['Levy'] = levy_numeric.fillna(levy_median)

    df['Mileage'] = df['Mileage'].str.replace('km', '').astype(float)
    df['Engine volume'] = engine_text.str.replace(' Turbo', '').astype(float)

    return df.drop(columns=['Manufacturer', 'Model', 'Doors'])

In [None]:
# Apply the same cleaning to both splits; both calls use the default levy_median.
# Note: prepare_df also writes its converted columns back into `train` and `test`,
# so this cell cannot be re-run without reloading the data first.
train_prepared = prepare_df(train)
test_prepared = prepare_df(test)

In [None]:
# Correlation heatmap after cleaning.
# Reads the prepared frame explicitly instead of relying on prepare_df having
# modified `train` in place, and restricts corr() to numeric columns because
# pandas >= 2.0 raises on string columns. 'level_0' and 'index' are
# reset_index() leftovers rather than features, so they are left out of the plot.
f, ax = plt.subplots(figsize=(8, 8))
prepared_corr = (
    train_prepared
    .select_dtypes(np.number)
    .drop(columns=['level_0', 'index'], errors='ignore')
    .corr()
)
sns.heatmap(prepared_corr, annot=True, linewidths=0.5, fmt='0.3f', ax=ax)
plt.show()

📌 Engine volume and Cylinders are highly correlated, which is expected.

📌 Levy is also correlated with Engine volume and Cylinders.

📌 No feature is directly correlated with the target variable 'Price'.


# Feature engineering

In [None]:
# Split the prepared training frame by dtype: numeric columns vs. object columns.
train_prepared_numeric = train_prepared.select_dtypes(include=np.number)
train_prepared_categorical = train_prepared.select_dtypes(include=object)

In [None]:
# Learn the category codes from the training data, then encode that same data.
encoder = OrdinalEncoder()
encoder.fit(train_prepared_categorical)
train_cat_encoded = encoder.transform(train_prepared_categorical)

In [None]:
# Wrap the encoded array in a DataFrame.
# The source frame's row index is passed explicitly so that the column-wise
# concat with the numeric frame aligns row-for-row, instead of depending on
# both frames happening to carry a default 0..n-1 index.
cat_df = pd.DataFrame(
    train_cat_encoded,
    columns=train_prepared_categorical.columns,
    index=train_prepared_categorical.index,
)
cat_df.head()

In [None]:
# Put the numeric and the encoded categorical columns side by side.
concat = pd.concat(
    [train_prepared_numeric, cat_df],
    axis='columns',
)

In [None]:
# Features: everything except the target and the two reset_index() leftovers.
X = concat.drop(columns=['Price', 'level_0', 'index']).copy()

# Target as a NumPy array.
y = concat['Price'].to_numpy()

In [None]:
# Hold out 30% of the rows for validation; random_state keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.3, random_state=3)

# Model

In [None]:
# Random forest regressor with library-default hyperparameters and a fixed seed.
model = RandomForestRegressor(random_state=3)

In [None]:
# Fit on the training split and report RMSLE on the training and validation splits.
# mean_squared_log_error is documented as (y_true, y_pred). The squared log
# difference is symmetric, so the printed numbers do not change, but the
# arguments are now passed in the documented order.
model.fit(X_train, y_train)

train_rmsle = np.sqrt(mean_squared_log_error(y_train, model.predict(X_train)))
val_rmsle = np.sqrt(mean_squared_log_error(y_val, model.predict(X_val)))

print(f'Train RMSLE: {train_rmsle}')
print(f'Val RMSLE: {val_rmsle}')

In [None]:
# Split the prepared test frame by dtype, mirroring the split done for training.
test_prep_num = test_prepared.select_dtypes(include=np.number)
test_cat = test_prepared.select_dtypes(include=object)

In [None]:
# Encode the test categoricals with the encoder that was fitted on the training data.
# NOTE(review): the encoder was created with default options, so a category that
# never appeared in train presumably raises here — confirm for the sklearn version in use.
test_enc_cat = encoder.transform(test_cat)

In [None]:
# Wrap the encoded test array in a DataFrame.
# The source frame's row index is passed explicitly so that the column-wise
# concat with the numeric test frame aligns row-for-row, instead of depending
# on both frames happening to carry a default 0..n-1 index.
test_cat_df = pd.DataFrame(
    test_enc_cat,
    columns=test_cat.columns,
    index=test_cat.index,
)
test_cat_df.head()

In [None]:
# Put the numeric and the encoded categorical test columns side by side.
concat_test = pd.concat(
    [test_prep_num, test_cat_df],
    axis='columns',
)

In [None]:
# Build the test feature matrix by selecting exactly the training feature
# columns, in training order. 'Price' is not among them, so it is left out as
# before, and the model is guaranteed to see the columns in the order it was
# fitted on rather than in whatever order the test file provides.
X_test = concat_test[X.columns].copy()

In [None]:
# Fill the submission template with the model's predictions for the test set.
sample_submission['Price'] = model.predict(X_test)

In [None]:
# Quick look at the filled-in submission before it is written out.
sample_submission.head()

In [None]:
# Write the submission file without the row index.
# `index` is a boolean option; False states the intent explicitly instead of
# relying on None being treated as falsy.
sample_submission.to_csv('submission.csv', index=False)