In [523]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction import FeatureHasher

df = pd.read_csv('./content/results/cars_v1.csv')
df_cols = ['name', 'model', 'year', 'color', 'fuelType', 'carOrigin', 'carInsurance', 'gearType', 'mirrorType', 'motorPower', 'drivenKm', 'passengers', 'paymentMethod', 'saleType', 'secondHandStatus','price']
numeric_cols = ['year', 'motorPower', 'drivenKm', 'passengers', 'secondHandStatus','price']
cats_min = ['fuelType', 'carOrigin', 'carInsurance', 'gearType', 'mirrorType', 'paymentMethod', 'saleType']
cats_top = ['name', 'model', 'color']

In [524]:
## Feature Engineering & Selection ##
scaler = MinMaxScaler()
fh = FeatureHasher(n_features=6, input_type='string')
dumms_df = pd.get_dummies(df[cats_min], columns=cats_min)

# # including only numeric columns
# ndf = df[numeric_cols]
# # ndf = pd.DataFrame(scaler.fit_transform(ndf), columns=ndf.columns) # scaling numeric columns

# # encoding categorical data
num_df = df[numeric_cols]
# num_df = pd.DataFrame(scaler.fit_transform(num_df), columns=num_df.columns) # scaling numeric columns
hashed_df = fh.fit_transform(df[cats_top].astype(str).values)
hashed_df = pd.DataFrame(hashed_df.toarray(), columns=['name_1', 'name_2', 'name_3', 'name_4', 'name_5', 'name_6'])
ndf = pd.concat([num_df, dumms_df,hashed_df], axis=1)

In [525]:
# 1- Dataframe with dropping all rows with NaN values
naDF = ndf.dropna()

# removing outliers - naDF
Q1 = naDF.quantile(0.25, numeric_only=True)
Q3 = naDF.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

naDF = naDF[~((naDF < (Q1 - 1.5 * IQR)) | (naDF > (Q3 + 1.5 * IQR))).any(axis=1)]

# 2- Dataframe with filling all NaN values with fillna (mode) method
fnDF = ndf.copy()
for column in fnDF.columns:
  fnDF[column].fillna(fnDF[column].mode()[0], inplace=True)

# removing outliers - fnDF
Q1 = fnDF.quantile(0.25, numeric_only=True)
Q3 = fnDF.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

fnDF = fnDF[~((fnDF < (Q1 - 1.5 * IQR)) | (fnDF > (Q3 + 1.5 * IQR))).any(axis=1)]

In [526]:
# using models: linear regression, tree regressor, random forest, KNN
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor

trainDF = naDF.copy()
y = trainDF['price']
trainDF = trainDF.drop(columns=['price'])
X = trainDF

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33143121)

## Linear Regression
# lr = LinearRegression()
lr = Ridge(alpha=1.0)
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)
print(f"Linear Regression: {lr.score(X_test, y_test)}")

## Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=0, max_depth=5)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)
print(f"Decision Tree: {dt.score(X_test, y_test)}")

## KNN model
knn = KNeighborsRegressor(n_neighbors=10)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)
print(f"KNN: {knn.score(X_test, y_test)}")

# ## Random Forest Classifier - Only works for classification problems
# rf = RandomForestClassifier(n_estimators=100)
# rf.fit(X_train, y_train)
# y_pred = rf.predict(X_test)
# print(F"Random Forest: {accuracy_score(y_test, y_pred)}")

Linear Regression: 0.7137615790241527
Decision Tree: 0.6069517890190514
KNN: 0.18917770776432175


In [527]:
# saving the model and loading it
import pickle
pickle.dump(lr, open('./app/models/model_v2.pkl', 'wb'))

# model = pickle.load(open('./app/models/model_v1.pkl', 'rb'))
# print(model.predict([[2000, 5000, 70000, 4, 1]]))