In [3]:
import pandas as pd
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

df = pd.read_csv('./content/results/cars_v1.csv')
df_cols = ['name', 'model', 'year', 'price', 'color', 'fuelType', 'carOrigin', 'carInsurance', 'gearType', 'mirrorType', 'motorPower', 'drivenKm', 'passengers', 'paymentMethod', 'saleType', 'secondHandStatus']

In [4]:
import matplotlib.pyplot as plt
# 1- Dataframe with dropping all rows with NaN values
naDF = df.dropna()

# removing outliers 
Q1 = naDF.quantile(0.25, numeric_only=True)
Q3 = naDF.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

naDF = naDF[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# 2- Dataframe with filling all NaN values with fillna (mode) method
fnDF = df.fillna(df.mode())

# removing outliers 
Q1 = fnDF.quantile(0.25, numeric_only=True)
Q3 = fnDF.quantile(0.75, numeric_only=True)
IQR = Q3 - Q1

fnDF = naDF[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]

# naDF[['price','drivenKm']].plot(kind='scatter', y='drivenKm', x='price')
# plt.show()

  naDF = naDF[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
  naDF = naDF[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
  fnDF = naDF[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]
  fnDF = naDF[~((df < (Q1 - 1.5 * IQR)) | (df > (Q3 + 1.5 * IQR))).any(axis=1)]


In [5]:
# using models: linear regression, tree regressor, random forest, KNN
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestClassifier


X = naDF[['year', 'motorPower', 'drivenKm', 'passengers', 'secondHandStatus']]
y = naDF['price']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=33143121)

## Linear Regression
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print(f"Linear Regression: {lr.score(X_test, y_test)}")

## Decision Tree Regressor
dt = DecisionTreeRegressor(random_state=0)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_test)

print(f"Decision Tree: {dt.score(X_test, y_test)}")

## KNN model
knn = KNeighborsRegressor(n_neighbors=5)
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

print(f"KNN: {knn.score(X_test, y_test)}")

## Random Forest Classifier
rf = RandomForestClassifier(n_estimators=20)
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)
print(F"Random Forest: {accuracy_score(y_test, y_pred)}")

Linear Regression: 0.6148694194952673
Decision Tree: 0.24418820013033327
KNN: 0.3023218706905466
Random Forest: 0.04638218923933209


In [6]:
# testing linear regression model
myCar = [[2000, 5000, 70000, 4, 1]]
pred = lr.predict(myCar)
print(pred)

[171562.24049736]




In [8]:
# saving the model and loading it
import pickle
pickle.dump(lr, open('./app/models/model_v1.pkl', 'wb'))

model = pickle.load(open('./app/models/model_v1.pkl', 'rb'))
print(model.predict([[2000, 5000, 70000, 4, 1]]))

[171562.24049736]


