In [None]:
# Import the necessary libraries
import json
import os
import re
import statistics

import matplotlib.pyplot as plt
import mysql.connector
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder,MinMaxScaler

In [None]:
# Open the MySQL connection used to load the car listings.
# Settings are read from the environment (MYSQL_USER, MYSQL_PASSWORD, MYSQL_HOST,
# MYSQL_DATABASE) so real credentials do not have to live in the notebook; the
# previous literal values are kept as fallbacks for local development.
conn = mysql.connector.connect(
    user=os.environ.get('MYSQL_USER', 'root'),
    password=os.environ.get('MYSQL_PASSWORD', 'admin'),
    host=os.environ.get('MYSQL_HOST', 'localhost'),
    database=os.environ.get('MYSQL_DATABASE', 'main_database'),
)

In [None]:
# Load the whole `cars` table into a DataFrame.
sql_query = "SELECT * FROM cars"
try:
    df_original = pd.read_sql(sql_query, con=conn)
finally:
    # Close the connection even if the query fails, so it is never leaked.
    conn.close()

In [None]:
# Work on an independent copy so the frame loaded from the database stays untouched.
df = df_original.copy(deep=True)

In [None]:
# Display the listings ordered from the lowest to the highest price.
df.sort_values(by='price')

In [None]:
# Cleaning the data: discard rows whose location is exactly "/" and rows whose
# price lies outside the accepted [50000000, 12000000000] range.
slash_only_location = df['location'].str.contains(re.compile(r'^/$', flags=re.IGNORECASE), na=False)
price_too_low = df['price'] < 50000000
price_too_high = df['price'] > 12000000000
rows_to_discard = slash_only_location | price_too_low | price_too_high
df.drop(df[rows_to_discard].index, inplace=True)

In [None]:
# Count the listings for every (brand, model) pair.
freq = df.groupby(["brand", "model"]).size().reset_index(name="count")
# The five pairs with the most listings
top_five = freq.sort_values('count', ascending=False).head(5)
# Keep only the identifying columns of those five rows
top_five_combo = top_five[['brand', 'model']].copy()
# Show the result
top_five_combo

In [None]:
# Restrict the listings to the five most common (brand, model) pairs and add a
# single label column (brand and model concatenated) for colouring the plots.
merged_df = (
    top_five_combo
    .merge(df, how='inner', on=['brand', 'model'])
    .assign(brand_model=lambda frame: frame['brand'] + frame['model'])
)
merged_df

In [None]:
# Price against mileage for the five most common pairs, one hue per pair.
sns.scatterplot(
    x='mileage',
    y='price',
    hue='brand_model',
    data=merged_df,
)

In [None]:
# Same scatter drawn with plain matplotlib: each brand/model label becomes a
# categorical code, and the code selects the marker colour.
combo_order = merged_df['brand_model'].unique()
merged_df['brand_model'] = pd.Categorical(merged_df['brand_model'], categories=combo_order)
combo_codes = merged_df['brand_model'].cat.codes

fig, ax = plt.subplots()
scatter = ax.scatter(merged_df['mileage'], merged_df['price'], c=combo_codes)
ax.set(title='Car Prices vs. Mileage', xlabel='mileage', ylabel='price')

# One legend handle per colour code, labelled with the matching brand/model text
handles, labels = scatter.legend_elements()
ax.legend(handles, (merged_df['brand_model'].unique()), loc='upper right')
plt.show()

In [None]:
# Add 621 to every production year below 1410.
# NOTE(review): presumably these are Solar Hijri years being moved onto the
# Gregorian calendar — confirm against the data source.
needs_shift = df['prod_year'] < 1410
df.loc[needs_shift, 'prod_year'] = df.loc[needs_shift, 'prod_year'] + 621

In [None]:
# Scale the numeric columns and build the combined model key.
columns_to_normalize = ['mileage', 'prod_year', 'price']
feature_cols = columns_to_normalize[:2]
target_col = columns_to_normalize[2]

# z-score standardisation for mileage and production year
scaler_main = StandardScaler()
df[feature_cols] = scaler_main.fit_transform(df[feature_cols])

# Price: min-max scale into [0, 1] first, then take the natural log.
# Note: the minimum price scales to exactly 0, whose log is -inf.
scaler_price = MinMaxScaler()
df[target_col] = scaler_price.fit_transform(df[[target_col]])
df['price'] = np.log(df['price'])

# Single categorical key: brand, model and trim joined with underscores
df['merged'] = df['brand'] + '_' + df['model'] + '_' + df['trim']

In [None]:
# The min-max scaled price of the cheapest listing(s) is 0, so np.log turned it
# into -inf. Remove those rows by value rather than "the single lowest row":
#  * every listing tied at the minimum price is removed, not just one of them;
#  * re-running the cell is harmless, because a second run finds no -inf rows
#    and no longer deletes the next-cheapest (valid) listing.
log_of_zero = df['price'] == -np.inf
df.drop(df[log_of_zero].index, inplace=True)
df.sort_values('price',ascending=False)

In [None]:
# Mean (target) encoding of the categorical columns.
# NOTE(review): the means are computed on the full frame before the train/test
# split, so the encoded features are derived from test-set prices as well.

# Model key: each row receives the mean price of its brand_model_trim group.
# transform('mean') is vectorised and creates a float column directly, instead of
# writing floats group-by-group into a column initialised with the integer 0
# (an upcast that recent pandas versions warn about).
df['mean_encoded_model'] = df.groupby('merged')['price'].transform('mean')

# Location: mean price per location, measured on one reference model only so
# that the location effect is not mixed with differences between car models.
merged_models='پژو_ 206_تیپ 2'
loc_df = df.loc[df['merged'] == merged_models].copy()
mean_encoded = loc_df.groupby('location')['price'].mean()
mean_encoded_dict = mean_encoded.to_dict()
# Fallback for locations where the reference model has no listing:
# the unweighted mean of the per-location means.
default_mean = statistics.mean(mean_encoded_dict.values())
df['mean_encoded_location'] = df['location'].map(mean_encoded).fillna(default_mean)

In [None]:
# Separate the model inputs (X) from the target (y); y stays a one-column frame.
feature_columns = ['mean_encoded_model', 'prod_year', 'mileage', 'mean_encoded_location']
X = df[feature_columns]
y = df[['price']]

In [None]:
# Give every distinct brand_model_trim key an integer id, numbered in order of
# first appearance, and store that id on each row.
unique_values = list(df['merged'].unique())
value_dict = dict(zip(unique_values, range(len(unique_values))))
df['merged_num'] = df['merged'].map(value_dict)

In [None]:
# value_dict_1: merged key -> integer id
# value_dict_2: integer id -> array of the distinct encoded values seen for that id
value_dict_1 = value_dict
encoded_by_id = df.groupby('merged_num')['mean_encoded_model'].unique()
value_dict_2 = encoded_by_id.to_dict()

In [None]:
# Hold out 30% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=27,
)

In [None]:
# Carve a further 10% out of the training portion as an extra hold-out set
# (X_acc / y_acc). The 70/30 split is rebuilt from X and y with the same seed
# instead of splitting X_train in place, so re-running this cell always yields
# the same sets rather than shrinking X_train a little more on every execution.
X_fit, X_test, y_fit, y_test = train_test_split(X, y, test_size=0.3, random_state=27)
X_train, X_acc, y_train, y_acc = train_test_split(X_fit, y_fit, test_size=0.1, random_state=27)

In [None]:
# Baseline: ordinary least squares, scored on the held-out test set.
lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

# MSE once; RMSE is its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the metrics
for label, value in (
    ('Mean squared error (MSE):', mse),
    ('Root mean squared error (RMSE):', rmse),
    ('R-squared (R²) score:', r2),
):
    print(label, value)

In [None]:
from sklearn import svm
# Support vector regression with default hyper-parameters.
regr = svm.SVR()
# y_train is a one-column frame; flatten it to the 1-D target the estimator
# expects so scikit-learn does not emit a DataConversionWarning.
regr.fit(X_train, np.ravel(y_train))
y_pred = regr.predict(X_test)
# Score on the test set (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Print the metrics
print('Mean squared error (MSE):', mse)
print('Root mean squared error (RMSE):', rmse)
print('R-squared (R²) score:', r2)

In [None]:
from sklearn.linear_model import Ridge
# Ridge regression (L2 penalty, alpha=1.0), scored on the held-out test set.
clf = Ridge(alpha=1.0)
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# MSE once; RMSE is its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the metrics
for label, value in (
    ('Mean squared error (MSE):', mse),
    ('Root mean squared error (RMSE):', rmse),
    ('R-squared (R²) score:', r2),
):
    print(label, value)

In [None]:
from sklearn.linear_model import SGDRegressor
# Linear model fitted by stochastic gradient descent.
# random_state pins the sample shuffling so the reported metrics are repeatable.
reg = SGDRegressor(max_iter=1000, tol=1e-3, random_state=0)
# Flatten the one-column target frame to 1-D to avoid a DataConversionWarning.
reg.fit(X_train, np.ravel(y_train))
y_pred = reg.predict(X_test)
# Score on the test set (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Print the metrics
print('Mean squared error (MSE):', mse)
print('Root mean squared error (RMSE):', rmse)
print('R-squared (R²) score:', r2)

In [None]:
from sklearn.linear_model import HuberRegressor
# Outlier-robust linear regression (Huber loss) with default settings.
huber = HuberRegressor()
# Flatten the one-column target frame to 1-D to avoid a DataConversionWarning.
huber.fit(X_train, np.ravel(y_train))
y_pred = huber.predict(X_test)
# Score on the test set (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Print the metrics
print('Mean squared error (MSE):', mse)
print('Root mean squared error (RMSE):', rmse)
print('R-squared (R²) score:', r2)

In [None]:
from sklearn.ensemble import RandomForestRegressor
# Random forest with a fixed seed; this estimator is exported later in the notebook.
rndm = RandomForestRegressor(random_state=0)
# The forest warns when given a column-vector target, so pass it flattened to 1-D.
rndm.fit(X_train, np.ravel(y_train))
y_pred = rndm.predict(X_test)
# Score on the test set (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Print the metrics
print('Mean squared error (MSE):', mse)
print('Root mean squared error (RMSE):', rmse)
print('R-squared (R²) score:', r2)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Single decision tree with a fixed seed, scored on the held-out test set.
dsc = DecisionTreeRegressor(random_state=0)
dsc.fit(X_train, y_train)
y_pred = dsc.predict(X_test)

# MSE once; RMSE is its square root
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print the metrics
for label, value in (
    ('Mean squared error (MSE):', mse),
    ('Root mean squared error (RMSE):', rmse),
    ('R-squared (R²) score:', r2),
):
    print(label, value)

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Gradient-boosted trees. The seed makes the fitted (and later exported) model
# reproducible, matching the seeded random forest and decision tree.
grb = GradientBoostingRegressor(random_state=0)
# Flatten the one-column target frame to 1-D to avoid a DataConversionWarning.
grb.fit(X_train, np.ravel(y_train))
y_pred = grb.predict(X_test)
# Score on the test set (RMSE is the square root of the MSE)
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
# Print the metrics
print('Mean squared error (MSE):', mse)
print('Root mean squared error (RMSE):', rmse)
print('R-squared (R²) score:', r2)

<h1>Export the models, scalers, and dictionaries</h1>


In [None]:
# Lookup tables written next to the models:
#   value_dict_1  : merged key -> integer id
#   value_dict_2  : integer id -> mean-encoded model value
#   location_dict : location   -> mean-encoded location value
value_dict_1 = value_dict
value_dict_2 = df.groupby('merged_num')['mean_encoded_model'].unique().to_dict()
location_dict = mean_encoded_dict

# Persist the fitted scalers
dump(scaler_main, 'scaler_main.joblib')
dump(scaler_price, 'scaler_price.joblib')

# Each id maps to an array of its distinct encoded values; keep the first one
# as a plain float so the dictionary is JSON serialisable.
value_dict_2 = {key: float(values[0]) for key, values in value_dict_2.items()}

# Write the three dictionaries as JSON
for json_path, payload in (
    ('value_dict_1.json', value_dict_1),
    ('value_dict_2.json', value_dict_2),
    ('location_dict.json', location_dict),
):
    with open(json_path, 'w') as f:
        json.dump(payload, f)

# Persist the tree-based models
dump(rndm, 'random_forrest_model.joblib')
dump(grb, 'gradient_boosting_model.joblib')
dump(dsc, 'desicion_tree_model.joblib')

In [None]:
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping
# Seed TensorFlow's global generator so weight initialisation and batch
# shuffling are repeatable from run to run (as far as the backend allows).
tf.random.set_seed(27)
# Take the input width from the training data rather than hard-coding it, so
# the network keeps matching X if the feature list changes.
input_dim = X_train.shape[1]
# Define the architecture of the neural network
model = tf.keras.Sequential([
    tf.keras.layers.Dense(32, activation='relu', input_shape=(input_dim,)),
    tf.keras.layers.Dense(16, activation='relu'),
    tf.keras.layers.Dense(1)  # Output layer with 1 unit for regression
])
# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')
# Stop once the training loss has not improved for 3 consecutive epochs
early_stopping = EarlyStopping(monitor='loss', patience=3)
# Train the model
model.fit(X_train, y_train, epochs=200, batch_size=32, callbacks=[early_stopping])
# Evaluate the model on the test set (this is a test loss, not a validation loss)
loss = model.evaluate(X_test, y_test)
print("Test Loss (MSE):", loss)
# Use the trained model to make predictions on the extra hold-out set
predictions = model.predict(X_acc)
# Calculate R2 score
r2 = r2_score(y_acc, predictions)
print("R2 Score:", r2)
# Calculate RMSE
rmse = np.sqrt(mean_squared_error(y_acc, predictions))
print("RMSE:", rmse)