In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats

import warnings
# Hide FutureWarning messages from here on; every other warning category is still shown.
warnings.filterwarnings("ignore", category=FutureWarning)

from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score

from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import PolynomialFeatures

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [None]:
# Load the dataset; the relative path means the CSV must sit in the working directory.
df = pd.read_csv("train-data.csv")

In [None]:
# Preview the first rows to check that the file was parsed as expected.
df.head()

In [None]:
# (rows, columns) of the freshly loaded data.
df.shape

In [None]:
# Per-column dtype and non-null count overview.
df.info()

In [None]:
# Summary statistics restricted to the numeric columns.
df.describe(include='number')

In [None]:
# Summary statistics restricted to the object-dtype (string) columns.
df.describe(include='object')

In [None]:
# Number of missing values in each column.
df.isna().sum()

In [None]:
# Remove the two columns that are not used in the rest of the notebook.
# Rebinding `df` (instead of inplace=True) together with errors='ignore' keeps the
# cell idempotent: re-running it after the columns are gone no longer raises KeyError.
df = df.drop(columns=['Unnamed: 0', 'New_Price'], errors='ignore')

In [None]:
# Number of rows that duplicate an earlier row.
df.duplicated().sum()
# To list every row involved in a duplication instead: df[df.duplicated(keep=False)]

In [None]:
# Each value is a string; keep only its first whitespace-separated token and
# cast that token to float (whatever follows the number is discarded).
# NOTE(review): the discarded suffix looks like a unit -- confirm all rows share one unit.
df['Mileage'] = df['Mileage'].str.split().str[0].astype(float)
df['Engine'] = df['Engine'].str.split().str[0].astype(float)

In [None]:
# Count the 'null bhp' placeholders BEFORE replacing them; counting afterwards
# would always report 0 because the placeholder no longer exists.
print('The number of data appearing as "null bhp":', (df['Power'] == 'null bhp').sum())

# Replace the placeholder with an explicit NaN. Passing None as the replacement is
# version-dependent in pandas (older releases treat it as "no value given" and
# forward-fill from the previous row), so np.nan is used to state the intent unambiguously.
df['Power'] = df['Power'].replace('null bhp', np.nan)

# Keep the leading numeric token of each string and cast it to float.
df['Power'] = df['Power'].str.split(expand=True)[0].astype(float)

In [None]:
# Preview the first 5 rows again to verify the parsed numeric columns.
df.head()

In [None]:
# Missing values per column after the string-to-number conversions.
df.isna().sum()

In [None]:
# Fill the gaps in the four numeric columns with that column's mean.
# `df[col].fillna(..., inplace=True)` is chained assignment: under pandas
# copy-on-write it modifies a temporary object and leaves `df` untouched, so the
# result is assigned back explicitly. DataFrame.fillna with a Series matches the
# Series index against column names, so each column receives its own mean.
# NOTE(review): a mean-imputed 'Seats' value is generally not a whole number -- confirm this is intended.
df = df.fillna(df[['Mileage', 'Engine', 'Power', 'Seats']].mean())

In [None]:
# Re-check the missing-value counts after the imputation step.
df.isna().sum()

In [None]:
# (rows, columns) at this stage of the cleaning.
df.shape

<div style="border: 2px solid #000000; border-radius: 5px; padding: 10px; font-family: 'Times New Roman', Times, serif; font-size: 18px;">
    <div style="text-align: left;">
        <p style="font-size: 18px;">Removing an observation simply because it is an outlier is not an acceptable approach. These may be legitimate observations, and it is important to investigate why the outlier is the way it is before deciding whether to drop it. We are only allowed to remove outliers in the following two cases:</p>
        <ol style="font-size: 18px;">
            <li><em>If the outlier is caused by incorrectly entered or measured data</em></li>
            <li><em>If the outlier creates a significant relationship that would not exist without it (i.e. the result is driven by that single observation)</em></li>
        </ol>
        <p style="font-size: 18px;">After examining the numeric columns that contain outliers, only the 'Kilometers_Driven' column warrants outlier treatment.
        </p>
        <p style="font-size: 18px;"><em>IQR</em> or <em>Z-Score</em> can be used to identify outliers.</p>
    </div>
</div>

In [None]:
# Standardise the odometer readings (z-score per row).
z_scores = stats.zscore(df["Kilometers_Driven"])

# Rows whose absolute z-score exceeds the cut-off are flagged as outlier candidates.
threshold = 2.5  # 2.5 or 3 are the usual choices
outliers = df.loc[abs(z_scores) > threshold, 'Kilometers_Driven']

outliers

In [None]:
# Keep only rows at or below a fixed cap of 300000 on 'Kilometers_Driven'.
df = df.loc[df["Kilometers_Driven"].le(300000)]

In [None]:
# Smallest seat count present -- a quick check for implausible values.
df["Seats"].min()

In [None]:
# Discard rows that report zero seats.
df = df.loc[df['Seats'].ne(0)]

In [None]:
# Number of distinct values in the 'Name' column.
df['Name'].nunique()

In [None]:
# Show a randomly chosen row (no seed is set, so it differs between runs).
df.sample()

In [None]:
# Derive two lower-cased features from 'Name', then drop 'Name' itself:
#   Company = first whitespace-separated word
#   Model   = first two words joined by a single space
# `df` is the result of boolean filtering at this point, so assigning columns on it
# directly triggers SettingWithCopyWarning; assign() builds a new frame instead.
# The name is also split only once rather than once per derived column.
# NOTE(review): a make consisting of several words would be cut to its first word -- confirm.
name_tokens = df['Name'].str.split()
df = df.assign(
    Company=name_tokens.str[0].str.lower(),
    Model=name_tokens.str[0:2].str.join(' ').str.lower(),
).drop(columns=['Name'])

In [None]:
# Distinct counts of the derived 'Company' and 'Model' columns, displayed as a tuple.
df['Company'].nunique() , df['Model'].nunique()

In [None]:
# Column layout: categorical features first, numeric features next, target ('Price') last.
new_order = [
    'Company', 'Model', 'Location', 'Owner_Type', 'Fuel_Type', 'Transmission',
    'Year', 'Kilometers_Driven', 'Mileage', 'Engine', 'Power', 'Seats',
    'Price',
]
df = df.reindex(new_order, axis='columns')

In [None]:
# Show a randomly chosen row to check the new column order.
df.sample()

In [None]:
# Number of rows per 'Company', every bar annotated with its count.
plt.figure(dpi=100, figsize=(20, 4))
graph = sns.countplot(x='Company', data=df)
graph.set_title('"Company"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
for cont in graph.containers:
    graph.bar_label(cont)
plt.show()

In [None]:
# Frequency of each 'Model' value, most common first.
df.Model.value_counts()

In [None]:
# Keep only rows whose 'Model' value occurs at least `threshold` times.
threshold = 10
counts = df['Model'].value_counts()
filtered_models = counts.loc[counts.ge(threshold)].index.tolist()
df = df.loc[df['Model'].isin(filtered_models)]

In [None]:
# Number of rows per 'Model' after the frequency filter, bars annotated with counts.
plt.figure(dpi=100, figsize=(20, 6))
graph = sns.countplot(x='Model', data=df)
graph.set_title('"Model"', fontsize=15)
plt.xticks(fontsize=12, rotation=90)
plt.yticks(fontsize=12)
for cont in graph.containers:
    graph.bar_label(cont)
plt.show()

In [None]:
# Number of rows per 'Location', bars annotated with counts.
plt.figure(dpi=100, figsize=(20, 4))
graph = sns.countplot(x='Location', data=df)
graph.set_title('"Location"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
for cont in graph.containers:
    graph.bar_label(cont)
plt.show()

In [None]:
# Number of rows per 'Owner_Type', bars annotated with counts.
plt.figure(dpi=100, figsize=(10, 4))
graph = sns.countplot(x='Owner_Type', data=df)
graph.set_title('"Owner_Type"', fontsize=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
for cont in graph.containers:
    graph.bar_label(cont)
plt.show()

In [None]:
# Number of rows per 'Fuel_Type', bars annotated with counts.
plt.figure(dpi=100, figsize=(10, 4))
graph = sns.countplot(x='Fuel_Type', data=df)
graph.set_title('"Fuel_Type"', fontsize=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
for cont in graph.containers:
    graph.bar_label(cont)
plt.show()

In [None]:
# Number of rows per 'Transmission', bars annotated with counts.
plt.figure(dpi=100, figsize=(10, 4))
graph = sns.countplot(x='Transmission', data=df)
graph.set_title('"Transmission"', fontsize=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
for cont in graph.containers:
    graph.bar_label(cont)
plt.show()

In [None]:
# Distribution of 'Year' with a KDE overlay and a summary-statistics box.
plt.figure(figsize=(10, 6), dpi=100)
# 'Year' is integer-valued, so discrete=True gives every year its own unit-width,
# centred bar whatever the actual range is. The previous fixed bins=21 was chosen
# for a stated 1998-2019 span, which contains 22 distinct years, so the last two
# years ended up merged into a single bar.
graph = sns.histplot(data=df, x='Year', discrete=True,
                     kde=True, edgecolor='white', alpha=0.5, line_kws={'lw': 2.5})
plt.xticks(rotation=45, fontsize=12)
plt.yticks(fontsize=12)
plt.title('"Year"', fontsize=15)

# One mathtext line per statistic.
textstr = '\n'.join((
    r'$\mu=%.2f$' %df['Year'].mean(),
    r'$\sigma=%.2f$' %df['Year'].std(),
    r'$\mathrm{median}=%.2f$' %np.median(df['Year']),
    r'$\mathrm{min}=%.2f$' %df['Year'].min(),
    r'$\mathrm{max}=%.2f$' %df['Year'].max()
))

# Position is given in axes coordinates (0-1), here the left side of the plot.
plt.text(0.05, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# Distribution of 'Kilometers_Driven' with a KDE overlay and a summary-statistics box.
plt.figure(dpi=100, figsize=(10, 6))
graph = sns.histplot(x='Kilometers_Driven', data=df, kde=True, alpha=0.5, edgecolor='white', line_kws={'lw': 2})
graph.set_title('"Kilometers_Driven"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

# One mathtext line per statistic: (format template, value) pairs joined by newlines.
textstr = '\n'.join(
    template % value
    for template, value in (
        (r'$\mu=%.2f$', df['Kilometers_Driven'].mean()),
        (r'$\sigma=%.2f$', df['Kilometers_Driven'].std()),
        (r'$\mathrm{median}=%.2f$', np.median(df['Kilometers_Driven'])),
        (r'$\mathrm{min}=%.2f$', df['Kilometers_Driven'].min()),
        (r'$\mathrm{max}=%.2f$', df['Kilometers_Driven'].max()),
    )
)

# Position is given in axes coordinates (0-1).
graph.text(0.725, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# Distribution of 'Mileage' with a KDE overlay and a summary-statistics box.
plt.figure(dpi=100, figsize=(10, 6))
graph = sns.histplot(x='Mileage', data=df, kde=True, alpha=0.5, edgecolor='white', line_kws={'lw': 2.5})
graph.set_title('"Mileage"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

# One mathtext line per statistic: (format template, value) pairs joined by newlines.
textstr = '\n'.join(
    template % value
    for template, value in (
        (r'$\mu=%.2f$', df['Mileage'].mean()),
        (r'$\sigma=%.2f$', df['Mileage'].std()),
        (r'$\mathrm{median}=%.2f$', np.median(df['Mileage'])),
        (r'$\mathrm{min}=%.2f$', df['Mileage'].min()),
        (r'$\mathrm{max}=%.2f$', df['Mileage'].max()),
    )
)

# Position is given in axes coordinates (0-1).
graph.text(0.725, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# Distribution of 'Engine' with a KDE overlay and a summary-statistics box.
plt.figure(dpi=100, figsize=(10, 6))
graph = sns.histplot(x='Engine', data=df, kde=True, alpha=0.5, edgecolor='white', line_kws={'lw': 2.5})
graph.set_title('"Engine"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

# One mathtext line per statistic: (format template, value) pairs joined by newlines.
textstr = '\n'.join(
    template % value
    for template, value in (
        (r'$\mu=%.2f$', df['Engine'].mean()),
        (r'$\sigma=%.2f$', df['Engine'].std()),
        (r'$\mathrm{median}=%.2f$', np.median(df['Engine'])),
        (r'$\mathrm{min}=%.2f$', df['Engine'].min()),
        (r'$\mathrm{max}=%.2f$', df['Engine'].max()),
    )
)

# Position is given in axes coordinates (0-1).
graph.text(0.725, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# Distribution of 'Power' with a KDE overlay and a summary-statistics box.
plt.figure(dpi=100, figsize=(10, 6))
graph = sns.histplot(x='Power', data=df, kde=True, alpha=0.5, edgecolor='white', line_kws={'lw': 2.5})
graph.set_title('"Power"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

# One mathtext line per statistic: (format template, value) pairs joined by newlines.
textstr = '\n'.join(
    template % value
    for template, value in (
        (r'$\mu=%.2f$', df['Power'].mean()),
        (r'$\sigma=%.2f$', df['Power'].std()),
        (r'$\mathrm{median}=%.2f$', np.median(df['Power'])),
        (r'$\mathrm{min}=%.2f$', df['Power'].min()),
        (r'$\mathrm{max}=%.2f$', df['Power'].max()),
    )
)

# Position is given in axes coordinates (0-1).
graph.text(0.75, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# Distribution of 'Seats' (11 bins) with a KDE overlay and a summary-statistics box.
plt.figure(dpi=100, figsize=(10, 6))
graph = sns.histplot(x='Seats', data=df, bins=11, kde=True, alpha=0.5, edgecolor='white', line_kws={'lw': 2.5})
graph.set_title('"Seats"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

# One mathtext line per statistic: (format template, value) pairs joined by newlines.
textstr = '\n'.join(
    template % value
    for template, value in (
        (r'$\mu=%.2f$', df['Seats'].mean()),
        (r'$\sigma=%.2f$', df['Seats'].std()),
        (r'$\mathrm{median}=%.2f$', np.median(df['Seats'])),
        (r'$\mathrm{min}=%.2f$', df['Seats'].min()),
        (r'$\mathrm{max}=%.2f$', df['Seats'].max()),
    )
)

# Position is given in axes coordinates (0-1).
graph.text(0.755, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# Distribution of the target 'Price' (11 bins) with a KDE overlay and a summary-statistics box.
plt.figure(dpi=100, figsize=(10, 6))
graph = sns.histplot(x='Price', data=df, bins=11, kde=True, alpha=0.5, edgecolor='white', line_kws={'lw': 2.5})
graph.set_title('"Price"', fontsize=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)

# One mathtext line per statistic: (format template, value) pairs joined by newlines.
textstr = '\n'.join(
    template % value
    for template, value in (
        (r'$\mu=%.2f$', df['Price'].mean()),
        (r'$\sigma=%.2f$', df['Price'].std()),
        (r'$\mathrm{median}=%.2f$', np.median(df['Price'])),
        (r'$\mathrm{min}=%.2f$', df['Price'].min()),
        (r'$\mathrm{max}=%.2f$', df['Price'].max()),
    )
)

# Position is given in axes coordinates (0-1).
graph.text(0.755, 0.7, textstr, transform=graph.transAxes, fontsize=14, bbox=dict(facecolor='white', alpha=0.7))

plt.show()

In [None]:
# 'Price' of every row grouped by 'Company' (one dot per row).
plt.figure(dpi=100, figsize=(15, 8))
sns.stripplot(data=df, x='Company', y='Price', size=3)
plt.title('Price vs Company', size=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
# Cap the y-axis at 100 for readability; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' of every row grouped by 'Location' (one dot per row).
plt.figure(dpi=100, figsize=(15, 8))
sns.stripplot(data=df, x='Location', y='Price', size=3)
plt.title('Price vs Location', size=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' of every row grouped by 'Owner_Type' (one dot per row).
plt.figure(dpi=100, figsize=(15, 8))
sns.stripplot(data=df, x='Owner_Type', y='Price', size=2.5)
plt.title('Price vs Owner_Type', size=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' of every row grouped by 'Fuel_Type' (one dot per row).
plt.figure(dpi=100, figsize=(15, 8))
sns.stripplot(data=df, x='Fuel_Type', y='Price', size=4)
plt.title('Price vs Fuel_Type', size=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' of every row grouped by 'Transmission' (one dot per row).
plt.figure(dpi=100, figsize=(15, 8))
sns.stripplot(data=df, x='Transmission', y='Price', size=4)
plt.title('Price vs Transmission', size=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' against 'Year' (full y-range, no axis cap).
plt.figure(dpi=100, figsize=(15, 8))
sns.scatterplot(x='Year', y='Price', data=df)
plt.title('Price vs Year', size=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# 'Price' against 'Kilometers_Driven' (full y-range, no axis cap).
plt.figure(dpi=100, figsize=(15, 8))
sns.scatterplot(x='Kilometers_Driven', y='Price', data=df)
plt.title('Price vs Kilometers_Driven', size=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# 'Price' against 'Mileage' (full y-range, no axis cap).
plt.figure(dpi=100, figsize=(15, 8))
sns.scatterplot(x='Mileage', y='Price', data=df)
plt.title('Price vs Mileage', size=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
plt.show()

In [None]:
# 'Price' against 'Engine'.
plt.figure(dpi=100, figsize=(15, 8))
sns.scatterplot(x='Engine', y='Price', data=df)
plt.title('Price vs Engine', size=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' against 'Power'.
plt.figure(dpi=100, figsize=(15, 8))
sns.scatterplot(x='Power', y='Price', data=df)
plt.title('Price vs Power', size=15)
plt.xticks(fontsize=12, rotation=0)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# 'Price' against 'Seats'.
plt.figure(dpi=100, figsize=(15, 8))
sns.scatterplot(x='Seats', y='Price', data=df)
plt.title('Price vs Seats', size=15)
plt.xticks(fontsize=12, rotation=45)
plt.yticks(fontsize=12)
# The y-axis is capped at 100; points above the cap are not drawn.
plt.ylim([0, 100])
plt.show()

In [None]:
# Categorical columns to one-hot encode.
CatCols = [
    'Company', 'Model', 'Location',
    'Owner_Type', 'Fuel_Type', 'Transmission',
]

# drop_first=True omits one indicator column per categorical feature.
df = pd.get_dummies(data=df, columns=CatCols, drop_first=True)
df.head(n=5)

In [None]:
# Pairwise correlations of all columns of the encoded frame, drawn without cell annotations.
plt.figure(figsize=(10, 6))
sns.heatmap(data=df.corr(), cmap='Blues', annot=False)
plt.show()

In [None]:
# Features: every column except the target. Target: 'Price'.
X = df.drop(columns=['Price'])
y = df.loc[:, 'Price']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=42,
    test_size=0.2,
)

In [None]:
# Report the shape of each of the four split parts.
for part_name, part in (
    ('X_train', X_train),
    ('X_test', X_test),
    ('y_train', y_train),
    ('y_test', y_test),
):
    print(part_name + ' shape: ', part.shape)

In [None]:
# Ordinary least-squares baseline, constructed with default settings and fit on the training split.
lin_reg = LinearRegression()
lin_reg.fit(X=X_train, y=y_train)

<div style="border: 2px solid #000000; border-radius: 5px; padding: 10px; font-family: 'Times New Roman', Times, serif; font-size: 18px;">
    <div style="text-align: left;">
        <p><strong>There are some performance measures used in statistics and data analysis processes. These metrics are used to evaluate a model's predictive ability and success. These metrics include Mean Absolute Error (MAE), Mean Square Error (MSE), Root Mean Square Error (RMSE), and R-Squared Score (R^2).</strong></p>
        <ol>
            <li><strong>Mean Absolute Error (MAE):</strong> It expresses the average of the absolute values of the differences between the predicted values and the actual values. A lower MAE value indicates that the model is performing better and its predictions are closer to the true values. This measurement is used to evaluate the accuracy of the model in regression problems. </li>
            <li><strong>Mean Square Error (MSE):</strong> It represents the average of the squared differences between predicted values and actual values. MSE is never negative, and it is zero only when every prediction is exact. Because the differences are squared, MSE penalizes large errors more heavily than small ones. It is used to evaluate the accuracy of the model in regression problems. </li>
            <li><strong>Root Mean Square Error (RMSE):</strong> It is calculated by taking the square root of MSE. RMSE allows errors to be interpreted by returning them to the original unit of measurement and is a derivative of MSE. It is used to evaluate the accuracy of the model in regression problems.</li>
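            <li><strong>R-Squared Score (R^2):</strong> It is a metric that measures how well a regression model fits the data. This score indicates how much of the variance of the dependent variable is explained by the independent variables. Its maximum value is 1 (a perfect fit); a value of 0 means the model does no better than always predicting the mean, and on held-out data R^2 can even be negative when the model does worse than that.</li>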
        </ol>
        <p><strong>When these metrics are used together, you can more comprehensively evaluate a model's performance. For example:</strong></p>
        <ul>
            <li>Lower MAE and RMSE values indicate a better model, one whose predictions deviate less from the actual values.</li>
            <li>There is a direct relationship between MSE and RMSE because RMSE is the square root of MSE.</li>
            <li>The R-Squared Score indicates how well the model fits the data. The closer it is to 1, the better the fit; the closer it is to 0 (or below), the less of the variation in the data the model explains.</li>
        </ul>
        <p><strong>In summary, by using these metrics together, it is possible to understand and compare the performance of a model. Lower error values (MAE, MSE, RMSE) and higher R-Squared Score indicate the presence of a better model.</strong></p>
    </div>
</div>

In [None]:
def evaluation(y_test, y_pred):
    """Compute four regression metrics for a set of predictions.

    Parameters
    ----------
    y_test : array-like
        True target values.
    y_pred : array-like
        Predicted target values, aligned with ``y_test``.

    Returns
    -------
    tuple
        ``(mae, mse, rmse, r_squared)`` in that order, where ``rmse`` is the
        square root of ``mse``.
    """
    mse = mean_squared_error(y_test, y_pred)
    return (
        mean_absolute_error(y_test, y_pred),
        mse,
        np.sqrt(mse),  # derived from the MSE computed above
        r2_score(y_test, y_pred),
    )

In [None]:
# Score the linear-regression baseline on the held-out split.
y_pred = lin_reg.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, y_pred)
for metric_label, metric_value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r_squared),
):
    print(metric_label, metric_value)

# Start the per-model score lists; the cells for the later models append to them,
# so this cell has to run before those.
train_score = [lin_reg.score(X_train, y_train)]
test_score = [lin_reg.score(X_test, y_test)]

In [None]:
# Ridge regression with its default hyper-parameters, fit on the training split.
ridge = Ridge()
ridge.fit(X=X_train, y=y_train)

In [None]:
# Score the ridge model on the held-out split.
predictions = ridge.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
for metric_label, metric_value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r_squared),
):
    print(metric_label, metric_value)

# Appends to the shared lists: re-running this cell adds a duplicate entry.
train_score += [ridge.score(X_train, y_train)]
test_score += [ridge.score(X_test, y_test)]

In [None]:
# Lasso regression with its default hyper-parameters, fit on the training split.
# NOTE(review): the features are not scaled before this penalised fit -- confirm that is intended.
lasso = Lasso()
lasso.fit(X=X_train, y=y_train)

In [None]:
# Score the lasso model on the held-out split.
predictions = lasso.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
for metric_label, metric_value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r_squared),
):
    print(metric_label, metric_value)

# Appends to the shared lists: re-running this cell adds a duplicate entry.
train_score += [lasso.score(X_train, y_train)]
test_score += [lasso.score(X_test, y_test)]

In [None]:
# Elastic-net regression with its default hyper-parameters, fit on the training split.
# NOTE(review): the features are not scaled before this penalised fit -- confirm that is intended.
elastic_net = ElasticNet()
elastic_net.fit(X=X_train, y=y_train)

In [None]:
# Score the elastic-net model on the held-out split.
predictions = elastic_net.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
for metric_label, metric_value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r_squared),
):
    print(metric_label, metric_value)

# Appends to the shared lists: re-running this cell adds a duplicate entry.
train_score += [elastic_net.score(X_train, y_train)]
test_score += [elastic_net.score(X_test, y_test)]

In [None]:
# Random forest: 50 trees, depth limited to 10, at least 5 samples needed to split a node.
# random_state pins the forest's internal randomness; without it the fitted model
# (and every metric derived from it) changes on each run. The value 42 matches the
# seed already used for the train/test split.
random_forest = RandomForestRegressor(
    n_estimators=50,
    max_depth=10,
    min_samples_split=5,
    random_state=42,
)
random_forest.fit(X_train, y_train)

In [None]:
# Score the random-forest model on the held-out split.
predictions = random_forest.predict(X_test)

mae, mse, rmse, r_squared = evaluation(y_test, predictions)
for metric_label, metric_value in (
    ("MAE:", mae),
    ("MSE:", mse),
    ("RMSE:", rmse),
    ("R2 Score:", r_squared),
):
    print(metric_label, metric_value)

# Appends to the shared lists: re-running this cell adds a duplicate entry.
train_score += [random_forest.score(X_train, y_train)]
test_score += [random_forest.score(X_test, y_test)]

In [None]:
import matplotlib.pyplot as plt

def plot_actual_vs_predicted(ax, model_name, y_test, predictions):
    """Draw an actual-vs-predicted scatter with a y = x reference line.

    Parameters
    ----------
    ax : matplotlib Axes (or any object offering the same drawing methods)
        Target axes; it is modified in place.
    model_name : str
        Inserted into the panel title.
    y_test : sequence of numbers
        Actual target values (x-axis).
    predictions : sequence of numbers
        Predicted values aligned with ``y_test`` (y-axis).

    Returns
    -------
    None
    """
    ax.scatter(y_test, predictions)
    # Reference diagonal: a single segment between the smallest and largest actual
    # value. Plotting y_test against itself sends the path back and forth along the
    # diagonal in data order, so the overlapping dashes blur into a near-solid line.
    if len(y_test) > 0:
        lowest, highest = min(y_test), max(y_test)
        ax.plot([lowest, highest], [lowest, highest], color='gray', linestyle='--')
    ax.set_title(f'Actual vs Predicted Prices ({model_name})')
    ax.set_xlabel('Actual Prices')
    ax.set_ylabel('Predicted Prices')
    ax.grid(True)

# 3x2 grid: one actual-vs-predicted panel per model, the sixth slot stays empty.
fig, axs = plt.subplots(3, 2, figsize=(16, 12))
axs = axs.flatten()

# Test-set predictions of every fitted model.
lin_reg_predictions = lin_reg.predict(X_test)
ridge_predictions = ridge.predict(X_test)
lasso_predictions = lasso.predict(X_test)
elastic_net_predictions = elastic_net.predict(X_test)
random_forest_predictions = random_forest.predict(X_test)

# Pair each panel title with its predictions and draw the panels in order.
for panel_axis, (panel_title, panel_predictions) in zip(axs, (
    ('Linear Regression', lin_reg_predictions),
    ('Ridge Regression', ridge_predictions),
    ('Lasso Regression', lasso_predictions),
    ('Elastic Net Regression', elastic_net_predictions),
    ('Random Forest Regression', random_forest_predictions),
)):
    plot_actual_vs_predicted(panel_axis, panel_title, y_test, panel_predictions)

# Hide the unused sixth panel.
axs[5].axis('off')

plt.tight_layout()
plt.show()


In [None]:
model_names = ['Linear Regression','Ridge Regression','Lasso Regression','Elastic-Net Regression','Random Forest Regression']
# Fitted estimators, in the same order as `model_names`.
fitted_models = [lin_reg, ridge, lasso, elastic_net, random_forest]

# Scores are computed here, directly from the fitted models, rather than read from
# lists that other cells append to: those lists grow whenever an evaluation cell is
# re-run, which misaligns them with `model_names`. Building the frame column by
# column also keeps the score columns numeric instead of object dtype.
# The "Accuracy" columns hold the value returned by each estimator's .score(),
# which for these regressors is the R^2 coefficient, not a classification accuracy.
scores = pd.DataFrame({
    'Model': model_names,
    'Training Set Accuracy': [model.score(X_train, y_train) for model in fitted_models],
    'Test Set Accuracy': [model.score(X_test, y_test) for model in fitted_models],
})

scores