In [1]:
import pandas as pd


In [2]:
df = pd.read_csv('/content/data.csv')
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '/content/data.csv'

In [None]:
df.info()

In [None]:
# notice that all float columns excent the target are actually int, convert them to int
float_cols = df.drop('price_eur', axis=1).select_dtypes(include=['float64']).columns

# convert to integer type, keeping NaNs
df[float_cols] = df[float_cols].astype('Int64')


In [None]:
df.isnull().sum()

In [None]:
# calculate missing values percentage per column
missing = df.isna().mean() * 100
missing

In [None]:
df = df.drop(columns=['color_slug', 'stk_year']) # too many missing values
df = df.drop(columns=['date_last_seen', 'date_created']) # doesnt say anything valuable about the car


In [None]:
# dividing the columns into the respective types so that we impute the median for the numerical missing values and the mode for object type

obj_cols = df.select_dtypes(include=['object']).columns
num_cols = df.select_dtypes(include=['Int64']).columns


In [None]:
# fill missing values with the mode of each object type column
for col in obj_cols:
    mode_val = df[col].mode()[0]  # mode() returns a Series, take the first value
    df[col] = df[col].fillna(mode_val)

# fill missing values with the median of each Int64 type column
for col in num_cols:
    median_val = df[col].median()
    df[col] = df[col].fillna(median_val)


In [None]:
df.isnull().sum()

In [None]:
df.head()

In [None]:
# encode the values from object type columns to numeric values for further processing
from sklearn.preprocessing import LabelEncoder

for col in obj_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col].astype(str)) + 1 # added the +1 to make sure the 0 doesnt make null values in further calculations

In [None]:
df.head()

In [None]:
# check for outliers before scaling
import matplotlib.pyplot as plt

plt.figure(figsize=(12, 6))
df[num_cols].boxplot(rot=45)
plt.title("outlier detection")
plt.show()


In [None]:
# plot didnt offer much significant information, lets check their actual number

for col in num_cols:
  Q1 = df[col].quantile(0.25)
  Q3 = df[col].quantile(0.75)
  IQR = Q3 - Q1

  outliers = df[(df[col] < Q1 - 1.5*IQR) | (df[col] > Q3 + 1.5*IQR)]
  num_outliers = outliers.shape[0]

  print(f"number of outliers in '{col}': {num_outliers}")


In [None]:
df.describe()

In [66]:
# some extreme values are indicated in the table, such as maximum values for price and mileage, so i will scale these columns
from sklearn.preprocessing import RobustScaler # less sensitive to outliers, based on IQR and median
num_cols.append(pd.Index(df['price_eur']))

# scale only those with range > 10000
scaler = RobustScaler()
for col in num_cols:
    col_range = df[col].max() - df[col].min()
    if col_range > 10000:
        df[[col]] = scaler.fit_transform(df[[col]])



TypeError: all inputs must be Index

In [None]:
# correlation of target column to all columns is very low, but approximately in the same range for all of them
# lets divide the data into training and test sets and train the model

from sklearn.model_selection import train_test_split

X = df.drop('price_eur', axis=1)
y = df['price_eur']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mean squared error: {mse:.2f}")
print(f"r2 Score: {r2:.2f}")


In [None]:
# something is definitely done wrong, let me try analyzing everything one more

In [None]:
df.head()

In [None]:
# i suppose i didnt process the outliers enough
# i'll tryy capping at 1st and 99th percentiles
for col in num_cols:
    lower = df[col].quantile(0.01)
    upper = df[col].quantile(0.99)
    df[col] = df[col].clip(lower, upper)

# log-transform target to compress extremes
import numpy as np
df['price_eur'] = np.log1p(df['price_eur'])


In [None]:
df.head()

In [None]:
df.describe()

In [None]:
# try with the model oce more

X = df.drop('price_eur', axis=1)
y = df['price_eur']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mean squared error: {mse:.2f}")
print(f"r2 Score: {r2:.2f}")


In [None]:
# much better, but there is still room for imrpovement
# maybe feature engineering could work

In [None]:
df.columns

In [None]:
df['car_age'] = 2025 - df['manufacture_year']
df['mileage_per_year'] = df['mileage'] / (2025 - df['manufacture_year'] + 1)
df['high_mileage'] = (df['mileage'] > df['mileage'].quantile(0.75)).astype(int)


In [None]:
df = df.drop(columns=['mileage', 'manufacture_year']) # because the new columns contain all information from these

In [None]:
# try with the model oce more

X = df.drop('price_eur', axis=1)
y = df['price_eur']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mean squared error: {mse:.2f}")
print(f"r2 Score: {r2:.2f}")


In [None]:
# no changes, lets try dimensionality reduction
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

X = df.values

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

df_pca = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])



In [None]:
X = df_pca.drop('price_eur', axis=1)
y = df_pca['price_eur']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"mean squared error: {mse:.2f}")
print(f"r2 Score: {r2:.2f}")
