In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

plt.rcParams["figure.figsize"] = (10,6)
pd.set_option('display.max_columns', 100)

In [17]:
data = pd.read_csv('final_scout_dummy2022_short2.csv')

In [18]:
df = data.copy()

In [19]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21769 entries, 0 to 21768
Data columns (total 40 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   location                             21769 non-null  object 
 1   price                                21769 non-null  float64
 2   type_ordinal                         21769 non-null  float64
 3   warranty_ordinal                     21769 non-null  float64
 4   gearbox_ordinal                      21769 non-null  float64
 5   mileage                              21769 non-null  float64
 6   fuel_type                            21769 non-null  float64
 7   seller                               21769 non-null  float64
 8   engine_size                          21769 non-null  float64
 9   gears                                21769 non-null  float64
 10  co_emissions                         21769 non-null  float64
 11  drivetrain                  

In [22]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,21769.0,18732.678809,9755.227127,490.0,11990.0,16950.0,24480.0,51940.0
type_ordinal,21769.0,0.196242,0.598742,0.0,0.0,0.0,0.0,3.0
warranty_ordinal,21769.0,0.489457,0.4999,0.0,0.0,0.0,1.0,1.0
gearbox_ordinal,21769.0,0.345721,0.479174,0.0,0.0,0.0,1.0,2.0
mileage,21769.0,66752.320723,55616.908617,0.0,20600.0,55500.0,99930.0,240364.0
fuel_type,21769.0,0.415729,0.601116,0.0,0.0,0.0,1.0,3.0
seller,21769.0,0.94607,0.225885,0.0,1.0,1.0,1.0,1.0
engine_size,21769.0,1509.619045,437.771369,0.0,1199.0,1498.0,1798.0,3597.0
gears,21769.0,5.699389,1.388427,1.0,5.0,6.0,6.0,8.0
co_emissions,21769.0,120.443302,40.875942,0.0,105.0,119.0,139.0,263.0


In [32]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Separate features and target
X = df.drop(columns=['price'])
y = df['price']

# Identify categorical features
categorical_features = ['location']  # Add any other categorical features here
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

# Create a preprocessor with appropriate transformations
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features)
    ])

# Create a pipeline that first transforms the data then applies a model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the model
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Display model coefficients (optional, only for linear regression)
coefficients = pipeline.named_steps['regressor'].coef_
features = numerical_features + pipeline.named_steps['preprocessor'].transformers_[1][1].get_feature_names_out(categorical_features).tolist()
coeff_df = pd.DataFrame(coefficients, index=features, columns=['Coefficient'])
print(coeff_df)

Mean Squared Error: 15395218.606661238
R-squared: 0.8399003175748253
                                                    Coefficient
type_ordinal                                         688.841034
warranty_ordinal                                     129.983501
gearbox_ordinal                                     1187.908539
mileage                                            -2564.797661
fuel_type                                            279.773951
...                                                         ...
location_Äussere Nuernberger Strasse 60,  91301...  -131.688686
location_Äußere Leipziger Str. 82,  04435 Schke... -1654.459567
location_Östliche Weinstraße 2,  93309 Kelheim, DE  4030.651106
location_Ötterichweg 3,  90411 Nürnberg, DE        -7933.654728
location_Ückendorfer Str. 54,  45327 Essen, DE      -260.699100

[5589 rows x 1 columns]


In [39]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from category_encoders import TargetEncoder

# Separate features and target
X = df.drop(columns=['price'])
y = df['price']

# Identify categorical and numerical features
categorical_features = ['location']  # Add any other categorical features here
numerical_features = X.select_dtypes(exclude=['object']).columns.tolist()

# Create a target encoder for the categorical features
target_encoder = TargetEncoder(cols=categorical_features, smoothing=0.5)

# Initialize an empty DataFrame to store the encoded categorical features
X_encoded = X.copy()

# Initialize a KFold cross-validator
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Apply target encoding with KFold cross-validation to avoid data leakage
for train_index, val_index in kf.split(X):
    X_train_fold, X_val_fold = X.iloc[train_index], X.iloc[val_index]
    y_train_fold = y.iloc[train_index]
    
    # Fit the target encoder on the training fold
    target_encoder.fit(X_train_fold[categorical_features], y_train_fold)
    
    # Transform the validation fold and store the encoded values
    X_encoded.iloc[val_index] = X_encoded.iloc[val_index].assign(
        **{col: target_encoder.transform(X_val_fold[col]) for col in categorical_features}
    )

# Replace the original categorical columns with their encoded values
X[categorical_features] = X_encoded[categorical_features]

# Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the target encoder on the entire training set and transform the test set
target_encoder.fit(X_train[categorical_features], y_train)
X_test[categorical_features] = target_encoder.transform(X_test[categorical_features])

# Create a preprocessing pipeline for numerical features
numerical_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Combine the transformers into a column transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_features)
    ])

# Create a pipeline that first transforms the data then applies a model
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions
y_pred = pipeline.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')

# Display model coefficients (optional, only for linear regression)
coefficients = pipeline.named_steps['regressor'].coef_
features = numerical_features + categorical_features
coeff_df = pd.DataFrame(coefficients, index=features, columns=['Coefficient'])
print(coeff_df)

Mean Squared Error: 14544250.465659825
R-squared: 0.8487498008208325


ValueError: Shape of passed values is (38, 1), indices imply (39, 1)