# In [None]:
# 🚗 Car Purchase Amount Prediction

# 1. Import Libraries
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# 2. Load Dataset
# Read the raw CSV as Latin-1 and print the first five rows as a quick check.
df = pd.read_csv(
    filepath_or_buffer='../data/car_purchasing.csv',
    encoding='latin1',
)
print(df.head(n=5))

# 3. Preprocessing
# Remove the name, e-mail and country columns, then separate the target
# from the remaining columns, which all become model features.
df = df.drop(columns=['customer name', 'customer e-mail', 'country'])

y = df['car purchase amount']
X = df.drop(columns='car purchase amount')

# 4. Train-Test Split
# Hold out 20% of the rows for testing; the fixed seed keeps the split the
# same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 5. Train Models
# Linear regression: fit on the training split, then predict the held-out rows.
lr = LinearRegression().fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)

# Random forest: seed the estimator so its scores, feature importances and
# plots are reproducible between runs, matching the seeded train/test split.
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# 6. Evaluation
# R2 alone does not show the size of the errors, so MAE and RMSE (both in
# the units of the target column) are reported after the R2 scores.
print("Linear Regression R2 Score:", r2_score(y_test, y_pred_lr))
print("Random Forest R2 Score:", r2_score(y_test, y_pred_rf))

for model_name, y_pred in (
    ("Linear Regression", y_pred_lr),
    ("Random Forest", y_pred_rf),
):
    print(f"{model_name} MAE:", mean_absolute_error(y_test, y_pred))
    # RMSE is taken as the square root of MSE.
    print(f"{model_name} RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))

# 7. Feature Importance (Random Forest)
# Rank the features by the importance scores of the fitted random forest.
importances = rf.feature_importances_
feature_importance_df = pd.DataFrame({'Feature': X.columns, 'Importance': importances})
feature_importance_df = feature_importance_df.sort_values('Importance', ascending=False)

plt.figure(figsize=(10,6))
sns.barplot(x='Importance', y='Feature', data=feature_importance_df)
plt.title('Feature Importance')
# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
importance_plot_path = Path('../images/feature_importance.png')
importance_plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(importance_plot_path)
plt.show()

# 8. Actual vs Predicted Plot
# Scatter the random forest predictions against the true test targets.
plt.figure(figsize=(8,6))
plt.scatter(y_test, y_pred_rf)
plt.xlabel('Actual Car Purchase Amount')
plt.ylabel('Predicted Car Purchase Amount')
plt.title('Actual vs Predicted - Random Forest')
# savefig does not create missing directories, so make sure the output
# folder exists before writing the image.
scatter_plot_path = Path('../images/prediction_vs_actual.png')
scatter_plot_path.parent.mkdir(parents=True, exist_ok=True)
plt.savefig(scatter_plot_path)
plt.show()
