In [9]:
# 1. Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# 2. Load the data
# latin1 maps every byte to a character, so decoding cannot fail
# (presumably the CSV holds non-UTF-8 characters — confirm against the file).
file_path = 'car_purchasing.csv'
df = pd.read_csv(file_path, encoding='latin1')

# 3. Clean column names (lowercase and strip spaces)
# Later steps address columns by their lowercase names (e.g. 'annual salary').
df.columns = df.columns.str.lower().str.strip()

# 4. Explore the data
print(df.head())
print("\nDataset Info:\n")
# DataFrame.info() writes its report to stdout itself and returns None;
# wrapping it in print() would emit a stray "None" after the report.
df.info()
print("\nMissing Values:\n")
print(df.isnull().sum())

# 5. Handle missing values
# Drop every row that has at least one missing value.
df = df.dropna()

# 6. Detect and handle outliers (using IQR method)
def remove_outliers(dataframe, columns):
    """Drop rows whose values fall outside the 1.5 * IQR fences.

    Columns are processed in the order given, and each column's quartiles
    are computed on the rows that survived the filters of the preceding
    columns, so the result can depend on the order of ``columns``.

    Args:
        dataframe: Input ``pandas.DataFrame``. It is not modified; a
            filtered frame is returned instead.
        columns: Iterable of column labels to check. Columns that are not
            numeric (and boolean columns) are skipped.

    Returns:
        The rows of ``dataframe`` whose value lies within
        ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]`` (bounds inclusive) for every
        numeric column listed.

    Raises:
        KeyError: If a label in ``columns`` is not a column of ``dataframe``.
    """
    for col in columns:
        series = dataframe[col]
        # Accept every numeric dtype (int32, float32, nullable integers, ...)
        # rather than only float64/int64. pandas treats bool as numeric, but
        # quartile fences are meaningless for flags, so those are skipped.
        is_numeric = pd.api.types.is_numeric_dtype(series)
        if not is_numeric or pd.api.types.is_bool_dtype(series):
            continue

        q1 = series.quantile(0.25)
        q3 = series.quantile(0.75)
        iqr = q3 - q1
        lower = q1 - 1.5 * iqr
        upper = q3 + 1.5 * iqr
        # between() is inclusive on both ends by default.
        dataframe = dataframe[series.between(lower, upper)]
    return dataframe

# Apply outlier removal
# Note that the target ('car purchase amount') is filtered as well.
numeric_cols = ['age', 'annual salary', 'credit card debt', 'net worth', 'car purchase amount']
df = remove_outliers(df, numeric_cols)

# 7. Feature Selection
# Drop the target plus the customer name, e-mail and country columns;
# every remaining column is used as a feature.
drop_cols = ['car purchase amount', 'customer name', 'customer e-mail', 'country']
X = df.drop(drop_cols, axis=1)
y = df['car purchase amount']

# 8. Train/test split
# Split BEFORE scaling: fitting the scaler on the full dataset would leak the
# test rows' mean/variance into training and bias the reported test metrics.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 9. Feature Scaling
# Learn the scaling statistics from the training rows only, then apply the
# same transformation to the held-out rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Scaled copy of the complete feature matrix (using the train-fitted scaler),
# kept so code that refers to X_scaled keeps working.
X_scaled = scaler.transform(X)

# 10. Model Training
# Candidate regressors, keyed by the display name used in the printed reports.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(n_estimators=100, random_state=42),
    "XGBoost": XGBRegressor(n_estimators=100, learning_rate=0.1, random_state=42),
}

# Maps model name -> {metric label -> score on the test split}.
results = {}
metric_labels = ("MAE", "RMSE", "R2 Score")

for name, model in models.items():
    # Fit on the training split, predict the held-out split.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    mae, rmse, r2 = (
        mean_absolute_error(y_test, y_pred),
        np.sqrt(mean_squared_error(y_test, y_pred)),  # RMSE = sqrt(MSE)
        r2_score(y_test, y_pred),
    )
    results[name] = dict(zip(metric_labels, (mae, rmse, r2)))

    # Per-model report, one line per metric in insertion order.
    print(f"\n{name} Performance:")
    for label, value in results[name].items():
        print(f"{label}: {value:.2f}")

# 11. Model Comparison
# Transpose so that each row is a model and each column a metric.
result_df = pd.DataFrame(results).transpose()
print("\nModel Comparison:\n")
print(result_df)

# 12. Feature Importance (using the best model)
# The winner is the entry of `results` with the highest 'R2 Score';
# on a tie, max() keeps the first one in insertion order.
best_model_name = max(results, key=lambda model_name: results[model_name]['R2 Score'])
print(f"\nBest Model: {best_model_name}")

# Only the two tree-based models are inspected via feature_importances_ here.
if best_model_name in ("Random Forest", "XGBoost"):
    best_model = models[best_model_name]
    feature_importances, features = best_model.feature_importances_, X.columns

    # One row per feature, most important first.
    importance_df = pd.DataFrame(
        {'Feature': features, 'Importance': feature_importances}
    ).sort_values(by='Importance', ascending=False)

    plt.figure(figsize=(10, 6))
    sns.barplot(data=importance_df, x='Importance', y='Feature')
    plt.title('Feature Importance')
    plt.show()

    print("\nTop 5 Factors Driving Sales Growth:")
    # head() defaults to the first five rows.
    print(importance_df.head())

# 13. Insights Section
print("\n--- Insights ---\n")

# Model insights
# Metrics of the model selected above (highest R2 Score).
best_scores = results[best_model_name]
print(f"🏆 Best performing model: {best_model_name}")
print(f"📈 Best R2 Score: {best_scores['R2 Score']:.2f}")
print(f"📉 Best RMSE: {best_scores['RMSE']:.2f}")

# Features insights
# importance_df only exists when the best model is one of these two.
if best_model_name in ("Random Forest", "XGBoost"):
    top_features = importance_df.head(5)
    print("\n🔑 Top 5 important features influencing car purchase amount:")
    for feature, importance in zip(top_features['Feature'], top_features['Importance']):
        print(f"- {feature} (Importance: {importance:.4f})")

# General data insights
# NOTE(review): these statements are fixed text, not computed from the data
# or the fitted models — confirm they still hold whenever the dataset changes.
print("\n💡 General Insights:")
general_insights = (
    "- Higher Annual Salary tends to strongly increase Car Purchase Amount.",
    "- Net Worth is another major factor: higher net worth, higher car purchase budgets.",
    "- Credit Card Debt negatively affects purchasing power but not as strongly as salary/net worth.",
    "- Gender seems less important than financial factors.",
    "- Age has moderate impact, but younger to middle-aged customers (25-45) showed higher spending.",
)
for insight in general_insights:
    print(insight)

print("\n✅ End of Analysis.")


     customer name                                    customer e-mail  \
0    Martina Avila  cubilia.Curae.Phasellus@quisaccumsanconvallis.edu   
1    Harlan Barnes                                eu.dolor@diam.co.uk   
2  Naomi Rodriquez  vulputate.mauris.sagittis@ametconsectetueradip...   
3  Jade Cunningham                            malesuada@dignissim.com   
4     Cedric Leach     felis.ullamcorper.viverra@egetmollislectus.net   

        country  gender        age  annual salary  credit card debt  \
0      Bulgaria       0  41.851720    62812.09301      11609.380910   
1        Belize       0  40.870623    66646.89292       9572.957136   
2       Algeria       1  43.152897    53798.55112      11160.355060   
3  Cook Islands       1  58.271369    79370.03798      14426.164850   
4        Brazil       1  57.313749    59729.15130       5358.712177   

     net worth  car purchase amount  
0  238961.2505          35321.45877  
1  530973.9078          45115.52566  
2  638467.1773      