In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Load the dataset

In [None]:
# Read the car-purchasing records into the working frame `df`.
# The file is decoded as ISO-8859-1 (Latin-1) -- presumably because it
# contains characters that are not valid UTF-8; TODO confirm against the raw CSV.
df = pd.read_csv("car_purchasing.csv", encoding='ISO-8859-1')

In [None]:
# Bare last expression: Jupyter renders the frame as this cell's output.
df

# Exploratory Data Analysis (EDA)

Display the first few rows

In [None]:
# Preview the first five rows. Leaving the frame as the cell's last
# expression (instead of print) lets Jupyter render it as a table, which
# stays readable when the frame is too wide for wrapped plain text.
df.head()

Check for missing values

In [None]:
# Count of missing entries in each column.
print(df.isna().sum())

In [None]:
# Grand total of missing entries across the whole frame.
print(df.isna().sum().sum())


Summary statistics

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric
# columns. Returned as the cell's last expression so Jupyter renders a
# table rather than wrapped plain text.
df.describe()

# Data visualization

Correlation heatmap

In [None]:
# Pairwise correlation of the numeric columns only. The frame is expected to
# hold text columns as well (e.g. "country", plotted as a categorical feature
# in this notebook), and DataFrame.corr() raises on non-numeric data in
# pandas >= 2.0, so restrict to numeric dtypes before correlating.
corr_matrix = df.select_dtypes(include="number").corr()
sns.heatmap(corr_matrix, annot=True, cmap="coolwarm")
plt.title("Correlation Heatmap")
plt.show()



Histograms for numeric features



In [None]:
# Continuous columns explored in the plots that follow.
numeric_features = ["age", "annual Salary", "credit card debt", "net worth"]

# One 20-bin histogram with a KDE overlay per numeric column.
for column in numeric_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.histplot(df[column], bins=20, kde=True, ax=ax)
    ax.set_title(f"{column} Distribution")
    ax.set_xlabel(column)
    ax.set_ylabel("Frequency")
    plt.show()


Box plots for numeric features

In [None]:

# One horizontal box plot per numeric column to eyeball potential outliers.
for column in numeric_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.boxplot(data=df, x=column, ax=ax)
    ax.set_title(f"{column} Outliers")
    ax.set_xlabel(column)
    plt.show()


Count plots for categorical features

In [None]:
# Columns treated as categories; each gets a bar chart of value counts.
categorical_features = ["country", "gender"]

for column in categorical_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.countplot(data=df, x=column, ax=ax)
    ax.set_title(f"{column} Distribution")
    ax.set_xlabel(column)
    ax.set_ylabel("Count")
    plt.show()


Scatter plots for numeric features vs. car purchase amount

In [None]:
# Plot each numeric column against the target to inspect its relationship
# with the car purchase amount.
for column in numeric_features:
    fig, ax = plt.subplots(figsize=(8, 5))
    sns.scatterplot(data=df, x=column, y="car purchase amount", ax=ax)
    ax.set_title(f"{column} vs. Car Purchase Amount")
    ax.set_xlabel(column)
    ax.set_ylabel("Car Purchase Amount")
    plt.show()


# Model Data

In [None]:
# Feature matrix: the four numeric predictors, all rows.
X = df.loc[:, ["age", "annual Salary", "credit card debt", "net worth"]]

In [None]:
# Bare last expression: show the feature matrix as the cell output.
X

In [None]:
# Regression target: the amount each customer spent on a car.
y = df.loc[:, "car purchase amount"]

In [None]:
# Bare last expression: show the target series as the cell output.
y

Split the data into training and testing sets

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
(X_train, X_test, y_train, y_test) = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Model Building and Training

In [None]:
# Linear regression estimator with scikit-learn's default settings.
model = LinearRegression()

In [None]:
# Fit on the training split only; the test split is not used until evaluation.
model.fit(X_train, y_train)

# Model Evaluation

In [None]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Report each metric on its own line.
for label, value in (
    ("Mean Absolute Error:", mae),
    ("Mean Squared Error:", mse),
    ("R-squared:", r2),
):
    print(label, value)

# Making Predictions

In [None]:
# Predict for one hypothetical customer. The model was fitted on a DataFrame,
# so scikit-learn recorded the feature names; passing a bare list triggers its
# "X does not have valid feature names" warning and silently relies on
# positional order. Wrapping the row in a DataFrame that reuses the training
# columns keeps names and order aligned with what the model was fitted on.
new_customer_data = pd.DataFrame(
    [[35, 80000, 2000, 90000]],  # Age, Annual Salary, Credit Card Debt, Net Worth
    columns=X.columns,
)
predicted_purchase_amount = model.predict(new_customer_data)
print("Predicted Car Purchase Amount:", predicted_purchase_amount)