In [4]:
import pandas as pd
import numpy as np


In [5]:
df = pd.read_csv(r"C:\Users\Vijay\Downloads\car_purchasing.csv", encoding='ISO-8859-1')


In [6]:
print(df.head())


     customer name                                    customer e-mail  \
0    Martina Avila  cubilia.Curae.Phasellus@quisaccumsanconvallis.edu   
1    Harlan Barnes                                eu.dolor@diam.co.uk   
2  Naomi Rodriquez  vulputate.mauris.sagittis@ametconsectetueradip...   
3  Jade Cunningham                            malesuada@dignissim.com   
4     Cedric Leach     felis.ullamcorper.viverra@egetmollislectus.net   

        country  gender        age  annual Salary  credit card debt  \
0      Bulgaria       0  41.851720    62812.09301      11609.380910   
1        Belize       0  40.870623    66646.89292       9572.957136   
2       Algeria       1  43.152897    53798.55112      11160.355060   
3  Cook Islands       1  58.271369    79370.03798      14426.164850   
4        Brazil       1  57.313749    59729.15130       5358.712177   

     net worth  car purchase amount  
0  238961.2505          35321.45877  
1  530973.9078          45115.52566  
2  638467.1773      

In [7]:
print(df.info())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   customer name        500 non-null    object 
 1   customer e-mail      500 non-null    object 
 2   country              500 non-null    object 
 3   gender               500 non-null    int64  
 4   age                  500 non-null    float64
 5   annual Salary        500 non-null    float64
 6   credit card debt     500 non-null    float64
 7   net worth            500 non-null    float64
 8   car purchase amount  500 non-null    float64
dtypes: float64(5), int64(1), object(3)
memory usage: 35.3+ KB
None


In [8]:
print(df.describe())


           gender         age  annual Salary  credit card debt  \
count  500.000000  500.000000     500.000000        500.000000   
mean     0.506000   46.241674   62127.239608       9607.645049   
std      0.500465    7.978862   11703.378228       3489.187973   
min      0.000000   20.000000   20000.000000        100.000000   
25%      0.000000   40.949969   54391.977195       7397.515792   
50%      1.000000   46.049901   62915.497035       9655.035568   
75%      1.000000   51.612263   70117.862005      11798.867487   
max      1.000000   70.000000  100000.000000      20000.000000   

            net worth  car purchase amount  
count      500.000000           500.000000  
mean    431475.713625         44209.799218  
std     173536.756340         10773.178744  
min      20000.000000          9000.000000  
25%     299824.195900         37629.896040  
50%     426750.120650         43997.783390  
75%     557324.478725         51254.709517  
max    1000000.000000         80000.000000  


In [9]:
print(df.isnull().sum())


customer name          0
customer e-mail        0
country                0
gender                 0
age                    0
annual Salary          0
credit card debt       0
net worth              0
car purchase amount    0
dtype: int64


In [11]:
from sklearn.preprocessing import LabelEncoder

for col in df.select_dtypes(include='object').columns:
    df[col] = LabelEncoder().fit_transform(df[col])


In [12]:
from scipy.stats import zscore
df = df[(np.abs(zscore(df)) < 3).all(axis=1)]


In [14]:
X = df.drop('car purchase amount', axis=1)
y = df['car purchase amount']


In [15]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)


In [16]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)


In [17]:
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


RandomForestRegressor(random_state=42)

In [21]:
y_pred = model.predict(X_test)

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

print("R² Score:", r2_score(y_test, y_pred))
print("MAE:", mean_absolute_error(y_test, y_pred))
print("RMSE:", np.sqrt(mean_squared_error(y_test, y_pred)))


R² Score: 0.9428105666696649
MAE: 1896.9604100696922
RMSE: 2466.675880087966


In [19]:
df['Predicted Purchase Amount'] = model.predict(scaler.transform(X))
df['Customer Segment'] = pd.cut(df['Predicted Purchase Amount'], 
                                bins=[0, 20000, 40000, 60000],
                                labels=['Low Spender', 'Mid Spender', 'High Spender'])


In [20]:
def recommend_strategy(segment):
    if segment == 'Low Spender':
        return 'Offer discounts, highlight affordability'
    elif segment == 'Mid Spender':
        return 'Use trade-in offers or financing'
    elif segment == 'High Spender':
        return 'Target with luxury add-ons and upgrades'
    
df['Marketing Strategy'] = df['Customer Segment'].apply(recommend_strategy)


In [22]:
print(df[['Customer Segment', 'Marketing Strategy']])


    Customer Segment                       Marketing Strategy
0        Mid Spender         Use trade-in offers or financing
1       High Spender  Target with luxury add-ons and upgrades
2       High Spender  Target with luxury add-ons and upgrades
3                NaN                                      NaN
4       High Spender  Target with luxury add-ons and upgrades
..               ...                                      ...
495     High Spender  Target with luxury add-ons and upgrades
496      Mid Spender         Use trade-in offers or financing
497              NaN                                      NaN
498     High Spender  Target with luxury add-ons and upgrades
499     High Spender  Target with luxury add-ons and upgrades

[494 rows x 2 columns]
