<a href="https://colab.research.google.com/github/SobiaNoorAI/Car-Purchasing-Prediction-Machine-Learning-Model/blob/main/Car_Purchasing_Prediction_ML_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Purchasing Prediction Model using Logistic Regression 🚀


# 1️⃣ Problem Statement

Predict whether a customer will make a purchase based on their demographics and online behavior.

#2️⃣ Import Required Libraries

In [42]:
# import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

#3️⃣ Load & Explore Dataset

In [2]:
# Load dataset (Assuming a CSV file)
car_df = pd.read_csv("https://raw.githubusercontent.com/SobiaNoorAI/Car-Purchasing-Prediction-Machine-Learning-Model/main/Data/car_data.csv")

# Display first few rows
print(car_df.head())

    User ID  Gender  Age  EstimatedSalary  Purchased
0  15624510    Male   19            19000          0
1  15810944    Male   35            20000          0
2  15668575  Female   26            43000          0
3  15603246  Female   27            57000          0
4  15804002    Male   19            76000          0


## Data cleaning

In [3]:
# Check for missing values
print(car_df.isnull().sum())

User ID            0
Gender             0
Age                0
EstimatedSalary    0
Purchased          0
dtype: int64


In [4]:
# Check for duplicate values
print(car_df.duplicated().sum())

0


## Explore Data

In [5]:
# info of data like data types
car_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   User ID          400 non-null    int64 
 1   Gender           400 non-null    object
 2   Age              400 non-null    int64 
 3   EstimatedSalary  400 non-null    int64 
 4   Purchased        400 non-null    int64 
dtypes: int64(4), object(1)
memory usage: 15.8+ KB


In [6]:
#rows and columns
car_df.shape

(400, 5)

#4️⃣ Preprocessing Data

## Extract Input, Output Variables

In [23]:
# Convert categorical variables (if any)
car_df = pd.get_dummies(car_df, drop_first=True)

# Define features and target variable
# Features or Input data or Independent Variable
x = car_df.drop("Purchased", axis=1)

# Target(1 = Purchased, 0 = Not Purchased) or Output data or Dependent Variable
y = car_df["Purchased"]

# Check if Purchased Column is removed
x.head()

Unnamed: 0,User ID,Age,EstimatedSalary,Gender_Male
0,15624510,19,19000,True
1,15810944,35,20000,True
2,15668575,26,43000,False
3,15603246,27,57000,False
4,15804002,19,76000,True


In [25]:
# Check Purchased Column values
y.head()

Unnamed: 0,Purchased
0,0
1,0
2,0
3,0
4,0


## Split Dataset

In [26]:
# Train-test split (70-30 ratio)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1)
x_train.shape, x_test.shape

((280, 4), (120, 4))

## Scaling

In [32]:
# Standardizing numerical features
scaler = StandardScaler()
x_train = scaler.fit_transform(x_train)
x_test = scaler.transform(x_test)
# To 5 Rows
x_test[:5]

array([[ 0.81998418, -0.1066347 , -1.0522259 ,  1.00716855],
       [ 0.02043836,  0.18418721, -0.23512496, -0.99288247],
       [ 1.38979052, -0.1066347 ,  1.42825909,  1.00716855],
       [-0.41050413,  0.18418721,  1.54498779,  1.00716855],
       [ 1.3664926 , -1.07604106,  1.42825909, -0.99288247]])

#5️⃣ Train Model with Logistic Regression

In [41]:
# Initialize and train the model
model = LogisticRegression()
model.fit(x_train, y_train)