In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

import os
import warnings

# Print the full path of every file found under the Kaggle input directory.
for folder, _subfolders, file_names in os.walk('/kaggle/input'):
    for file_name in file_names:
        print(os.path.join(folder, file_name))

# Silence all Python warnings for the remainder of the notebook.
warnings.filterwarnings('ignore')

#### **Data Description**
This dataset contains information about individuals and their response to a particular advertisement campaign on social media. The dataset includes the following columns:

- Age: Age of the individual in years.
- EstimatedSalary: Estimated salary of the individual.
- Purchased: Binary variable indicating whether the individual made a purchase (1) or not (0) after seeing the advertisement.


#### **Column Descriptions**
- Age:

Data Type: Integer Description: Represents the age of the individual in years.

- EstimatedSalary:

Data Type: Integer Description: Indicates the estimated salary of the individual.

- Purchased:

Data Type: Integer (0 or 1) Description: Indicates whether the individual made a purchase (1) or not (0) after seeing the advertisement.

In [None]:
# Load the social-advertisement CSV from the Kaggle input directory and
# preview its first five rows.
data = pd.read_csv('/kaggle/input/social-advertisement-dataset/social_ads.csv')
data.head(5)

In [None]:
# Summarize the frame: column names, dtypes and non-null counts.
data.info()

In [None]:
# Count missing values per column (all zeros means nothing is missing).
data.isnull().sum()

### There are no missing values

In [None]:
# Add a human-readable label for the binary target.
# Per the data description, Purchased == 1 means the individual made a
# purchase after seeing the advertisement and 0 means they did not.
data['Purchased_cate'] = data['Purchased'].replace({0: 'Not Purchased', 1: 'Purchased'})

In [None]:
# Summary statistics of the numeric columns, transposed so each column is a row.
data.describe().T

**ABOUT AGE**
1. Minimum age is 18 years old.
2. Maximum age is 60 years old.
3. Average age is 37 years old.


**ABOUT Estimated Salary**
1. Minimum salary is 15000.
2. Maximum salary is 150000.
3. Average salary is around 70000.

## Derive the generation of each customer
The minimum age is 18 and the maximum is 60, so the customers fall into Gen Z, Gen Y (Millennials), Gen X and Boomers II.

In [None]:
def classify_generation(age):
    """Map an age in years to a generation label.

    The bands are contiguous half-open intervals, so fractional ages such as
    27.5 are classified instead of falling between two integer bounds:

    - 12 <= age < 28 -> "Gen Z"
    - 28 <= age < 44 -> "Gen Y"
    - 44 <= age < 60 -> "Gen X"
    - 60 <= age < 70 -> "Boomers II"

    Args:
        age: Age in years (int or float).

    Returns:
        The generation label, or None when ``age`` lies outside every band
        (this includes NaN, for which every comparison is False).
    """
    if 12 <= age < 28:
        return "Gen Z"
    if 28 <= age < 44:
        return "Gen Y"
    if 44 <= age < 60:
        return "Gen X"
    if 60 <= age < 70:
        return "Boomers II"
    # Outside all known bands: make the "no label" result explicit.
    return None

# Derive a 'Generation' column by classifying each customer's age.
data['Generation'] = data['Age'].apply(classify_generation)

In [None]:
# Preview the first ten rows, now including the derived columns.
data.head(10)

In [None]:
# Re-check dtypes and non-null counts after adding the derived columns.
data.info()

In [None]:
# Per-generation summary: mean estimated salary, number of customers and
# number of purchases.
grouped_data = data.groupby('Generation').agg({
    'EstimatedSalary': 'mean',
    'Generation': 'count',
    # Sum the binary target directly (1 = purchased) so the purchase count
    # does not depend on how the text labels in 'Purchased_cate' are assigned.
    'Purchased': 'sum',
})

# Round 'EstimatedSalary' to two decimal places (optional)
grouped_data['EstimatedSalary'] = grouped_data['EstimatedSalary'].round(2)

# Display the results
print(grouped_data)

#### Estimated Salary of Customers for each Generation
- The average estimated salary of Boomers II is around 65286 (7 people)
- The average estimated salary of Gen X is around 72810 (105 people)
- The average estimated salary of Gen Z is around 57038 (78 people)
- The average estimated salary of Gen Y (Millennials) is around 73076 (210 people)

In [None]:
# One box per generation showing the spread of estimated salary.
d = sns.catplot(data=data, x='Generation', y='EstimatedSalary', kind='box')
# Resize and title the figure that backs the returned grid.
box_figure = d.figure
box_figure.set_size_inches(5, 5)
box_figure.suptitle('EstimatedSalary by Generation')
plt.show()

In [None]:
# Adapted from https://stackoverflow.com/a/71515035/2901002
def autopct_format(values):
    """Build an ``autopct`` callback showing percentage and absolute count.

    Args:
        values: Iterable of the wedge sizes passed to the pie chart.

    Returns:
        A function taking a wedge's percentage and returning a two-line
        label: the percentage with one decimal, then the count it
        corresponds to (rounded to the nearest integer) in parentheses.
    """
    def render_label(pct):
        # Recover the absolute count from the percentage of the grand total.
        grand_total = sum(values)
        count = int(round(pct * grand_total / 100.0))
        return '{:.1f}%\n({v:d})'.format(pct, v=count)

    return render_label
# Display names for the two purchase outcomes.
# NOTE(review): `pie_labels` is not referenced by the visible plotting code;
# the pie chart takes its wedge labels from `s.index` instead.
pie_labels = ['Purchased', 'Not Purchased']
# Number of rows for each value of the binary `Purchased` column.
s = data["Purchased"].value_counts()

In [None]:
# Left panel: distribution of estimated salary.
# Right panel: share of each purchase outcome.
fig, ax = plt.subplots(1, 2, figsize=(16, 5))
# Readable names for the 0/1 codes found in `s.index`; any other key is
# shown unchanged.
outcome_names = {0: 'Not Purchased', 1: 'Purchased'}
plots = [
    sns.histplot(data["EstimatedSalary"], kde=True, ax=ax[0]),
    # Draw on ax[1] explicitly rather than relying on pyplot's implicit
    # "current axes".
    ax[1].pie(
        s,
        labels=[outcome_names.get(code, code) for code in s.index],
        # Pull every wedge after the first slightly outwards; the list is
        # sized from `s` so it always matches the number of wedges.
        explode=[0 if position == 0 else 0.08 for position in range(len(s))],
        shadow=True,
        autopct=autopct_format(s),
    ),
]
ax[0].set_title('Distribution of Estimated Salary')
ax[1].set_title('Purchase Outcome')
plt.show()

Reference (matplotlib gallery, "Bar of pie" example): https://matplotlib.org/stable/gallery/pie_and_polar_charts/bar_of_pie.html#sphx-glr-gallery-pie-and-polar-charts-bar-of-pie-py


In [None]:
# Age (with KDE) and generation distributions, each split by purchase label.
fig, ax = plt.subplots(1, 2, figsize=(16, 5))
age_axis, generation_axis = ax
plots = [
    sns.histplot(data=data, x="Age", hue="Purchased_cate", kde=True, ax=age_axis),
    sns.histplot(data=data, x="Generation", hue="Purchased_cate", ax=generation_axis),
]

In [None]:
# Distribution of estimated salary with a KDE overlay, split by the raw 0/1
# `Purchased` flag.
sns.histplot(data=data, x="EstimatedSalary", hue="Purchased", kde=True)

In [None]:
# Pairwise correlations between the numeric columns.
corr_matrix = data.select_dtypes(include=[np.number]).corr()

plt.figure(figsize=(6, 5))
# Correlations lie on a signed, continuous [-1, 1] scale, so use a diverging
# colormap pinned to that range and centred on zero. A qualitative palette
# gives neighbouring values unrelated colors and hides sign and magnitude.
sns.heatmap(
    corr_matrix,
    annot=True,
    fmt='.2f',
    cmap='coolwarm',
    vmin=-1,
    vmax=1,
    center=0,
)
plt.title("Correlation between Numerical Features")
plt.show()

### Insight from EDA
- The box plot of estimated salary by generation shows no outliers.
- Gen Y (28-43 years old) makes the most purchases compared to other generations.
- Age (or generation) appears to be a significant factor in purchase behavior, judging by the correlation matrix.
- Older customers have a higher chance of not making a purchase.
- Estimated salary shows a weaker correlation with purchase behavior than age does.

# Clustering using K-Means

In [None]:
# Raw feature space used for clustering: one point per customer.
plt.scatter(data["Age"], data["EstimatedSalary"])
# Label the figure so it can be read on its own.
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.title('Age vs. Estimated Salary')
plt.show()

In [None]:
from sklearn.cluster import KMeans

# Two-column feature frame used for clustering.
input_df = data[['Age', 'EstimatedSalary']]
# NOTE(review): the next line rebinds `data` from the DataFrame to a plain
# list of (Age, EstimatedSalary) tuples. From here on `data` is no longer a
# DataFrame, and re-running this cell fails because a list cannot be indexed
# with column names.
data = list(zip(input_df['Age'], input_df['EstimatedSalary']))
# Collects one inertia value per candidate number of clusters.
inertias = list()

In [None]:
# Elbow method: fit K-Means for k = 1..10 and record each fit's inertia.
cluster_counts = range(1, 11)
# Start from an empty list inside this cell so that re-running it does not
# keep appending to earlier results and desynchronize x and y in the plot.
inertias = []
for i in cluster_counts:
    # Fixed seed so the curve is reproducible between runs.
    kmeans = KMeans(n_clusters=i, random_state=42)
    # Fit on the two-column feature frame, which holds the same values as the
    # (Age, EstimatedSalary) tuples in `data` but does not depend on that
    # name having been rebound to a list.
    kmeans.fit(input_df)
    inertias.append(kmeans.inertia_)

plt.plot(cluster_counts, inertias, marker='o')
plt.title('Elbow method')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

In [None]:
# Final model: four clusters with a fixed seed for reproducible assignments.
# NOTE(review): Age and EstimatedSalary are clustered unscaled. K-Means is
# distance-based, so the much larger salary values will dominate the result;
# consider standardizing both features (for fit and predict alike).
feature_columns = ['Age', 'EstimatedSalary']
model = KMeans(random_state=42, n_clusters=4)
model.fit(input_df[feature_columns])

In [None]:
# Get the cluster labels after fitting the model
cluster_labels = model.predict(input_df[['Age', 'EstimatedSalary']])

# Create a scatter plot using colors based on cluster labels
sns.scatterplot(
    x=input_df['Age'],
    y=input_df['EstimatedSalary'],
    hue=cluster_labels,
    palette='tab10',  # Choose a color palette (adjust as needed)
    alpha=0.7,  # Set transparency for better visibility (optional)
)

# Add labels and title
plt.xlabel('Age')
plt.ylabel('Estimated Salary')
plt.title('Age vs. Estimated Salary with KMeans Clusters (4 Clusters)')

# Show the plot
plt.show()