In [1]:
import pandas as pd
import numpy as np

In [2]:
shopping = pd.read_csv('/AnalysoinninPerusteetPython/shopping_behavior_updated.csv')

In [3]:
shopping.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases
0,1,55,Male,Blouse,Clothing,53,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14,Venmo,Fortnightly
1,2,19,Male,Sweater,Clothing,64,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2,Cash,Fortnightly
2,3,50,Male,Jeans,Clothing,73,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23,Credit Card,Weekly
3,4,21,Male,Sandals,Footwear,90,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49,PayPal,Weekly
4,5,45,Male,Blouse,Clothing,49,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,31,PayPal,Annually


In [4]:
shopping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3900 entries, 0 to 3899
Data columns (total 18 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Customer ID             3900 non-null   int64  
 1   Age                     3900 non-null   int64  
 2   Gender                  3900 non-null   object 
 3   Item Purchased          3900 non-null   object 
 4   Category                3900 non-null   object 
 5   Purchase Amount (USD)   3900 non-null   int64  
 6   Location                3900 non-null   object 
 7   Size                    3900 non-null   object 
 8   Color                   3900 non-null   object 
 9   Season                  3900 non-null   object 
 10  Review Rating           3900 non-null   float64
 11  Subscription Status     3900 non-null   object 
 12  Shipping Type           3900 non-null   object 
 13  Discount Applied        3900 non-null   object 
 14  Promo Code Used         3900 non-null   

### 1. Add missing values to the orginal dataset

In [5]:
# A seed for reproducibility
np.random.seed(42)

# Introduce Missing Values
missing_percentage = 0.05  # The percentage of missing values

missing_values_columns =['Age','Purchase Amount (USD)', 'Previous Purchases','Gender' ,'Location']
# df.select_dtypes(include=np.number).columns

for col in missing_values_columns:
    # Randomly select indices to introduce missing values
    missing_indices = np.random.choice(shopping.index, size=int(len(shopping) * missing_percentage), replace=False)
    shopping.loc[missing_indices, col] = np.nan

In [6]:
shopping.isna().sum()

Customer ID                 0
Age                       195
Gender                    195
Item Purchased              0
Category                    0
Purchase Amount (USD)     195
Location                  195
Size                        0
Color                       0
Season                      0
Review Rating               0
Subscription Status         0
Shipping Type               0
Discount Applied            0
Promo Code Used             0
Previous Purchases        195
Payment Method              0
Frequency of Purchases      0
dtype: int64

### 2. Add the Duplicated rows

In [7]:
# Duplicate 1% of the Rows
duplicate_percentage = 0.01  # The percentage of rows to be duplicated

# Randomly select 1% of the rows
duplicate_rows = shopping.sample(frac=duplicate_percentage, replace=True, random_state=42)

# Concatenate the original DataFrame with the duplicated rows
shopping = pd.concat([shopping, duplicate_rows], ignore_index=True)

In [8]:
shopping.duplicated().sum()

39

### 3. Add one Duplicated column

In [9]:
column_to_duplicate = 'Size' # Replace with the column name want to duplicate

# Duplicate the specified column
duplicated_column = shopping[column_to_duplicate].copy()

# Add the duplicated column to the original DataFrame
shopping['Size1'] = duplicated_column

In [10]:
shopping.shape

(3939, 19)

### 4. Add the outliers

In [11]:
shopping[['Purchase Amount (USD)', 'Previous Purchases']].plot.box()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

In [None]:
numerical_columns_outlier =['Purchase Amount (USD)', 'Previous Purchases']

# Introduce Outliers
outlier_percentage = 0.03  # The percentage of values to be replaced with outliers

for col in numerical_columns_outlier:
    # Randomly select indices to introduce outliers
    outlier_indices = np.random.choice(shopping.index, size=int(len(shopping) * outlier_percentage), replace=False)
    # Replace selected values with outliers (e.g., multiply by a factor)
    shopping.loc[outlier_indices, col] *= 5  # Adjust the factor as needed

In [None]:
shopping[['Purchase Amount (USD)', 'Previous Purchases']].plot.box()

ImportError: matplotlib is required for plotting when the default backend "matplotlib" is selected.

## 5. Add Date Column

In [None]:
start_date = '2020-01-01'
end_date = '2022-12-31'
date_range = pd.date_range(start=start_date, end=end_date, periods=len(shopping))
shopping['Date'] = date_range

In [None]:
# Verify the DataFrame with missing values, outliers, and the date column
shopping.head()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1,Date
0,1,55.0,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14.0,Venmo,Fortnightly,L,2020-01-01 00:00:00.000000000
1,2,19.0,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2.0,Cash,Fortnightly,L,2020-01-01 06:40:24.377856780
2,3,50.0,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23.0,Credit Card,Weekly,S,2020-01-01 13:20:48.755713560
3,4,21.0,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49.0,PayPal,Weekly,M,2020-01-01 20:01:13.133570340
4,5,45.0,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,,PayPal,Annually,M,2020-01-02 02:41:37.511427120


In [None]:
shopping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3939 entries, 0 to 3938
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Customer ID             3939 non-null   int64         
 1   Age                     3744 non-null   float64       
 2   Gender                  3742 non-null   object        
 3   Item Purchased          3939 non-null   object        
 4   Category                3939 non-null   object        
 5   Purchase Amount (USD)   3744 non-null   float64       
 6   Location                3741 non-null   object        
 7   Size                    3939 non-null   object        
 8   Color                   3939 non-null   object        
 9   Season                  3939 non-null   object        
 10  Review Rating           3939 non-null   float64       
 11  Subscription Status     3939 non-null   object        
 12  Shipping Type           3939 non-null   object  

In [None]:
# shopping.to_csv("Shopping_V1.csv") //This is the modified dataset version with missing, added and edited data. You can save to csv file and use it if needed

In [None]:
shopping.columns

Index(['Customer ID', 'Age', 'Gender', 'Item Purchased', 'Category',
       'Purchase Amount (USD)', 'Location', 'Size', 'Color', 'Season',
       'Review Rating', 'Subscription Status', 'Shipping Type',
       'Discount Applied', 'Promo Code Used', 'Previous Purchases',
       'Payment Method', 'Frequency of Purchases', 'Size1', 'Date'],
      dtype='object')

In [None]:
shopping.index

RangeIndex(start=0, stop=3939, step=1)

In [None]:
shopping.shape

(3939, 20)

In [None]:
shopping.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3939 entries, 0 to 3938
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype         
---  ------                  --------------  -----         
 0   Customer ID             3939 non-null   int64         
 1   Age                     3744 non-null   float64       
 2   Gender                  3742 non-null   object        
 3   Item Purchased          3939 non-null   object        
 4   Category                3939 non-null   object        
 5   Purchase Amount (USD)   3744 non-null   float64       
 6   Location                3741 non-null   object        
 7   Size                    3939 non-null   object        
 8   Color                   3939 non-null   object        
 9   Season                  3939 non-null   object        
 10  Review Rating           3939 non-null   float64       
 11  Subscription Status     3939 non-null   object        
 12  Shipping Type           3939 non-null   object  

In [None]:
shopping['Age']

0       55.0
1       19.0
2       50.0
3       21.0
4       45.0
        ... 
3934    31.0
3935    18.0
3936    51.0
3937    41.0
3938    27.0
Name: Age, Length: 3939, dtype: float64

In [None]:
shopping.loc[[2,4,100]]

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1,Date
2,3,50.0,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23.0,Credit Card,Weekly,S,2020-01-01 13:20:48.755713560
4,5,45.0,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,,PayPal,Annually,M,2020-01-02 02:41:37.511427120
100,101,62.0,Male,Sunglasses,Accessories,98.0,,M,Maroon,Fall,2.7,Yes,Express,Yes,Yes,31.0,Cash,Fortnightly,M,2020-01-28 19:20:37.785678009


In [None]:
shopping.shape

(3939, 20)

In [None]:
shopping.dtypes

Customer ID                        int64
Age                              float64
Gender                            object
Item Purchased                    object
Category                          object
Purchase Amount (USD)            float64
Location                          object
Size                              object
Color                             object
Season                            object
Review Rating                    float64
Subscription Status               object
Shipping Type                     object
Discount Applied                  object
Promo Code Used                   object
Previous Purchases               float64
Payment Method                    object
Frequency of Purchases            object
Size1                             object
Date                      datetime64[ns]
dtype: object

In [None]:
missing_count = shopping.isnull().sum()
print(missing_count)

Customer ID                 0
Age                       195
Gender                    197
Item Purchased              0
Category                    0
Purchase Amount (USD)     195
Location                  198
Size                        0
Color                       0
Season                      0
Review Rating               0
Subscription Status         0
Shipping Type               0
Discount Applied            0
Promo Code Used             0
Previous Purchases        201
Payment Method              0
Frequency of Purchases      0
Size1                       0
Date                        0
dtype: int64


In [None]:
#Prosentti puuttuvaa dataa
total_data=np.product(shopping.shape)
total_missing=missing_count.sum()
per_of_missing_data=(total_missing/total_data)* 100
print(per_of_missing_data)


1.2515866971312515


In [None]:
shopping['Gender'].fillna("Missing_Gender", inplace =True)

In [None]:
shopping['Gender']

0                 Male
1                 Male
2                 Male
3                 Male
4                 Male
             ...      
3934    Missing_Gender
3935              Male
3936              Male
3937            Female
3938            Female
Name: Gender, Length: 3939, dtype: object

In [None]:
shopping['Age'] = shopping['Age'].astype(str)
print(shopping.dtypes)

Customer ID                        int64
Age                               object
Gender                            object
Item Purchased                    object
Category                          object
Purchase Amount (USD)            float64
Location                          object
Size                              object
Color                             object
Season                            object
Review Rating                    float64
Subscription Status               object
Shipping Type                     object
Discount Applied                  object
Promo Code Used                   object
Previous Purchases               float64
Payment Method                    object
Frequency of Purchases            object
Size1                             object
Date                      datetime64[ns]
dtype: object


In [None]:
shopping['Age'].fillna("Missing_Age", inplace =True)
shopping['Age']

0       55.0
1       19.0
2       50.0
3       21.0
4       45.0
        ... 
3934    31.0
3935    18.0
3936    51.0
3937    41.0
3938    27.0
Name: Age, Length: 3939, dtype: object

In [None]:
shopping['Purchase Amount (USD)'] = shopping['Purchase Amount (USD)'].astype(str)
print(shopping.dtypes)

Customer ID                        int64
Age                               object
Gender                            object
Item Purchased                    object
Category                          object
Purchase Amount (USD)             object
Location                          object
Size                              object
Color                             object
Season                            object
Review Rating                    float64
Subscription Status               object
Shipping Type                     object
Discount Applied                  object
Promo Code Used                   object
Previous Purchases               float64
Payment Method                    object
Frequency of Purchases            object
Size1                             object
Date                      datetime64[ns]
dtype: object


In [None]:
shopping['Purchase Amount (USD)'].fillna("Missing_Purchase", inplace =True)
shopping['Purchase Amount (USD)']

0       53.0
1       64.0
2       73.0
3       90.0
4       49.0
        ... 
3934    62.0
3935    88.0
3936    74.0
3937    41.0
3938    73.0
Name: Purchase Amount (USD), Length: 3939, dtype: object

In [None]:
shopping['Location'].fillna("Missing_Location", inplace =True)
shopping['Location']

0             Kentucky
1                Maine
2        Massachusetts
3         Rhode Island
4               Oregon
             ...      
3934    North Carolina
3935      Pennsylvania
3936          Oklahoma
3937              Iowa
3938              Utah
Name: Location, Length: 3939, dtype: object

In [None]:
shopping['Previous Purchases'] = shopping['Previous Purchases'].astype(str)
print(shopping.dtypes)

Customer ID                        int64
Age                               object
Gender                            object
Item Purchased                    object
Category                          object
Purchase Amount (USD)             object
Location                          object
Size                              object
Color                             object
Season                            object
Review Rating                    float64
Subscription Status               object
Shipping Type                     object
Discount Applied                  object
Promo Code Used                   object
Previous Purchases                object
Payment Method                    object
Frequency of Purchases            object
Size1                             object
Date                      datetime64[ns]
dtype: object


In [None]:
shopping['Previous Purchases'].fillna("Missing_Previous_Purchases", inplace =True)
shopping['Previous Purchases']

0       14.0
1        2.0
2       23.0
3       49.0
4        nan
        ... 
3934    22.0
3935    38.0
3936     nan
3937    43.0
3938     nan
Name: Previous Purchases, Length: 3939, dtype: object

In [None]:
print(shopping['Previous Purchases'].isnull().sum())

0


In [None]:
duplicate_rows = shopping.duplicated(subset=['Customer ID'])
print(duplicate_rows)

0       False
1       False
2       False
3       False
4       False
        ...  
3934     True
3935     True
3936     True
3937     True
3938     True
Length: 3939, dtype: bool


In [None]:
shopping.drop_duplicates(subset=['Customer ID'], keep='first')

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1
0,1,55.0,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14.0,Venmo,Fortnightly,L
1,2,19.0,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2.0,Cash,Fortnightly,L
2,3,50.0,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23.0,Credit Card,Weekly,S
3,4,21.0,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49.0,PayPal,Weekly,M
4,5,45.0,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,,PayPal,Annually,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3895,3896,40.0,Female,Hoodie,Clothing,28.0,Virginia,L,Turquoise,Summer,4.2,No,2-Day Shipping,No,No,32.0,Venmo,Weekly,L
3896,3897,52.0,Female,Backpack,Accessories,49.0,Iowa,L,White,Spring,4.5,No,Store Pickup,No,No,41.0,Bank Transfer,Bi-Weekly,L
3897,3898,46.0,Female,Belt,Accessories,33.0,New Jersey,L,Green,Spring,2.9,No,Standard,No,No,24.0,Venmo,Quarterly,L
3898,3899,44.0,Female,Shoes,Footwear,77.0,Minnesota,S,Brown,Summer,3.8,No,Express,No,No,24.0,Venmo,Weekly,S


In [None]:
shopping.notnull()

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1
0,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
1,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
2,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
4,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3934,True,True,False,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3935,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True
3936,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,False,True,True,True
3937,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True,True


In [None]:
shopping.duplicated().sum()

39

In [None]:
shopping['Age']

0       55.0
1       19.0
2       50.0
3       21.0
4       45.0
        ... 
3934    31.0
3935    18.0
3936    51.0
3937    41.0
3938    27.0
Name: Age, Length: 3939, dtype: float64

In [None]:
shopping.loc[(shopping['Age'] == 70.0)]

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1
39,40,70.0,Male,Pants,Clothing,60.0,,S,Turquoise,Summer,4.2,Yes,Express,Yes,Yes,18.0,Credit Card,Monthly,S
108,109,70.0,Male,Socks,Clothing,79.0,Montana,L,Purple,Spring,3.4,Yes,Next Day Air,Yes,Yes,32.0,Bank Transfer,Bi-Weekly,L
114,115,70.0,Male,Coat,Outerwear,95.0,Massachusetts,S,Blue,Summer,3.7,Yes,Store Pickup,Yes,Yes,9.0,Cash,Weekly,S
203,204,70.0,Male,Scarf,Accessories,38.0,Indiana,S,Yellow,Fall,3.2,Yes,Next Day Air,Yes,Yes,18.0,Bank Transfer,Weekly,S
294,295,70.0,Male,Skirt,Clothing,20.0,New Jersey,M,Gold,Spring,4.6,Yes,Store Pickup,Yes,Yes,19.0,Credit Card,Every 3 Months,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3571,3572,70.0,Female,Skirt,Clothing,135.0,North Dakota,M,White,Summer,3.2,No,Standard,No,No,15.0,Bank Transfer,Every 3 Months,M
3675,3676,70.0,Female,Hat,Accessories,80.0,Virginia,M,White,Winter,4.5,No,Express,No,No,3.0,Bank Transfer,Monthly,M
3802,3803,70.0,Female,Scarf,Accessories,72.0,Illinois,S,Turquoise,Spring,3.2,No,2-Day Shipping,No,No,1.0,Bank Transfer,Monthly,S
3818,3819,70.0,Female,Sneakers,Footwear,41.0,Oregon,XL,Indigo,Winter,3.8,No,Free Shipping,No,No,42.0,Cash,Monthly,XL


In [12]:
ages = [18, 19, 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]
# to divide them into bins of 20 - 25, 26 - 35 , 36- 45 & > 46
# we will use cut function
bins = [18,29,30,45,46,70 ]
age_bins = pd.cut(ages,bins)
# to chek the categories
age_bins.categories
#lets count the values for each bin
pd.value_counts(age_bins)



  pd.value_counts(age_bins)


(46, 70]    24
(30, 45]    15
(18, 29]    11
(29, 30]     1
(45, 46]     1
Name: count, dtype: int64

In [13]:
gnames = ['Young', 'YoundAdult', 'Adult','MiddleAged','Elderly']
bins_labels = pd.cut(ages, bins, labels=gnames)
bins_labels.categories

Index(['Young', 'YoundAdult', 'Adult', 'MiddleAged', 'Elderly'], dtype='object')

In [14]:
shopping.loc[(shopping['Age'] >= 18.0)]

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1
0,1,55.0,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14.0,Venmo,Fortnightly,L
1,2,19.0,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2.0,Cash,Fortnightly,L
2,3,50.0,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23.0,Credit Card,Weekly,S
3,4,21.0,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49.0,PayPal,Weekly,M
4,5,45.0,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,,PayPal,Annually,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3934,22,31.0,,Pants,Clothing,62.0,North Carolina,M,Charcoal,Winter,4.1,Yes,Store Pickup,Yes,Yes,22.0,Debit Card,Quarterly,M
3935,2301,18.0,Male,Coat,Outerwear,88.0,Pennsylvania,XL,Lavender,Summer,4.4,No,2-Day Shipping,No,No,38.0,Credit Card,Monthly,XL
3936,748,51.0,Male,Shorts,Clothing,74.0,Oklahoma,L,Gold,Spring,4.4,Yes,Next Day Air,Yes,Yes,,Bank Transfer,Weekly,L
3937,2905,41.0,Female,Sandals,Footwear,41.0,Iowa,M,Gray,Spring,4.0,No,2-Day Shipping,No,No,43.0,Venmo,Monthly,M


In [None]:
shopping

Unnamed: 0,Customer ID,Age,Gender,Item Purchased,Category,Purchase Amount (USD),Location,Size,Color,Season,Review Rating,Subscription Status,Shipping Type,Discount Applied,Promo Code Used,Previous Purchases,Payment Method,Frequency of Purchases,Size1
0,1,55.0,Male,Blouse,Clothing,53.0,Kentucky,L,Gray,Winter,3.1,Yes,Express,Yes,Yes,14.0,Venmo,Fortnightly,L
1,2,19.0,Male,Sweater,Clothing,64.0,Maine,L,Maroon,Winter,3.1,Yes,Express,Yes,Yes,2.0,Cash,Fortnightly,L
2,3,50.0,Male,Jeans,Clothing,73.0,Massachusetts,S,Maroon,Spring,3.1,Yes,Free Shipping,Yes,Yes,23.0,Credit Card,Weekly,S
3,4,21.0,Male,Sandals,Footwear,90.0,Rhode Island,M,Maroon,Spring,3.5,Yes,Next Day Air,Yes,Yes,49.0,PayPal,Weekly,M
4,5,45.0,Male,Blouse,Clothing,49.0,Oregon,M,Turquoise,Spring,2.7,Yes,Free Shipping,Yes,Yes,,PayPal,Annually,M
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3934,22,31.0,,Pants,Clothing,62.0,North Carolina,M,Charcoal,Winter,4.1,Yes,Store Pickup,Yes,Yes,22.0,Debit Card,Quarterly,M
3935,2301,18.0,Male,Coat,Outerwear,88.0,Pennsylvania,XL,Lavender,Summer,4.4,No,2-Day Shipping,No,No,38.0,Credit Card,Monthly,XL
3936,748,51.0,Male,Shorts,Clothing,74.0,Oklahoma,L,Gold,Spring,4.4,Yes,Next Day Air,Yes,Yes,,Bank Transfer,Weekly,L
3937,2905,41.0,Female,Sandals,Footwear,41.0,Iowa,M,Gray,Spring,4.0,No,2-Day Shipping,No,No,43.0,Venmo,Monthly,M


In [27]:
shopping = {'Age':[18, 19, 20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68,69,70]}
df_ages = pd.DataFrame(shopping)
# to divide them into bins of 20 - 25, 26 - 35 , 36- 45 & > 46, define the bin edges
bins = [18,30,40,50,60,70]

# lets create lables for each bin
labels = ['YoungAdult', 'Adult', 'MiddleAged', 'Elderly', 'Retired']

# we will use cut function to bin
df_ages['AgeGroup'] = pd.cut(df_ages['Age'], bins=bins, labels=labels, right=False)

# Get the list of categories
categories_list = df_ages['AgeGroup'].cat.categories.tolist()
print(categories_list)

# Count the values in each bin
bin_counts = df_ages['AgeGroup'].value_counts()
print(bin_counts)

['YoungAdult', 'Adult', 'MiddleAged', 'Elderly', 'Retired']
AgeGroup
YoungAdult    12
Adult         10
MiddleAged    10
Elderly       10
Retired       10
Name: count, dtype: int64
