In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the product-offering table from the CSV file into a DataFrame.
dataset1 = pd.read_csv(filepath_or_buffer='Product_Offering_Data.csv')

In [3]:
# Report how many missing values each column of dataset1 contains.
print(dataset1.isna().sum())

Product_ID              0
Product_Name            0
Product_Type            0
Risk_Level              0
Target_Age_Group       15
Target_Income_Group     0
dtype: int64


In [4]:
# Inspect Target_Age_Group: first its distinct values, then the count of each value.
age_group_values = dataset1['Target_Age_Group']
print(age_group_values.unique())
print(age_group_values.value_counts())

[nan]
Series([], Name: count, dtype: int64)


In [5]:
# Remove Target_Age_Group from dataset1 when it is present; otherwise just
# report that there is nothing to remove (so re-running the cell does not fail).
if 'Target_Age_Group' not in dataset1.columns:
    print("Column 'Target_Age_Group' does not exist in the DataFrame.")
else:
    dataset1.drop('Target_Age_Group', axis=1, inplace=True)

# Preview the first five rows to confirm the resulting structure.
print(dataset1.head(5))

   Product_ID                   Product_Name     Product_Type Risk_Level  \
0           1           Platinum Credit Card      Credit Card     Medium   
1           2           Gold Savings Account  Savings Account        Low   
2           3  High-Yield Investment Account       Investment       High   
3           4                  Mortgage Loan             Loan     Medium   
4           5                      Auto Loan             Loan     Medium   

  Target_Income_Group  
0              Medium  
1                 Low  
2                High  
3                High  
4              Medium  


The column `Target_Age_Group` has been dropped as it contained only missing values. 

In [6]:
# Discard fully duplicated rows in place, keeping the first occurrence of each,
# then display the de-duplicated frame.
dataset1.drop_duplicates(keep='first', inplace=True)
dataset1

Unnamed: 0,Product_ID,Product_Name,Product_Type,Risk_Level,Target_Income_Group
0,1,Platinum Credit Card,Credit Card,Medium,Medium
1,2,Gold Savings Account,Savings Account,Low,Low
2,3,High-Yield Investment Account,Investment,High,High
3,4,Mortgage Loan,Loan,Medium,High
4,5,Auto Loan,Loan,Medium,Medium
5,6,Personal Loan,Loan,Medium,Low
6,7,Youth Savings Account,Savings Account,Low,Low
7,8,Retirement Investment Fund,Investment,High,High
8,9,Business Loan,Loan,Medium,Medium
9,10,Travel Credit Card,Credit Card,Medium,Medium


Duplicate rows have been removed from the dataset.

In [7]:
# Label-encode each categorical column with its own LabelEncoder.
# Refitting one shared encoder on every column keeps only the last column's
# category-to-code mapping; storing one fitted encoder per column in
# `label_encoders` lets any column be decoded later with
# label_encoders[column].inverse_transform(...).
# NOTE(review): LabelEncoder numbers the labels in sorted order, so the codes
# for Risk_Level / Target_Income_Group do not follow Low < Medium < High
# (High=0, Low=1, Medium=2) — confirm this is acceptable for downstream use.
categorical_columns = ['Product_Name', 'Product_Type', 'Risk_Level', 'Target_Income_Group']
label_encoders = {}

for column in categorical_columns:
    # A fresh encoder per column; after the loop `label_encoder` is still bound
    # to the encoder fitted on the last column.
    label_encoder = LabelEncoder()
    dataset1[column] = label_encoder.fit_transform(dataset1[column])
    label_encoders[column] = label_encoder

dataset1

Unnamed: 0,Product_ID,Product_Name,Product_Type,Risk_Level,Target_Income_Group
0,1,6,0,2,2
1,2,2,3,1,1
2,3,3,1,0,0
3,4,4,2,2,0
4,5,0,2,2,2
5,6,5,2,2,1
6,7,9,3,1,1
7,8,7,1,0,0
8,9,1,2,2,2
9,10,8,0,2,2


Label Encoding has been applied to the categorical columns in the dataset.

In [8]:
# Derive two additional columns from the existing ones.
# Product_Popularity_Score: element-wise sum of Risk_Level and Target_Income_Group.
dataset1['Product_Popularity_Score'] = dataset1['Risk_Level'].add(dataset1['Target_Income_Group'])

# Product_Category: Product_Type and Risk_Level rendered as text and joined with '_'.
dataset1['Product_Category'] = (
    dataset1['Product_Type'].astype(str).str.cat(dataset1['Risk_Level'].astype(str), sep='_')
)

# Show the frame including the new columns.
dataset1

Unnamed: 0,Product_ID,Product_Name,Product_Type,Risk_Level,Target_Income_Group,Product_Popularity_Score,Product_Category
0,1,6,0,2,2,4,0_2
1,2,2,3,1,1,2,3_1
2,3,3,1,0,0,0,1_0
3,4,4,2,2,0,2,2_2
4,5,0,2,2,2,4,2_2
5,6,5,2,2,1,3,2_2
6,7,9,3,1,1,2,3_1
7,8,7,1,0,0,0,1_0
8,9,1,2,2,2,4,2_2
9,10,8,0,2,2,4,0_2


Two new features have been created:
1. `Product_Popularity_Score` - the sum of the label-encoded `Risk_Level` and `Target_Income_Group` values.
2. `Product_Category` - the label-encoded `Product_Type` and `Risk_Level` codes joined with an underscore (for example, `2_2`).

In [9]:
# Separate the target column from the remaining columns.
# Product_Popularity_Score is assumed to be the target (y); every other column forms X.
# NOTE(review): X still contains Risk_Level and Target_Income_Group, whose sum
# defines y, and Product_Category holds strings — verify before fitting a model.
X = dataset1.drop('Product_Popularity_Score', axis=1)
y = dataset1['Product_Popularity_Score']

X, y

(   Product_ID  Product_Name  Product_Type  Risk_Level  Target_Income_Group  \
 0           1             6             0           2                    2   
 1           2             2             3           1                    1   
 2           3             3             1           0                    0   
 3           4             4             2           2                    0   
 4           5             0             2           2                    2   
 5           6             5             2           2                    1   
 6           7             9             3           1                    1   
 7           8             7             1           0                    0   
 8           9             1             2           2                    2   
 9          10             8             0           2                    2   
 
   Product_Category  
 0              0_2  
 1              3_1  
 2              1_0  
 3              2_2  
 4              2_2

The dataset has been successfully split into features (X) and target (y).

1. **Data Cleaning**:
   - The column `Target_Age_Group` was dropped because it contained only missing values.
   - Duplicate rows were removed to ensure data quality.

2. **Feature Engineering**:
   - Label encoding was applied to categorical columns (`Product_Name`, `Product_Type`, `Risk_Level`, and `Target_Income_Group`) to convert them into numerical format.
   - Two new features were created:
     - `Product_Popularity_Score`: The sum of the label-encoded `Risk_Level` and `Target_Income_Group` values.
     - `Product_Category`: The label-encoded `Product_Type` and `Risk_Level` codes joined with an underscore.

3. **Dataset Splitting**:
   - The dataset was split into features (`X`) and target (`y`), with `Product_Popularity_Score` as the target variable.

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=9f7438eb-a69f-4bf2-a212-4a135e641139' target="_blank">
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>