### Import Required Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

### Create Sample Data

In [2]:
data = {
    'Product_ID': ['P001','P002','P003','P004','P005','P006','P007','P008','P009','P010'],
    'Product_Category': ['Electronics','Clothing','Electronics','Clothing','Books','Books','Electronics','Clothing','Books','Electronics'],
    'Customer_Location': ['Delhi','Mumbai','Bangalore','Delhi','Chennai','Mumbai','Bangalore','Delhi','Chennai','Mumbai'],
    'Price': [5000, 1200, 7000, 1500, 300, 450, 6000, 1300, 200, 8000],
    'Discount': [10, 5, 15, 20, 0, 10, 5, 15, 0, 20],
    'Purchase_Frequency': [3, 1, 5, 2, 4, 2, 6, 3, 1, 7],
    'Review_Rating': [4.5, 4.0, np.nan, 3.5, 5.0, 4.2, 4.8, 3.8, 4.5, np.nan],
    'Units_Sold': [100, 150, 80, 200, 500, 400, 90, 180, 300, 70]
}

df = pd.DataFrame(data)
df

Unnamed: 0,Product_ID,Product_Category,Customer_Location,Price,Discount,Purchase_Frequency,Review_Rating,Units_Sold
0,P001,Electronics,Delhi,5000,10,3,4.5,100
1,P002,Clothing,Mumbai,1200,5,1,4.0,150
2,P003,Electronics,Bangalore,7000,15,5,,80
3,P004,Clothing,Delhi,1500,20,2,3.5,200
4,P005,Books,Chennai,300,0,4,5.0,500
5,P006,Books,Mumbai,450,10,2,4.2,400
6,P007,Electronics,Bangalore,6000,5,6,4.8,90
7,P008,Clothing,Delhi,1300,15,3,3.8,180
8,P009,Books,Chennai,200,0,1,4.5,300
9,P010,Electronics,Mumbai,8000,20,7,,70


### Exploratory Data Analysis (EDA)

In [3]:
# Basic Info
print(df.info())

# Summary statistics
print(df.describe())

# Check missing values
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 8 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Product_ID          10 non-null     object 
 1   Product_Category    10 non-null     object 
 2   Customer_Location   10 non-null     object 
 3   Price               10 non-null     int64  
 4   Discount            10 non-null     int64  
 5   Purchase_Frequency  10 non-null     int64  
 6   Review_Rating       8 non-null      float64
 7   Units_Sold          10 non-null     int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 772.0+ bytes
None
             Price  Discount  Purchase_Frequency  Review_Rating  Units_Sold
count    10.000000  10.00000           10.000000       8.000000   10.000000
mean   3095.000000  10.00000            3.400000       4.287500  207.000000
std    3053.636266   7.45356            2.065591       0.508324  147.501412
min     200.000000   0.00000     

### Handle Missing Values

In [4]:
# Fill missing review ratings with mean
df['Review_Rating'].fillna(df['Review_Rating'].mean(), inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Review_Rating'].fillna(df['Review_Rating'].mean(), inplace=True)


### Encode Categorical Variables

In [5]:
le = LabelEncoder()
df['Product_Category_Encoded'] = le.fit_transform(df['Product_Category'])
df['Customer_Location_Encoded'] = le.fit_transform(df['Customer_Location'])

### Feature Selection using SelectKBest

In [6]:
X = df[['Product_Category_Encoded','Customer_Location_Encoded','Price','Discount','Purchase_Frequency','Review_Rating']]
y = df['Units_Sold']

# Select top 4 features
selector = SelectKBest(score_func=f_regression, k=4)
X_new = selector.fit_transform(X, y)
print(X_new.shape)

(10, 4)


### PCA for Dimensionality Reduction

In [7]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_new)

print(X_pca.shape)

(10, 2)


### Normalize Numerical Features

In [8]:
num_cols = ['Price','Discount','Purchase_Frequency','Review_Rating']
scaler = MinMaxScaler()
df[num_cols] = scaler.fit_transform(df[num_cols])

df.head()


Unnamed: 0,Product_ID,Product_Category,Customer_Location,Price,Discount,Purchase_Frequency,Review_Rating,Units_Sold,Product_Category_Encoded,Customer_Location_Encoded
0,P001,Electronics,Delhi,0.615385,0.5,0.333333,0.666667,100,2,2
1,P002,Clothing,Mumbai,0.128205,0.25,0.0,0.333333,150,1,3
2,P003,Electronics,Bangalore,0.871795,0.75,0.666667,0.525,80,2,0
3,P004,Clothing,Delhi,0.166667,1.0,0.166667,0.0,200,1,2
4,P005,Books,Chennai,0.012821,0.0,0.5,1.0,500,0,1
