In [1]:
import pandas as pd
# Read the raw CSV into a DataFrame and preview its first five rows.
# The path is relative, so the file is resolved against the current working directory.
csv_path = 'DataPreprocessing.csv'
data = pd.read_csv(csv_path)
print(data.head(n=5))

   Region   Age   Income Online Shopper
0   India  49.0  86400.0             No
1  Brazil  32.0  57600.0            Yes
2     USA  35.0  64800.0             No
3  Brazil  43.0  73200.0             No
4     USA  45.0      NaN            Yes


In [2]:
# Impute missing numeric values with each column's mean.
# The filled column is assigned back to `data` instead of calling
# `fillna(..., inplace=True)` on `data[column]`: that form mutates a Series
# selected from the frame, which pandas 2.x reports with a FutureWarning and
# which leaves `data` itself unchanged once Copy-on-Write is enabled.
# `Series.mean()` skips NaN, so each mean is taken over the observed values only.
for column in ('Age', 'Income'):
    data[column] = data[column].fillna(data[column].mean())

print(data)


   Region        Age        Income Online Shopper
0   India  49.000000  86400.000000             No
1  Brazil  32.000000  57600.000000            Yes
2     USA  35.000000  64800.000000             No
3  Brazil  43.000000  73200.000000             No
4     USA  45.000000  76533.333333            Yes
5   India  40.000000  69600.000000            Yes
6  Brazil  43.777778  62400.000000             No
7   India  53.000000  94800.000000            Yes
8     USA  55.000000  99600.000000             No
9   India  42.000000  80400.000000            Yes


In [3]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# Expand the categorical `Region` feature into one indicator column per region.
# `dtype=int` keeps the indicators as 0/1 integers on every pandas version;
# since pandas 2.0 the default indicator dtype is bool, which would print
# True/False here instead of 0/1.
data = pd.get_dummies(data, columns=['Region'], dtype=int)

# Encode the target as integers. LabelEncoder numbers the sorted class labels,
# so 'No' becomes 0 and 'Yes' becomes 1. The fitted encoder stays available as
# `label_encoder`; its `inverse_transform` maps the integers back to the labels.
label_encoder = LabelEncoder()
data['Online Shopper'] = label_encoder.fit_transform(data['Online Shopper'])

print(data)


         Age        Income  Online Shopper  Region_Brazil  Region_India  \
0  49.000000  86400.000000               0              0             1   
1  32.000000  57600.000000               1              1             0   
2  35.000000  64800.000000               0              0             0   
3  43.000000  73200.000000               0              1             0   
4  45.000000  76533.333333               1              0             0   
5  40.000000  69600.000000               1              0             1   
6  43.777778  62400.000000               0              1             0   
7  53.000000  94800.000000               1              0             1   
8  55.000000  99600.000000               0              0             0   
9  42.000000  80400.000000               1              0             1   

   Region_USA  
0           0  
1           0  
2           1  
3           0  
4           1  
5           0  
6           0  
7           0  
8           1  
9           0 

In [4]:
from sklearn.preprocessing import StandardScaler

# Standardise the two numeric features to zero mean and unit variance.
# NOTE(review): the scaler is fitted on every row, before the train/test split
# performed later in this notebook, so rows that end up in the test set also
# contribute to the mean and variance used here.
numeric_columns = ['Age', 'Income']
scaler = StandardScaler()
scaler.fit(data[numeric_columns])
data[numeric_columns] = scaler.transform(data[numeric_columns])

print(data)


        Age    Income  Online Shopper  Region_Brazil  Region_India  Region_USA
0  0.758874  0.749473               0              0             1           0
1 -1.711504 -1.438178               1              1             0           0
2 -1.275555 -0.891265               0              0             0           1
3 -0.113024 -0.253200               0              1             0           0
4  0.177609  0.000000               1              0             0           1
5 -0.548973 -0.526657               1              0             1           0
6  0.000000 -1.073570               0              1             0           0
7  1.340140  1.387538               1              0             1           0
8  1.630773  1.752147               0              0             0           1
9 -0.258340  0.293712               1              0             1           0


In [5]:
# Optional feature engineering, left disabled because this dataset does not need it.
# By this point `Age` and `Income` have been standardised by the scaling cell, so
# enabling any line below would build features from the scaled columns, not the raw ones.

# Interaction feature: product of the two numeric columns.
# data['Age_Income'] = data['Age'] * data['Income']

# Polynomial features: square of each numeric column.
# data['Age_squared'] = data['Age'] ** 2
# data['Income_squared'] = data['Income'] ** 2

# No statement above is active, so this prints the frame exactly as the scaling cell left it.
print(data)


        Age    Income  Online Shopper  Region_Brazil  Region_India  Region_USA
0  0.758874  0.749473               0              0             1           0
1 -1.711504 -1.438178               1              1             0           0
2 -1.275555 -0.891265               0              0             0           1
3 -0.113024 -0.253200               0              1             0           0
4  0.177609  0.000000               1              0             0           1
5 -0.548973 -0.526657               1              0             1           0
6  0.000000 -1.073570               0              1             0           0
7  1.340140  1.387538               1              0             1           0
8  1.630773  1.752147               0              0             0           1
9 -0.258340  0.293712               1              0             1           0


In [6]:
from sklearn.model_selection import train_test_split

# Separate the label column from the predictor columns.
target_column = 'Online Shopper'
y = data[target_column]
X = data.drop(columns=[target_column])

# Hold out 20% of the rows for testing; the fixed seed makes the shuffle repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

# Preview the first rows of the training features, then of the training labels.
print(X_train.head())
print(y_train.head())


        Age    Income  Region_Brazil  Region_India  Region_USA
5 -0.548973 -0.526657              0             1           0
0  0.758874  0.749473              0             1           0
7  1.340140  1.387538              0             1           0
2 -1.275555 -0.891265              0             0           1
9 -0.258340  0.293712              0             1           0
5    1
0    0
7    1
2    0
9    1
Name: Online Shopper, dtype: int32
