In [4]:
import pandas as pd

In [5]:
df = pd.read_csv(r"/content/MissingValueDataset.csv")

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22589 entries, 0 to 22588
Data columns (total 12 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   User_ID                     22589 non-null  int64  
 1   Product_ID                  22589 non-null  object 
 2   Gender                      22589 non-null  object 
 3   Age                         22589 non-null  object 
 4   Occupation                  22588 non-null  float64
 5   City_Category               22588 non-null  object 
 6   Stay_In_Current_City_Years  22588 non-null  object 
 7   Marital_Status              22588 non-null  float64
 8   Product_Category_1          22588 non-null  float64
 9   Product_Category_2          15374 non-null  float64
 10  Product_Category_3          6786 non-null   float64
 11  Purchase                    22588 non-null  float64
dtypes: float64(6), int64(1), object(5)
memory usage: 2.1+ MB


In [7]:
selected_df = df[['Gender', 'Age', 'Occupation', 'City_Category', 'Stay_In_Current_City_Years', 'Marital_Status', 'Product_Category_1', 'Purchase']]

In [8]:
selected_df[['Marital_Status', 'Product_Category_1', 'Purchase']].corr()

Unnamed: 0,Marital_Status,Product_Category_1,Purchase
Marital_Status,1.0,0.012825,0.009731
Product_Category_1,0.012825,1.0,-0.322222
Purchase,0.009731,-0.322222,1.0


In [9]:
selected_df.head()

Unnamed: 0,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Purchase
0,F,0-17,10.0,A,2,0.0,3.0,8370.0
1,F,0-17,10.0,A,2,0.0,1.0,15200.0
2,F,0-17,10.0,A,2,0.0,12.0,1422.0
3,F,0-17,10.0,A,2,0.0,12.0,1057.0
4,M,55+,16.0,C,4+,0.0,8.0,7969.0


- Gender <-> OHE
- Age <-> LE
- City_Category <-> OHE
- Stay_In_Current_City_Years <-> LE

In [10]:
selected_df['Purchase']

Unnamed: 0,Purchase
0,8370.0
1,15200.0
2,1422.0
3,1057.0
4,7969.0
...,...
22584,2768.0
22585,16677.0
22586,16123.0
22587,3896.0


In [11]:
selected_df.dropna(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  selected_df.dropna(inplace=True)


In [12]:
from sklearn.model_selection import train_test_split

X = selected_df.drop(columns=['Purchase'])
y = selected_df['Purchase']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [13]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

In [14]:
nominal_features = ['Gender', 'City_Category']
ordinal_features = ['Age', 'Stay_In_Current_City_Years']
numerical_columns = ['Occupation', 'Marital_Status', 'Product_Category_1']

In [31]:
preprocessor = ColumnTransformer(
    transformers=[
        ('ohe', OneHotEncoder(), nominal_features),
        ('le', OrdinalEncoder(), ordinal_features)    ],
)

In [32]:
preprocessor

In [33]:
transformed_df = preprocessor.fit_transform(X_train)

In [34]:
transformed_df

array([[1., 0., 0., ..., 0., 3., 3.],
       [0., 1., 0., ..., 1., 5., 0.],
       [0., 1., 0., ..., 1., 1., 4.],
       ...,
       [0., 1., 0., ..., 1., 1., 2.],
       [1., 0., 1., ..., 0., 2., 1.],
       [1., 0., 0., ..., 1., 6., 1.]])

In [35]:
transformed_df.shape

(18070, 7)

In [36]:
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
import numpy as np

In [39]:
model = keras.Sequential([
    keras.layers.Dense(10, input_shape=(7,), activation='relu'),
    keras.layers.Dense(9, activation='relu'),
    keras.layers.Dense(5, activation='relu'),
    keras.layers.Dense(3, activation='relu'),
    keras.layers.Dense(1, activation='relu')
])

model.compile(optimizer='adam',
              loss='mean_absolute_error',
              metrics=['mae'])

# Check for GPU availability and train on GPU if available
with tf.device('/GPU:0' if tf.config.list_physical_devices('GPU') else '/CPU:0'):
  model.fit(transformed_df, y_train, epochs=50)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 4ms/step - loss: 9110.1553 - mae: 9110.1553
Epoch 2/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 4430.3110 - mae: 4430.3110
Epoch 3/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - loss: 4178.0942 - mae: 4178.0942
Epoch 4/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 4110.4683 - mae: 4110.4683
Epoch 5/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3995.1345 - mae: 3995.1345
Epoch 6/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3952.8713 - mae: 3952.8713
Epoch 7/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3936.6094 - mae: 3936.6094
Epoch 8/50
[1m565/565[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 3874.0603 - mae: 3874.0603
Epoch 9/50
[1m565/565[0m [32m

In [42]:
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [26]:
model.predict(preprocessor.transform(X_test))[15]

[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step


array([7865.919], dtype=float32)

In [27]:
y_test.iloc[15]

np.float64(4037.0)

In [28]:
predicted = model.predict(preprocessor.transform(X_test))

[1m142/142[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [30]:
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error, r2_score

mae = mean_absolute_error(y_test, predicted)
mape = mean_absolute_percentage_error(y_test, predicted)
# Accuracy is not a standard metric for regression. R-squared is more appropriate.
# I will calculate R-squared instead of accuracy.
r2 = r2_score(y_test, predicted)


print(f'Mean Absolute Error (MAE): {mae}')
print(f'Mean Absolute Percentage Error (MAPE): {mape}')
print(f'Accuracy: {100 - mape * 100}')

Mean Absolute Error (MAE): 2975.1557048714863
Mean Absolute Percentage Error (MAPE): 0.6716901762872234
Accuracy: 32.830982371277656
