# Audiobook Customer Rentention Prediction
Predict whether a customer will buy another audiobook or not.
Dataset has been collected over 2 years.

# Import necessary libraries

In [42]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import tensorflow as tf

# Load and preview data

In [43]:
data = pd.read_csv("Audiobooks_data.csv", header=None)
df = data.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [44]:
df.shape

(14084, 12)

In [45]:
df.columns

Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], dtype='int64')

In [46]:
df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14084 entries, 0 to 14083
Data columns (total 12 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   0       14084 non-null  int64  
 1   1       14084 non-null  float64
 2   2       14084 non-null  int64  
 3   3       14084 non-null  float64
 4   4       14084 non-null  float64
 5   5       14084 non-null  int64  
 6   6       14084 non-null  float64
 7   7       14084 non-null  float64
 8   8       14084 non-null  float64
 9   9       14084 non-null  int64  
 10  10      14084 non-null  int64  
 11  11      14084 non-null  int64  
dtypes: float64(6), int64(6)
memory usage: 1.3 MB


In [48]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,118.586745,0.070222,61.935033,0.158833
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,268.731935,0.472157,88.207634,0.365533
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,64.8,0.0,105.0,0.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2116.8,30.0,464.0,1.0


In [49]:
df = df.drop(columns=df.columns[0])

In [50]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,11
0,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [51]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [52]:
X.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
0,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0
1,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182
2,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334
3,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183
4,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0


In [53]:
y.head()

0    1
1    1
2    1
3    1
4    1
Name: 11, dtype: int64

In [54]:
print(X.shape,y.shape)

(14084, 10) (14084,)


In [55]:
y.value_counts()

11
0    11847
1     2237
Name: count, dtype: int64

# Preprocessing

In [56]:
from sklearn.utils import shuffle

df = pd.concat([pd.DataFrame(X), pd.Series(y, name='target')], axis=1)

In [57]:
df.head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,target
0,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [58]:
# separate into majority and minority classes

df_majority = df[df['target'] == 0]
df_minority = df[df['target'] == 1]

In [59]:
# undersample the majority class
df_majority_undersampled = df_majority.sample(n=len(df_minority), random_state=42)


In [60]:
# combine undersampled majority and full minority class
df_balanced = pd.concat([df_majority_undersampled, df_minority])

In [61]:
# shuffle the balanced dataframe
df_balanced = shuffle(df_balanced, random_state=42)  # it is a df

In [62]:
# Split back into X and y
X_balanced = df_balanced.drop('target', axis=1).values # it is nd array
y_balanced = df_balanced['target'].values  # it is nd array

In [63]:
print(X_balanced.shape, y_balanced.shape)

(4474, 10) (4474,)


In [64]:
# Now dividing into train, validation and test 
# we will do two times - Train(80%) and Temp(20%)
X_train, X_temp, y_train, y_temp = train_test_split(X_balanced,y_balanced, test_size=0.2,
                                                    stratify=y_balanced, random_state=42)

In [65]:
# second split
X_val, X_test, y_val, y_test = train_test_split(X_temp,y_temp, test_size=0.5,
                                                stratify=y_temp, random_state=42)

In [66]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape, y_test.shape)

(3579, 10) (3579,) (447, 10) (447,) (448, 10) (448,)


In [67]:
# this is wrong coz we are doing scaling after split where as the correct way should be scaling then
# split as we did in second notebook
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

# Modeling

In [68]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input

In [69]:
model = Sequential([
    Input(shape=(10,)),
    Dense(50, activation='relu'),
    Dense(50, activation='relu'),
    Dense(1, activation='sigmoid')
])

In [70]:
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

In [71]:
from tensorflow.keras.callbacks import EarlyStopping
early_stop = EarlyStopping(patience=5, restore_best_weights=True)

In [72]:
model.fit(X_train_scaled, y_train,
          validation_data = (X_val_scaled,y_val),
          epochs = 50,  batch_size = 32, callbacks=[early_stop],
          verbose=2)

Epoch 1/50
112/112 - 5s - 43ms/step - accuracy: 0.6985 - loss: 0.5800 - val_accuracy: 0.7427 - val_loss: 0.4809
Epoch 2/50
112/112 - 1s - 6ms/step - accuracy: 0.7726 - loss: 0.4429 - val_accuracy: 0.7763 - val_loss: 0.4149
Epoch 3/50
112/112 - 2s - 14ms/step - accuracy: 0.7888 - loss: 0.4082 - val_accuracy: 0.8076 - val_loss: 0.3900
Epoch 4/50
112/112 - 1s - 11ms/step - accuracy: 0.7885 - loss: 0.3936 - val_accuracy: 0.8143 - val_loss: 0.3790
Epoch 5/50
112/112 - 1s - 12ms/step - accuracy: 0.8013 - loss: 0.3824 - val_accuracy: 0.8166 - val_loss: 0.3737
Epoch 6/50
112/112 - 1s - 9ms/step - accuracy: 0.8036 - loss: 0.3760 - val_accuracy: 0.8098 - val_loss: 0.3717
Epoch 7/50
112/112 - 1s - 11ms/step - accuracy: 0.8047 - loss: 0.3708 - val_accuracy: 0.8188 - val_loss: 0.3669
Epoch 8/50
112/112 - 1s - 8ms/step - accuracy: 0.8030 - loss: 0.3672 - val_accuracy: 0.8233 - val_loss: 0.3646
Epoch 9/50
112/112 - 1s - 7ms/step - accuracy: 0.8083 - loss: 0.3661 - val_accuracy: 0.8389 - val_loss: 0.3

<keras.src.callbacks.history.History at 0x1730e765410>

In [73]:
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy:.2%}")


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.5001 - loss: 1292.4283
Test accuracy: 50.00%
