In [264]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import warnings
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

In [265]:
data = pd.read_csv("Audiobooks_data.csv", header=None)
df = data.copy()
df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,873,2160.0,2160,10.13,10.13,0,8.91,0.0,0.0,0,0,1
1,611,1404.0,2808,6.66,13.33,1,6.5,0.0,0.0,0,182,1
2,705,324.0,324,10.13,10.13,1,9.0,0.0,0.0,1,334,1
3,391,1620.0,1620,15.31,15.31,0,9.0,0.0,0.0,0,183,1
4,819,432.0,1296,7.11,21.33,1,9.0,0.0,0.0,0,0,1


In [266]:
df.isna().sum()

0     0
1     0
2     0
3     0
4     0
5     0
6     0
7     0
8     0
9     0
10    0
11    0
dtype: int64

In [267]:
df.dtypes

0       int64
1     float64
2       int64
3     float64
4     float64
5       int64
6     float64
7     float64
8     float64
9       int64
10      int64
11      int64
dtype: object

In [268]:
df.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
count,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0,14084.0
mean,16772.491551,1591.281685,1678.608634,7.103791,7.543805,0.16075,8.909795,0.125659,118.586745,0.070222,61.935033,0.158833
std,9691.807248,504.340663,654.838599,4.931673,5.560129,0.367313,0.643406,0.241206,268.731935,0.472157,88.207634,0.365533
min,2.0,216.0,216.0,3.86,3.86,0.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,8368.0,1188.0,1188.0,5.33,5.33,0.0,8.91,0.0,0.0,0.0,0.0,0.0
50%,16711.5,1620.0,1620.0,5.95,6.07,0.0,8.91,0.0,0.0,0.0,11.0,0.0
75%,25187.25,2160.0,2160.0,8.0,8.0,0.0,8.91,0.13,64.8,0.0,105.0,0.0
max,33683.0,2160.0,7020.0,130.94,130.94,1.0,10.0,1.0,2116.8,30.0,464.0,1.0


In [269]:
X = df.iloc[:,1:-1]
y = df.iloc[:,-1]

In [270]:
print(X.shape,y.shape)

(14084, 10) (14084,)


In [271]:
y.value_counts()

11
0    11847
1     2237
Name: count, dtype: int64

In [272]:
# For classification we need to do balancing
ones_df = df[df.iloc[:,-1] == 1]
zeros_df = df[df.iloc[:,-1] == 0].iloc[:len(ones_df)]
df_balanced = pd.concat([ones_df,zeros_df]).sample(frac=1).reset_index(drop=True)

In [273]:
df_balanced.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,8903,1008.0,3024,8.0,24.0,0,8.91,0.0,0.0,0,0,1
1,16499,864.0,1728,10.75,21.49,0,8.91,0.0,0.0,0,142,1
2,9911,648.0,648,5.33,5.33,0,8.91,0.0,680.4,0,0,0
3,21439,1620.0,1620,5.33,5.33,0,8.91,0.15,567.0,0,11,0
4,8208,1620.0,1620,5.68,5.68,0,8.91,0.22,734.4,1,103,0


In [274]:
df_balanced.shape

(4474, 12)

In [275]:
X = df_balanced.iloc[:,1:-1]
y = df_balanced.iloc[:,-1]

In [276]:
print(X.shape,y.shape)

(4474, 10) (4474,)


In [277]:
X_train, X_temp, y_train, y_temp = train_test_split(X,y,test_size=0.2, stratify=y, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

In [278]:
print(f"Training data: {X_train.shape} and {y_train.shape}, no of 1's: {sum(y_train)}")
print(f"Validation data: {X_val.shape} and {y_val.shape}, no of 1's: {sum(y_val)}")
print(f"Testing data: {X_test.shape} and {y_test.shape}, no of 1's: {sum(y_test)}")

Training data: (3579, 10) and (3579,), no of 1's: 1790
Validation data: (447, 10) and (447,), no of 1's: 223
Testing data: (448, 10) and (448,), no of 1's: 224


In [279]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

In [280]:
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation = 'relu'),
    Dense(32, activation = 'relu'),
    Dense(1, activation = 'sigmoid')
])

In [281]:
model.compile(optimizer='adam', loss = 'binary_crossentropy', metrics=['accuracy'])

In [282]:
early_stop = EarlyStopping(patience=5, restore_best_weights=True)

In [283]:
model.fit(X_train_scaled, y_train,
          validation_data=(X_val_scaled,y_val),
          epochs = 50,
          batch_size= 32,
          callbacks=[early_stop])

Epoch 1/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.7476 - loss: 0.5537 - val_accuracy: 0.8993 - val_loss: 0.3101
Epoch 2/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8811 - loss: 0.3231 - val_accuracy: 0.9060 - val_loss: 0.2745
Epoch 3/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8827 - loss: 0.3046 - val_accuracy: 0.9105 - val_loss: 0.2600
Epoch 4/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9051 - loss: 0.2706 - val_accuracy: 0.9128 - val_loss: 0.2430
Epoch 5/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.9009 - loss: 0.2672 - val_accuracy: 0.9128 - val_loss: 0.2380
Epoch 6/50
[1m112/112[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step - accuracy: 0.8988 - loss: 0.2561 - val_accuracy: 0.9150 - val_loss: 0.2345
Epoch 7/50
[1m112/112[0m 

<keras.src.callbacks.history.History at 0x1b3e224c6d0>

In [284]:
loss, accuracy = model.evaluate(X_test_scaled,y_test)
print(f"Accuracy: {accuracy * 100:.2f}%")

[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.9066 - loss: 0.2610 
Accuracy: 91.29%
