<a href="https://colab.research.google.com/github/Tejassorte/Python-Projects/blob/master/PodcastPrediction_ANN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

# Custom RMSE metric
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_pred` and `y_true`.

    The squared differences are averaged with `K.mean` (no axis given) and the
    square root of that mean is returned.

    NOTE(review): passed to `model.compile(metrics=[...])`, this is presumably
    evaluated per batch and then averaged by Keras, so the logged value need
    not equal the square root of the logged MSE loss -- confirm.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

# Load data
df = pd.read_csv("train.csv")  # Replace with your actual file path

# Check for NaNs in dataset
print("NaNs before handling:")
print(df.isnull().sum())

# Fill missing values in 'Episode_Length_minutes' and 'Guest_Popularity_percentage'
# with the column mean. The fill is done on the DataFrame and assigned back:
# `df[col].fillna(..., inplace=True)` acts on an intermediate object, raises the
# pandas chained-assignment warning, and no longer updates `df` in pandas 3.0.
# NOTE: the means are computed over the full dataset, i.e. before the
# train/test split further down.
df = df.fillna({
    'Episode_Length_minutes': df['Episode_Length_minutes'].mean(),
    'Guest_Popularity_percentage': df['Guest_Popularity_percentage'].mean(),
})

# Define features and target
X = df.drop(['Listening_Time_minutes', 'Episode_Title', 'id', 'Podcast_Name'], axis=1)  # Dropping unnecessary columns
y = df['Listening_Time_minutes']

# Convert numerical columns to float32
for col in ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']:
    X[col] = X[col].astype('float32')
y = y.astype('float32')

# Check for NaNs after handling
print("\nNaNs after handling:")
print(X.isnull().sum())
print(np.any(X.isnull()))

# Define categorical and numerical columns
categorical_cols = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
numerical_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']

# Preprocessing pipeline: standardize numeric columns, one-hot encode categorical ones
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform features: fit the preprocessor on the training split only, then
# apply the fitted transform to the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Build model: three hidden Dense layers with dropout after the first two,
# and a single linear output unit
model = Sequential([
    Dense(128, input_dim=X_train_processed.shape[1], activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile model with MSE loss, Adam at a learning rate of 1e-4, and the custom RMSE metric
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='mean_squared_error',
              metrics=[rmse])

# Early stopping on validation loss
# NOTE(review): patience equals the epoch budget used below, so this callback
# presumably can never end training early -- confirm this is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model with RMSE metric, holding out 20% of the training data for validation
history = model.fit(
    X_train_processed, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=8,
    callbacks=[early_stopping],
    verbose=1
)

NaNs before handling:
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       0
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

NaNs after handling:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].mean(), inplace=True)


Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64
False


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 3ms/step - loss: 317.7998 - rmse: 15.4598 - val_loss: 176.7461 - val_rmse: 12.5647
Epoch 2/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 3ms/step - loss: 197.1338 - rmse: 13.2668 - val_loss: 175.9568 - val_rmse: 12.5325
Epoch 3/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 3ms/step - loss: 193.3681 - rmse: 13.1425 - val_loss: 178.2298 - val_rmse: 12.6146
Epoch 4/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 3ms/step - loss: 191.1761 - rmse: 13.0628 - val_loss: 176.6759 - val_rmse: 12.5581
Epoch 5/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 3ms/step - loss: 188.5502 - rmse: 12.9748 - val_loss: 175.6422 - val_rmse: 12.5203
Epoch 6/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m188s[0m 3ms/step - loss: 185.8984 - rmse: 12.8804 - val_loss: 182.3841 - val_rmse: 12.7633
Epoc

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

# Custom RMSE metric
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_pred` and `y_true`.

    The squared differences are averaged with `K.mean` (no axis given) and the
    square root of that mean is returned.

    NOTE(review): passed to `model.compile(metrics=[...])`, this is presumably
    evaluated per batch and then averaged by Keras, so the logged value need
    not equal the square root of the logged MSE loss -- confirm.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

# Assemble the network layer by layer: two Dense -> BatchNormalization -> Dropout
# stages, one more hidden Dense layer, and a single linear output unit for regression.
model = Sequential()
model.add(Dense(128, input_dim=X_train_processed.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# MSE is the training loss; the custom rmse function is reported as a metric.
optimizer = Adam(learning_rate=0.0001)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[rmse])

# Early stopping watches the validation loss.
# NOTE(review): patience equals the epoch budget used below, so this callback
# presumably can never end training early -- confirm this is intended.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True,
)

# Fit on the preprocessed training data, with 20% of it held out for validation.
history = model.fit(
    x=X_train_processed,
    y=y_train,
    batch_size=8,
    epochs=10,
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1,
)

Epoch 1/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m221s[0m 4ms/step - loss: 572.3119 - rmse: 21.1071 - val_loss: 182.2647 - val_rmse: 12.7892
Epoch 2/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m242s[0m 3ms/step - loss: 236.7978 - rmse: 14.6627 - val_loss: 177.8394 - val_rmse: 12.6083
Epoch 3/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m196s[0m 3ms/step - loss: 215.9993 - rmse: 13.9756 - val_loss: 179.1746 - val_rmse: 12.6633
Epoch 4/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m213s[0m 4ms/step - loss: 207.4147 - rmse: 13.6873 - val_loss: 177.4847 - val_rmse: 12.5927
Epoch 5/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m194s[0m 3ms/step - loss: 202.1378 - rmse: 13.5102 - val_loss: 177.5293 - val_rmse: 12.5837
Epoch 6/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 3ms/step - loss: 198.3818 - rmse: 13.3629 - val_loss: 176.3110 - val_rmse: 12.5418
Epoc

In [None]:
import tensorflow.keras.backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Custom RMSE metric
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_pred` and `y_true`.

    The squared differences are averaged with `K.mean` (no axis given) and the
    square root of that mean is returned.

    NOTE(review): passed to `model.compile(metrics=[...])`, this is presumably
    evaluated per batch and then averaged by Keras, so the logged value need
    not equal the square root of the logged MSE loss -- confirm.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

# Assemble a deeper network: three Dense -> BatchNormalization -> Dropout stages
# (128, 128 and 64 units), a 32-unit hidden layer, and a single linear output unit.
model = Sequential()
model.add(Dense(128, input_dim=X_train_processed.shape[1], activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(32, activation='relu'))
model.add(Dense(1))

# Adam starts at a learning rate of 5e-4; MSE is the loss and rmse the reported metric.
optimizer = Adam(learning_rate=0.0005)
model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=[rmse])

# Both callbacks monitor the validation loss: one stops training (patience 15),
# the other multiplies the learning rate by 0.2 (patience 7) down to a floor of 1e-6.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True,
)
reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.2,
    patience=7,
    min_lr=1e-6,
)

# Fit for up to 20 epochs with 20% of the training data held out for validation.
history = model.fit(
    x=X_train_processed,
    y=y_train,
    batch_size=8,
    epochs=20,
    validation_split=0.2,
    callbacks=[early_stopping, reduce_lr],
    verbose=1,
)

Epoch 1/20
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 4ms/step - loss: 308.7704 - rmse: 16.0134 - val_loss: 198.3238 - val_rmse: 13.3898 - learning_rate: 5.0000e-04
Epoch 2/20
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m218s[0m 4ms/step - loss: 203.0337 - rmse: 13.5292 - val_loss: 190.3497 - val_rmse: 13.0235 - learning_rate: 5.0000e-04
Epoch 3/20
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m233s[0m 4ms/step - loss: 196.3224 - rmse: 13.2832 - val_loss: 184.2876 - val_rmse: 12.7995 - learning_rate: 5.0000e-04
Epoch 4/20
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 4ms/step - loss: 193.5843 - rmse: 13.1928 - val_loss: 181.5809 - val_rmse: 12.7090 - learning_rate: 5.0000e-04
Epoch 5/20
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m232s[0m 4ms/step - loss: 191.7590 - rmse: 13.1159 - val_loss: 176.7747 - val_rmse: 12.5606 - learning_rate: 5.0000e-04
Epoch 6/20
[1m60000/60000[0m

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam
import tensorflow.keras.backend as K

# Custom RMSE metric
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between `y_pred` and `y_true`.

    The squared differences are averaged with `K.mean` (no axis given) and the
    square root of that mean is returned.

    NOTE(review): passed to `model.compile(metrics=[...])`, this is presumably
    evaluated per batch and then averaged by Keras, so the logged value need
    not equal the square root of the logged MSE loss -- confirm.
    """
    squared_error = K.square(y_pred - y_true)
    mean_squared_error = K.mean(squared_error)
    return K.sqrt(mean_squared_error)

# Load data
df = pd.read_csv("train.csv")  # Replace with your actual file path

# Check for NaNs in dataset
print("NaNs before handling:")
print(df.isnull().sum())

# Fill missing values in 'Episode_Length_minutes' and 'Guest_Popularity_percentage'
# with the column mean. The fill is done on the DataFrame and assigned back:
# `df[col].fillna(..., inplace=True)` acts on an intermediate object, raises the
# pandas chained-assignment warning, and no longer updates `df` in pandas 3.0.
# NOTE: the means are computed over the full dataset, i.e. before the
# train/test split further down.
df = df.fillna({
    'Episode_Length_minutes': df['Episode_Length_minutes'].mean(),
    'Guest_Popularity_percentage': df['Guest_Popularity_percentage'].mean(),
})

# Drop rows with a missing target (if any); assigned back rather than mutated in place
df = df.dropna(subset=['Listening_Time_minutes'])

# Define features and target
X = df.drop(['Listening_Time_minutes', 'Episode_Title', 'id', 'Podcast_Name'], axis=1)  # Dropping unnecessary columns
y = df['Listening_Time_minutes']

# Convert numerical columns to float32
for col in ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']:
    X[col] = X[col].astype('float32')
y = y.astype('float32')

# Check for NaNs after handling
print("\nNaNs after handling:")
print(X.isnull().sum())
print(np.any(X.isnull()))

# Define categorical and numerical columns
categorical_cols = ['Genre', 'Publication_Day', 'Publication_Time', 'Episode_Sentiment']
numerical_cols = ['Episode_Length_minutes', 'Host_Popularity_percentage', 'Guest_Popularity_percentage', 'Number_of_Ads']

# Preprocessing pipeline: standardize numeric columns, one-hot encode categorical ones
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), numerical_cols),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Transform features: fit the preprocessor on the training split only, then
# apply the fitted transform to the test split
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# Build model: three hidden Dense layers with dropout after the first two,
# and a single linear output unit
model = Sequential([
    Dense(128, input_dim=X_train_processed.shape[1], activation='relu'),
    Dropout(0.2),
    Dense(64, activation='relu'),
    Dropout(0.2),
    Dense(32, activation='relu'),
    Dense(1)
])

# Compile model with MSE loss, Adam at a learning rate of 1e-4, and the custom RMSE metric
model.compile(optimizer=Adam(learning_rate=0.0001),
              loss='mean_squared_error',
              metrics=[rmse])

# Early stopping on validation loss
# NOTE(review): patience equals the epoch budget used below, so this callback
# presumably can never end training early -- confirm this is intended.
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train model with RMSE metric, holding out 20% of the training data for validation
history = model.fit(
    X_train_processed, y_train,
    validation_split=0.2,
    epochs=10,
    batch_size=8,
    callbacks=[early_stopping],
    verbose=1
)

NaNs before handling:
id                                  0
Podcast_Name                        0
Episode_Title                       0
Episode_Length_minutes          87093
Genre                               0
Host_Popularity_percentage          0
Publication_Day                     0
Publication_Time                    0
Guest_Popularity_percentage    146030
Number_of_Ads                       0
Episode_Sentiment                   0
Listening_Time_minutes              0
dtype: int64

NaNs after handling:


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Episode_Length_minutes'].fillna(df['Episode_Length_minutes'].mean(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Guest_Popularity_percentage'].fillna(df['Guest_Popularity_percentage'].mean(), inplace=True)


Episode_Length_minutes         0
Genre                          0
Host_Popularity_percentage     0
Publication_Day                0
Publication_Time               0
Guest_Popularity_percentage    0
Number_of_Ads                  0
Episode_Sentiment              0
dtype: int64
False


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 3ms/step - loss: 318.2057 - rmse: 15.4962 - val_loss: 177.4747 - val_rmse: 12.5924
Epoch 2/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 3ms/step - loss: 194.9425 - rmse: 13.1970 - val_loss: 182.2597 - val_rmse: 12.7622
Epoch 3/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 3ms/step - loss: 191.8946 - rmse: 13.0989 - val_loss: 179.3392 - val_rmse: 12.6556
Epoch 4/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m198s[0m 3ms/step - loss: 189.8860 - rmse: 13.0121 - val_loss: 179.1548 - val_rmse: 12.6468
Epoch 5/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m202s[0m 3ms/step - loss: 186.3464 - rmse: 12.9037 - val_loss: 189.2628 - val_rmse: 13.0025
Epoch 6/10
[1m60000/60000[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m185s[0m 3ms/step - loss: 184.3038 - rmse: 12.8220 - val_loss: 196.6060 - val_rmse: 13.2564
Epoc