In [8]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score
import numpy as np

df = pd.read_excel("C://Users/HP/Downloads/onesignal.xlsx") 
threshold = len(df) * 0.5
cols_to_drop = [col for col in df.columns if df[col].count() < threshold and col != "email_reply_to_address"]
df = df.drop(columns=cols_to_drop)
df = df.drop(columns=["email_click_tracking_disabled"])

df['include_unsubscribed'] = df['include_unsubscribed'].fillna(0)
df = df.drop(columns=["include_unsubscribed"]) 

df['completed_date'] = df['completed_date'].ffill()
df['completed_time'] = df['completed_time'].ffill()

df['completed_date_id'] = df['completed_date_id'].ffill()
df['included_segments'] = df['included_segments'].ffill()

df = df.drop(columns=["app_id","isEmail","frequency_capped", "email_subject"])

# Combine date and time columns into datetime
df['queued_datetime'] = pd.to_datetime(df['queued_date'].astype(str) + ' ' + df['queued_time'].astype(str))
df['send_after_datetime'] = pd.to_datetime(df['send_after_date'].astype(str)+ ' ' + df['send_after_time'].astype(str), errors='coerce')
df['completed_datetime'] = pd.to_datetime(df['completed_date'].astype(str) + ' ' + df['completed_time'].astype(str), errors='coerce')

# Create delay and duration columns
df['delay_minutes'] = (df['send_after_datetime'] - df['queued_datetime']).dt.total_seconds() / 60
df['send_duration_minutes'] = (df['completed_datetime'] - df['send_after_datetime']).dt.total_seconds() / 60
df['total_pipeline_minutes'] = (df['completed_datetime'] - df['queued_datetime']).dt.total_seconds() / 60 

df['send_hour'] = df['send_after_datetime'].dt.hour
df['send_dayofweek'] = df['send_after_datetime'].dt.dayofweek
df['is_weekend'] = df['send_dayofweek'] >= 5

# Drop useless columns
df = df.drop(columns=['queued_date', 'queued_time', 'send_after_date', 'send_after_time',
                      'completed_date', 'completed_time',
                      'queued_date_id', 'send_after_date_id', 'completed_date_id'])

for col in df.columns:
    if df[col].dtype == 'object' or isinstance(df[col].dtype, pd.CategoricalDtype):
        df[col], _ = pd.factorize(df[col])

X = df.drop(columns=["converted"])
Y = df["converted"]

datetime_cols = X.select_dtypes(include=['datetime64[ns]', 'datetime64']).columns
for col in datetime_cols:
    X[col] = X[col].astype(np.int64) 

X = X.fillna(-1)


model = make_pipeline(
    StandardScaler(),
    SGDRegressor(
        loss='squared_error',
        penalty='l2',
        alpha=1.1358052035250219e-06,
        learning_rate='constant',
        eta0=0.027932934457572396,
        fit_intercept=True,
        tol=1.34433762714094e-05,
        average=False,
        random_state=1
    )
)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)
model.fit(X_train, Y_train)
y_pred = model.predict(X_test)

mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

print(f"MSE: {mse:.4f}")
print(f"R²: {r2:.4f}")

"""
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

X_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=2)

rf =SGDRegressor(
        loss='squared_error',
        penalty='l2',
        alpha=1.1358052035250219e-06,
        learning_rate='constant',
        eta0=0.027932934457572396,
        fit_intercept=True,
        tol=1.34433762714094e-05,
        average=False,
        random_state=1
    )

rf.fit(X_train, Y_train)
y_pred = rf.predict(X_test)

#accuracy = accuracy_score(Y_test, y_pred)
mse = mean_squared_error(Y_test, y_pred)
r2 = r2_score(Y_test, y_pred)

#print(f"Accuracy: {accuracy:.4f}")
print(f"MSE:      {mse:.4f}")
print(f"R²:       {r2:.4f}")
"""

MSE: 11796859645605907005440.0000
R²: -26698108352445008.0000


'\nscaler = StandardScaler()\nX_scaled = scaler.fit_transform(X)\n\nX_train, X_test, Y_train, Y_test = train_test_split(X_scaled, Y, test_size=0.2, random_state=2)\n\nrf =SGDRegressor(\n        loss=\'squared_error\',\n        penalty=\'l2\',\n        alpha=1.1358052035250219e-06,\n        learning_rate=\'constant\',\n        eta0=0.027932934457572396,\n        fit_intercept=True,\n        tol=1.34433762714094e-05,\n        average=False,\n        random_state=1\n    )\n\nrf.fit(X_train, Y_train)\ny_pred = rf.predict(X_test)\n\n#accuracy = accuracy_score(Y_test, y_pred)\nmse = mean_squared_error(Y_test, y_pred)\nr2 = r2_score(Y_test, y_pred)\n\n#print(f"Accuracy: {accuracy:.4f}")\nprint(f"MSE:      {mse:.4f}")\nprint(f"R²:       {r2:.4f}")\n'