# Install All Dependencies and Load the Dataset

In [3]:
# Import all necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from datetime import datetime, timedelta

# Load the dataset directly from the URL
url = "https://raw.githubusercontent.com/RiyaShirke/EmailPredictionSystem/refs/heads/main/Email_Engagement_Dataset_new.csv?token=GHSAT0AAAAAACYLB5ORQNPTDTG54YLWGUYUZX5CQRA"
email_response_df = pd.read_csv(url)

# Display the first few rows to verify the data
email_response_df.head()

Unnamed: 0,Email Subject,Email Body,Time Sent,Response Time,Level,Engagement score
0,Webinar Invitation,I am following up on our last conversation. Le...,2023-03-18 03:35:00,2023-03-18 08:35:00,Hot,80
1,Project Update,Attached is the monthly report. Please review ...,2023-12-06 09:21:00,2023-12-06 17:21:00,High,63
2,Contract Renewal,Join our upcoming webinar on industry trends. ...,2023-10-16 09:38:00,2023-10-16 16:38:00,High,100
3,Project Update,Here is an update on the project progress. Let...,2023-01-01 04:40:00,2023-01-01 16:40:00,High,83
4,Webinar Invitation,"Hello, we would like to schedule a meeting to ...",2023-11-17 19:28:00,2023-11-20 05:28:00,Cold,52


# Feature Engineering

In [4]:
# Add new feature: Email Length
email_response_df['Email Length'] = email_response_df['Email Body'].apply(len)

# Extract hour of the day from Time Sent
email_response_df['Time Sent Hour'] = pd.to_datetime(email_response_df['Time Sent']).dt.hour

# Calculate response delay (in hours)
email_response_df['Response Delay'] = (pd.to_datetime(email_response_df['Response Time']) - pd.to_datetime(email_response_df['Time Sent'])).dt.total_seconds() / 3600

# Display the first few rows after feature engineering
email_response_df.head()


Unnamed: 0,Email Subject,Email Body,Time Sent,Response Time,Level,Engagement score,Email Length,Time Sent Hour,Response Delay
0,Webinar Invitation,I am following up on our last conversation. Le...,2023-03-18 03:35:00,2023-03-18 08:35:00,Hot,80,77,3,5.0
1,Project Update,Attached is the monthly report. Please review ...,2023-12-06 09:21:00,2023-12-06 17:21:00,High,63,68,9,8.0
2,Contract Renewal,Join our upcoming webinar on industry trends. ...,2023-10-16 09:38:00,2023-10-16 16:38:00,High,100,59,9,7.0
3,Project Update,Here is an update on the project progress. Let...,2023-01-01 04:40:00,2023-01-01 16:40:00,High,83,69,4,12.0
4,Webinar Invitation,"Hello, we would like to schedule a meeting to ...",2023-11-17 19:28:00,2023-11-20 05:28:00,Cold,52,65,19,58.0


# Preparing Features and Labels

In [5]:
# Prepare features and labels
X = email_response_df[['Email Subject', 'Email Body', 'Time Sent Hour', 'Email Length', 'Response Delay']]
y = email_response_df['Level']

# Encode target labels (Cold, Hot, High) to numerical values
le = LabelEncoder()
y_encoded = le.fit_transform(y)


# Vectorizing Text Data

In [6]:
# Vectorize email subject and body
vectorizer_subject = TfidfVectorizer(max_features=500)
X_tfidf_subject = vectorizer_subject.fit_transform(X['Email Subject'])

vectorizer_body = TfidfVectorizer(max_features=500)
X_tfidf_body = vectorizer_body.fit_transform(X['Email Body'])

# Combine text vectors with other numerical features
X_combined = np.hstack([X_tfidf_subject.toarray(), X_tfidf_body.toarray(), X[['Time Sent Hour', 'Email Length', 'Response Delay']].values])


# Handling Class Imbalance (Optional)

In [7]:
# Apply SMOTE to balance the dataset
smote = SMOTE(random_state=42)
X_balanced, y_balanced = smote.fit_resample(X_combined, y_encoded)


# Train-Test Split

In [8]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)


# Train the Model

In [9]:
# Train a Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)


# Evaluate the Model

In [10]:
# Predict on the test set
y_pred = model.predict(X_test)

# Calculate the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')


Accuracy: 1.0


# Testing the Model with Sample Inputs

In [11]:
# Sample test emails for each class (Hot, High, Cold)
sample_emails = pd.DataFrame({
    'Email Subject': ["Urgent Meeting Request", "Special Offer - Limited Time!", "Follow-Up on Last Conversation"],
    'Email Body': [
        "We need to schedule an urgent meeting to discuss the project update.",
        "Get 20% off on your next purchase! Offer valid till end of the week.",
        "Just checking in to see if you had any questions about our last conversation."
    ],
    'Time Sent Hour': [10, 15, 9],  # Example hours
    'Email Length': [len("We need to schedule an urgent meeting to discuss the project update."),
                     len("Get 20% off on your next purchase! Offer valid till end of the week."),
                     len("Just checking in to see if you had any questions about our last conversation.")],
    'Response Delay': [3, 25, 48]  # Example response delays in hours (Hot, High, Cold)
})

# Vectorize the email subjects and bodies using the same TF-IDF vectorizer as before
X_sample_tfidf_subject = vectorizer_subject.transform(sample_emails['Email Subject'])
X_sample_tfidf_body = vectorizer_body.transform(sample_emails['Email Body'])

# Combine the text features with numerical features
X_sample_combined = np.hstack([X_sample_tfidf_subject.toarray(), X_sample_tfidf_body.toarray(), sample_emails[['Time Sent Hour', 'Email Length', 'Response Delay']].values])

# Predict the class using the trained model
y_sample_pred = model.predict(X_sample_combined)

# Decode the predicted labels back to their original form (Cold, Hot, High)
y_sample_pred_labels = le.inverse_transform(y_sample_pred)

# Print out the predictions
for i, label in enumerate(y_sample_pred_labels):
    print(f"Sample {i+1}: Predicted class is '{label}'")


Sample 1: Predicted class is 'Hot'
Sample 2: Predicted class is 'Cold'
Sample 3: Predicted class is 'Cold'
