In [9]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [10]:
data = pd.read_csv("./data.csv")

In [11]:
data

Unnamed: 0,email_type,subject,body,opened,meeting_link_clicked,responded,subject_length,body_length,mean_word_length_subject,mean_word_length_body,mean_sent_length_subject,mean_sent_length_body
0,example1,propel marketing roi advanced analytics,hey recipient name ready see marketing perform...,0,0,0,5,35,7.000000,6.114286,39.0,248.0
1,example1,data superpower unlock insights us,hi recipient name reaching believe last messag...,1,0,0,5,34,6.000000,6.617647,34.0,258.0
2,example1,turn marketing data decisions let explore,greetings recipient name connected yet convinc...,0,0,0,6,44,6.000000,6.477273,41.0,328.0
3,example1,marketing success click away let chat analytics,hello recipient name clear serious business su...,1,0,1,7,36,5.857143,6.055556,47.0,253.0
4,example1,boost brand visibility proven marketing analyt...,hi recipient name digital age data power espec...,0,0,0,7,41,7.714286,6.414634,60.0,303.0
...,...,...,...,...,...,...,...,...,...,...,...,...
149,HRConsultingSeries,streamline hr boost performance,hello recipient name efficiency king especiall...,1,0,1,4,26,7.000000,6.461538,31.0,193.0
150,example1,boost brand visibility,hello recipient name want skyrocket company ma...,1,0,0,3,27,6.666667,6.111111,22.0,191.0
151,example1,outpace competitors insightful analytics,hi recipient name noticed navigating vast univ...,1,0,0,4,25,9.250000,5.960000,40.0,173.0
152,example1,tailored analytics,hey recipient name brand deserves stand custom...,0,0,0,2,31,8.500000,6.161290,18.0,221.0


### Training data for predicting open rate

In [12]:
columns = ['email_type','subject','body','subject_length','body_length','mean_word_length_subject','mean_word_length_body','mean_sent_length_subject','mean_sent_length_body','opened']


In [13]:
train_data = data[columns]

In [14]:
train_data

Unnamed: 0,email_type,subject,body,subject_length,body_length,mean_word_length_subject,mean_word_length_body,mean_sent_length_subject,mean_sent_length_body,opened
0,example1,propel marketing roi advanced analytics,hey recipient name ready see marketing perform...,5,35,7.000000,6.114286,39.0,248.0,0
1,example1,data superpower unlock insights us,hi recipient name reaching believe last messag...,5,34,6.000000,6.617647,34.0,258.0,1
2,example1,turn marketing data decisions let explore,greetings recipient name connected yet convinc...,6,44,6.000000,6.477273,41.0,328.0,0
3,example1,marketing success click away let chat analytics,hello recipient name clear serious business su...,7,36,5.857143,6.055556,47.0,253.0,1
4,example1,boost brand visibility proven marketing analyt...,hi recipient name digital age data power espec...,7,41,7.714286,6.414634,60.0,303.0,0
...,...,...,...,...,...,...,...,...,...,...
149,HRConsultingSeries,streamline hr boost performance,hello recipient name efficiency king especiall...,4,26,7.000000,6.461538,31.0,193.0,1
150,example1,boost brand visibility,hello recipient name want skyrocket company ma...,3,27,6.666667,6.111111,22.0,191.0,1
151,example1,outpace competitors insightful analytics,hi recipient name noticed navigating vast univ...,4,25,9.250000,5.960000,40.0,173.0,1
152,example1,tailored analytics,hey recipient name brand deserves stand custom...,2,31,8.500000,6.161290,18.0,221.0,0


In [97]:
train_data.isna().sum()

email_type                  0
subject                     0
body                        0
subject_length              0
body_length                 0
mean_word_length_subject    0
mean_word_length_body       0
mean_sent_length_subject    0
mean_sent_length_body       0
opened                      0
dtype: int64

### Split into Train Test and Cross validation

In [152]:
from sklearn.model_selection import train_test_split

In [153]:
X = train_data.iloc[:,:-1]

In [154]:
X.head(1)

Unnamed: 0,email_type,subject,body,subject_length,body_length,mean_word_length_subject,mean_word_length_body,mean_sent_length_subject,mean_sent_length_body
0,example1,propel marketing roi advanced analytics,hey recipient name ready see marketing perform...,5,35,7.0,6.114286,39.0,248.0


In [155]:
y = train_data.iloc[:,-1]

In [156]:
y.head(5)

0    0
1    1
2    0
3    1
4    0
Name: opened, dtype: int64

In [157]:
# split the full data 80:20 into training:validation sets
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, shuffle = True, random_state=101)

# split training data 87.5:12.5 into training:testing sets
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, train_size=0.875,shuffle = True,random_state=101)

print("len(X): {} len(y): {} \nlen(X_train): {}, len(X_valid): {}, len(X_test): \
{} \nlen(y_train): {}, len(y_valid): {}, len(y_test): {}".format(len(X), len(y),\
len(X_train), len(X_valid), len(X_test), len(y_train), len(y_valid), \
len(y_test))) 

len(X): 154 len(y): 154 
len(X_train): 107, len(X_valid): 31, len(X_test): 16 
len(y_train): 107, len(y_valid): 31, len(y_test): 16


In [158]:
y_train.sum()

81

### Here We need to convert text to vectors in order to train our model so we'll use following Techniques
1. TF - IDF (Term Frequency and Inverse Document Frequency)
2. BERT/Transformers with fine tuning (Future Improvement)

#### TF-IDF

In [159]:
#import required libraries
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords

In [160]:
#setup for english language
stemmer  = SnowballStemmer(language='english')

In [161]:
#function to create tokenizer
def tokenize(text):
    return [stemmer.stem(token) for token in word_tokenize(text)]

In [162]:
# test tokenize function
tokenize("Hey Good morning")

['hey', 'good', 'morn']

In [163]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [164]:
stop = set(stopwords.words('english'))

In [165]:
#creating vectorizer
vectorizer = TfidfVectorizer(tokenizer=tokenize,
                             stop_words = stop,
                             ngram_range=(1,2),
                             max_features=2000)

In [166]:
vectorizer.fit(pd.concat([X_train.body,X_train.subject],axis = 0))



TfidfVectorizer(max_features=2000, ngram_range=(1, 2),
                stop_words={'a', 'about', 'above', 'after', 'again', 'against',
                            'ain', 'all', 'am', 'an', 'and', 'any', 'are',
                            'aren', "aren't", 'as', 'at', 'be', 'because',
                            'been', 'before', 'being', 'below', 'between',
                            'both', 'but', 'by', 'can', 'couldn', "couldn't", ...},
                tokenizer=<function tokenize at 0x7fe389135e50>)

In [167]:
len(vectorizer.vocabulary_)

2000

In [168]:
vectorizer.get_feature_names_out()[:20]

array(['access', 'achiev', 'acquisit', 'act', 'action', 'action insight',
       'action let', 'action market', 'action strategi', 'advanc',
       'advanc analyt', 'advanc market', 'advantag', 'advisor',
       'advisori', 'ahead', 'alloc', 'alloc market', 'analysi', 'analyt'],
      dtype=object)

#### Transform Training & validation Data
- Transform phrases from training set
- Transform phrases from Validation set
- Look at some example values

In [169]:
%%time
body_inputs = vectorizer.transform(X_train.body)

CPU times: user 78.4 ms, sys: 2.9 ms, total: 81.3 ms
Wall time: 82 ms


In [170]:
body_inputs.shape

(107, 2000)

In [171]:
%%time
subject_inputs = vectorizer.transform(X_train.subject)

CPU times: user 23.4 ms, sys: 2.32 ms, total: 25.8 ms
Wall time: 57.8 ms


In [172]:
subject_inputs.shape

(107, 2000)

In [173]:
%%time
validation_inputs_body = vectorizer.transform(X_valid.body)

CPU times: user 26.5 ms, sys: 1.38 ms, total: 27.8 ms
Wall time: 28 ms


In [174]:
validation_inputs_body.shape

(31, 2000)

In [175]:
%%time
validation_inputs_subject = vectorizer.transform(X_valid.subject)

CPU times: user 8.46 ms, sys: 639 µs, total: 9.1 ms
Wall time: 10.5 ms


In [176]:
body_feature_train = pd.DataFrame(body_inputs.toarray(), columns=vectorizer.get_feature_names()).reset_index(drop=True)



In [177]:
subject_feature_train = pd.DataFrame(subject_inputs.toarray(), columns=vectorizer.get_feature_names()).reset_index(drop=True)



In [178]:
body_feature_validation = pd.DataFrame(validation_inputs_body.toarray(), columns=vectorizer.get_feature_names()).reset_index(drop=True)



In [179]:
subject_feature_validation = pd.DataFrame(validation_inputs_subject.toarray(), columns=vectorizer.get_feature_names()).reset_index(drop=True)




In [180]:
body_feature_train

Unnamed: 0,access,achiev,acquisit,act,action,action insight,action let,action market,action strategi,advanc,...,would avail,would good,would great,would like,would love,would week,yes,yes meet,yet,yet convinc
0,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.124105,0.0,0.0,0.0,0.000000,0.0
1,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
2,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
3,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
4,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
102,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.161829,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.142072,0.0
103,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
104,0.0,0.0,0.127387,0.0,0.085007,0.099309,0.0,0.0,0.0,0.000000,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0
105,0.0,0.0,0.000000,0.0,0.000000,0.000000,0.0,0.0,0.0,0.125306,...,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0


In [181]:
X_train = pd.concat([X_train.iloc[:,3:].reset_index(drop =True),body_feature_train,subject_feature_train],axis = 1)

In [182]:
X_valid = pd.concat([X_valid.iloc[:,3:].reset_index(drop =True),body_feature_validation,subject_feature_validation],axis = 1)

### Modelling Techniques:
Since this is a classification model we'll perform following techniques
1. Logistics regression
2. Random Forest
3. GBDT
4. Naive Baiyes
5. LSTM

### Model metrics to look
1. precision
2. recall
3. F1-Score

#### Model-1 Logistic Regression Model

In [183]:
from sklearn.linear_model import LogisticRegression

In [184]:
model = LogisticRegression()

In [185]:
%%time
model.fit(X_train,y_train)

CPU times: user 177 ms, sys: 4.15 ms, total: 181 ms
Wall time: 109 ms


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [186]:
train_preds = model.predict(X_train)

In [194]:
from sklearn.metrics import accuracy_score,f1_score,recall_score

In [188]:
#accuracy of our prdictions
print("Accuracy Score of prediction on training data -->",accuracy_score(train_preds,y_train))

Accuracy Score of prediction on training data --> 0.8785046728971962


In [189]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

In [190]:
#try with test(validation) data
val_pred_1 = model.predict(X_valid)

In [191]:
print("Accuracy Score of prediction on validation data-->",accuracy_score(y_valid,val_pred_1))

Accuracy Score of prediction on validation data--> 0.7419354838709677


In [192]:
print("f1 score of prediction on validation data -->",f1_score(y_valid,val_pred_1))

f1 score of prediction on validation data --> 0.8518518518518519


In [193]:
#classificatiopn report for model predictions
from sklearn.metrics import classification_report
report = classification_report(y_valid,val_pred_1)
print(report)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00         8
           1       0.74      1.00      0.85        23

    accuracy                           0.74        31
   macro avg       0.37      0.50      0.43        31
weighted avg       0.55      0.74      0.63        31



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Here we are getting 85% f1 score on validation data set that is good to start with

#### lets check the accuracy on test set

In [197]:
#transform small df into vectors
test_inputs_body = vectorizer.transform(X_test.body)

In [198]:
#transform small df into vectors
test_inputs_subject = vectorizer.transform(X_test.subject)

In [199]:
body_feature_test = pd.DataFrame(test_inputs_body.toarray(), columns=vectorizer.get_feature_names()).reset_index(drop=True)



In [201]:
subject_feature_test = pd.DataFrame(test_inputs_subject.toarray(), columns=vectorizer.get_feature_names()).reset_index(drop=True)



In [202]:
X_test = pd.concat([X_test.iloc[:,3:].reset_index(drop =True),body_feature_test,subject_feature_test],axis = 1)

In [203]:
test_preds = model.predict(X_test)

In [204]:
#accuracy of our prdictions
print("Accuracy Score of prediction on test data -->",accuracy_score(test_preds,y_test))

Accuracy Score of prediction on test data --> 0.5625


In [208]:
print("Recall Score of prediction on test data-->",recall_score(test_preds,y_test))

Recall Score of prediction on test data--> 0.6153846153846154


In [209]:
print("f1 score of prediction on test data -->",f1_score(test_preds,y_test))

f1 score of prediction on test data --> 0.6956521739130435


#### We are getting a recall of 61% On newer test data which is not bad to start with we can improve further using LSTM or BERT

#### Model - 2 Decision Trees

In [211]:
from sklearn import tree

In [228]:
model_2 = tree.DecisionTreeClassifier()

In [229]:
model_2.fit(X_train, y_train)

DecisionTreeClassifier()

In [230]:
train_pred_2 = model_2.predict(X_valid)

In [231]:
#accuracy of our prdictions
print("Accuracy Score of prediction on validation data -->",accuracy_score(train_pred_2,y_valid))

Accuracy Score of prediction on validation data --> 0.5483870967741935


In [232]:
print("Recall Score of prediction on validation data-->",recall_score(train_pred_2,y_valid))

Recall Score of prediction on validation data--> 0.7368421052631579


In [233]:
print("f1 score of prediction on validation data -->",f1_score(train_pred_2,y_valid))

f1 score of prediction on validation data --> 0.6666666666666666


In [234]:
test_pred_2 = model_2.predict(X_test)

In [235]:
#accuracy of our prdictions
print("Accuracy Score of prediction on test data -->",accuracy_score(test_pred_2,y_test))

Accuracy Score of prediction on test data --> 0.625


In [236]:
print("Recall Score of prediction on test data-->",recall_score(test_pred_2,y_test))

Recall Score of prediction on test data--> 0.7


In [237]:
print("f1 score of prediction on test data -->",f1_score(test_pred_2,y_test))

f1 score of prediction on test data --> 0.7


#### Model 3 random Forest Classifier

In [220]:
from sklearn.ensemble import RandomForestClassifier

In [222]:
model_3 = RandomForestClassifier()

In [223]:
model_3.fit(X_train, y_train)

RandomForestClassifier()

In [224]:
train_pred_3 = model_3.predict(X_valid)

In [225]:
#accuracy of our prdictions
print("Accuracy Score of prediction on validation data -->",accuracy_score(train_pred_3,y_valid))

Accuracy Score of prediction on validation data --> 0.7419354838709677


In [226]:
print("Recall Score of prediction on validation data-->",recall_score(train_pred_3,y_valid))

Recall Score of prediction on validation data--> 0.7419354838709677


In [227]:
print("f1 score of prediction on validation data -->",f1_score(train_pred_3,y_valid))

f1 score of prediction on validation data --> 0.8518518518518519


In [238]:
test_pred_3 = model_3.predict(X_test)

In [239]:
#accuracy of our prdictions
print("Accuracy Score of prediction on test data -->",accuracy_score(test_pred_3,y_test))

Accuracy Score of prediction on test data --> 0.625


In [240]:
print("Recall Score of prediction on test data-->",recall_score(test_pred_3,y_test))

Recall Score of prediction on test data--> 0.625


In [241]:
print("f1 score of prediction on test data -->",f1_score(test_pred_3,y_test))

f1 score of prediction on test data --> 0.7692307692307693


## Summary
- Using Logistic Regression and TF - IDF we are getting f1 score of 69% on test data
- Using Decision Trees and TF - IDF we are getting f1 score of 70% on test data
- Using Random Forest and TF - IDF we are getting f1 score of **76%** on test data

## Imporovements
- We can try Fastai with Transformers (BERT, RoBERTa, XLNet, XLM, DistilBERT) to feature encode.
- We can do Hyperparameter tuning using Optuna which uses bayesian optimization to tune parameters.
- Also we can try GBDT and LSTM for better predictions.