In [5]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rule-based-model2/email_labelled.csv
/kaggle/input/rule-based-model2/eamil_labelled.csv
/kaggle/input/rule-based-model2/custom.css
/kaggle/input/rule-based-model2/__notebook__.ipynb
/kaggle/input/rule-based-model2/__results__.html
/kaggle/input/rule-based-model2/__output__.json


In [7]:
df = pd.read_csv("/kaggle/input/rule-based-model2/eamil_labelled.csv")

In [42]:
df['text']=df["text"].map(lambda x:x.replace('"',''))

# Train Test Split

In [89]:
from sklearn.model_selection import train_test_split

In [94]:
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['target'],stratify=df['target'],test_size=0.20, random_state=42)

# Preprocessing

In [103]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Models 

In [101]:
# Naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()), ])
text_clf.fit(X_train,y_train)
print("Model : Naive Bayes")
print("Train : ", classification_report(y_train,text_clf.predict(X_train)))
print("Validation : ", classification_report(y_test,text_clf.predict(X_test)))
#joblib.dump(pipeline, 'model.joblib')


Model : Naive Bayes
Train :                precision    recall  f1-score   support

           0       0.93      0.98      0.96      2161
           1       0.96      0.83      0.89      1000

    accuracy                           0.94      3161
   macro avg       0.94      0.91      0.92      3161
weighted avg       0.94      0.94      0.94      3161

Validation :                precision    recall  f1-score   support

           0       0.84      0.97      0.90       541
           1       0.91      0.60      0.73       250

    accuracy                           0.86       791
   macro avg       0.88      0.79      0.81       791
weighted avg       0.86      0.86      0.85       791



In [102]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([ ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,                           
                                           max_iter=5, tol=None)),])
text_clf.fit(X_train,y_train)
print("Model : SVM")
print("Train : ", classification_report(y_train,text_clf.predict(X_train)))
print("Validation : ", classification_report(y_test,text_clf.predict(X_test)))


Model : SVM
Train :                precision    recall  f1-score   support

           0       0.92      0.97      0.94      2161
           1       0.92      0.81      0.86      1000

    accuracy                           0.92      3161
   macro avg       0.92      0.89      0.90      3161
weighted avg       0.92      0.92      0.92      3161

Validation :                precision    recall  f1-score   support

           0       0.90      0.95      0.93       541
           1       0.88      0.77      0.82       250

    accuracy                           0.90       791
   macro avg       0.89      0.86      0.87       791
weighted avg       0.89      0.90      0.89       791



In [106]:
from sklearn.linear_model import LogisticRegression
text_clf = Pipeline([ ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression()),])
text_clf.fit(X_train,y_train)
print("Model : SVM")
print("Train : ", classification_report(y_train,text_clf.predict(X_train)))
print("Validation : ", classification_report(y_test,text_clf.predict(X_test)))


Model : SVM
Train :                precision    recall  f1-score   support

           0       0.92      0.98      0.94      2161
           1       0.94      0.81      0.87      1000

    accuracy                           0.92      3161
   macro avg       0.93      0.89      0.91      3161
weighted avg       0.92      0.92      0.92      3161

Validation :                precision    recall  f1-score   support

           0       0.90      0.96      0.93       541
           1       0.89      0.76      0.82       250

    accuracy                           0.90       791
   macro avg       0.90      0.86      0.87       791
weighted avg       0.90      0.90      0.89       791



# LSTM Model

In [1]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional


In [8]:
vocab_size = 5000 
embedding_dim = 64
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' 
training_portion = .8

In [107]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index

In [109]:
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

valid_sequences = tokenizer.texts_to_sequences(X_test)
valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [110]:
train_labels = tf.keras.utils.to_categorical(
    y_train, num_classes=2, dtype='float32')

valid_labels = tf.keras.utils.to_categorical(y_test,
                                            num_classes=2,
                                            dtype="float32")

In [111]:
model = Sequential()

model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.5))
model.add(Bidirectional(LSTM(embedding_dim)))
model.add(Dense(2, activation='sigmoid'))

model.summary()

Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 64)          320000    
_________________________________________________________________
dropout_4 (Dropout)          (None, None, 64)          0         
_________________________________________________________________
bidirectional_4 (Bidirection (None, 128)               66048     
_________________________________________________________________
dense_4 (Dense)              (None, 2)                 258       
Total params: 386,306
Trainable params: 386,306
Non-trainable params: 0
_________________________________________________________________


In [112]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [114]:
num_epochs = 10
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, 
                    validation_data=(valid_padded, valid_labels), 
                    verbose=2)

Epoch 1/10
99/99 - 14s - loss: 0.5101 - accuracy: 0.7612 - val_loss: 0.3372 - val_accuracy: 0.8799
Epoch 2/10
99/99 - 13s - loss: 0.2334 - accuracy: 0.9117 - val_loss: 0.2208 - val_accuracy: 0.9178
Epoch 3/10
99/99 - 14s - loss: 0.1460 - accuracy: 0.9478 - val_loss: 0.2219 - val_accuracy: 0.9102
Epoch 4/10
99/99 - 14s - loss: 0.0962 - accuracy: 0.9665 - val_loss: 0.2144 - val_accuracy: 0.9166
Epoch 5/10
99/99 - 14s - loss: 0.0629 - accuracy: 0.9801 - val_loss: 0.2511 - val_accuracy: 0.9128
Epoch 6/10
99/99 - 13s - loss: 0.0513 - accuracy: 0.9842 - val_loss: 0.2443 - val_accuracy: 0.9166
Epoch 7/10
99/99 - 14s - loss: 0.0347 - accuracy: 0.9896 - val_loss: 0.2909 - val_accuracy: 0.9027
Epoch 8/10
99/99 - 14s - loss: 0.0283 - accuracy: 0.9918 - val_loss: 0.2975 - val_accuracy: 0.9064
Epoch 9/10
99/99 - 14s - loss: 0.0684 - accuracy: 0.9750 - val_loss: 0.2829 - val_accuracy: 0.8938
Epoch 10/10
99/99 - 14s - loss: 0.0871 - accuracy: 0.9737 - val_loss: 0.2798 - val_accuracy: 0.8951


In [117]:
print(classification_report(y_train,model.predict_classes(train_padded)))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99      2161
           1       0.97      0.99      0.98      1000

    accuracy                           0.99      3161
   macro avg       0.98      0.99      0.99      3161
weighted avg       0.99      0.99      0.99      3161



In [118]:
print(classification_report(y_test,model.predict_classes(valid_padded)))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92       541
           1       0.82      0.86      0.84       250

    accuracy                           0.90       791
   macro avg       0.88      0.88      0.88       791
weighted avg       0.90      0.90      0.90       791

