In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/rule-based-model2/__results__.html
/kaggle/input/rule-based-model2/email_labelled.csv
/kaggle/input/rule-based-model2/eamil_labelled.csv
/kaggle/input/rule-based-model2/custom.css
/kaggle/input/rule-based-model2/__notebook__.ipynb
/kaggle/input/rule-based-model2/__output__.json


In [2]:
df = pd.read_csv("/kaggle/input/rule-based-model2/eamil_labelled.csv")

In [3]:
!pip install texthero


Collecting texthero
  Downloading texthero-1.0.9-py3-none-any.whl (25 kB)
Collecting nltk>=3.3
  Downloading nltk-3.5.zip (1.4 MB)
[K     |████████████████████████████████| 1.4 MB 263 kB/s 
Building wheels for collected packages: nltk
  Building wheel for nltk (setup.py) ... [?25l- \ | / - \ | done
[?25h  Created wheel for nltk: filename=nltk-3.5-py3-none-any.whl size=1434672 sha256=5ec90d2d377820dab047bc47928a41e23ffc3ca41cf4a86df117ff1b508fa83b
  Stored in directory: /root/.cache/pip/wheels/45/6c/46/a1865e7ba706b3817f5d1b2ff7ce8996aabdd0d03d47ba0266
Successfully built nltk
Installing collected packages: nltk, texthero
  Attempting uninstall: nltk
    Found existing installation: nltk 3.2.4
    Uninstalling nltk-3.2.4:
      Successfully uninstalled nltk-3.2.4
[31mERROR: After October 2020 you may experience errors when installing or updating packages. This is because pip will change the way that it resolves dependency conflicts.

We recommend you us

In [4]:
import texthero as hero

In [5]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

def lemmatize(sentence):
    result = []
    for word in sentence.split():
        word = lemmatizer.lemmatize(word)
        result.append(word)
    return " ".join(result)


def preprocess(sentence):
    sentence = sentence.replace('"','')
    return sentence
    

In [6]:
df['text'] = df['text'].pipe(hero.clean) # remove digits, punctuation, diacritics,stopwords, whitespace
df['text'] = df['text'].map(lambda x:preprocess(x))
df['text'] = df['text'].map(lambda x:lemmatize(x))

# Train Test Split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X_train, X_test, y_train, y_test = train_test_split(df['text'],df['target'],stratify=df['target'],test_size=0.20, random_state=42)

# Preprocessing

In [9]:
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# Models 

In [10]:
# Naive Bayes
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()), ])
text_clf.fit(X_train,y_train)
print("Model : Naive Bayes")
print("Train : ", classification_report(y_train,text_clf.predict(X_train)))
print("Validation : ", classification_report(y_test,text_clf.predict(X_test)))
#joblib.dump(pipeline, 'model.joblib')


Model : Naive Bayes
Train :                precision    recall  f1-score   support

           0       0.93      0.98      0.95      2161
           1       0.95      0.84      0.89      1000

    accuracy                           0.94      3161
   macro avg       0.94      0.91      0.92      3161
weighted avg       0.94      0.94      0.93      3161

Validation :                precision    recall  f1-score   support

           0       0.84      0.94      0.89       541
           1       0.83      0.62      0.71       250

    accuracy                           0.84       791
   macro avg       0.84      0.78      0.80       791
weighted avg       0.84      0.84      0.83       791



In [11]:
from sklearn.linear_model import SGDClassifier
text_clf = Pipeline([ ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', SGDClassifier(loss='hinge', penalty='l2',
                                           alpha=1e-3, random_state=42,                           
                                           max_iter=5, tol=None)),])
text_clf.fit(X_train,y_train)
print("Model : SVM")
print("Train : ", classification_report(y_train,text_clf.predict(X_train)))
print("Validation : ", classification_report(y_test,text_clf.predict(X_test)))


Model : SVM
Train :                precision    recall  f1-score   support

           0       0.90      0.96      0.93      2161
           1       0.91      0.76      0.83      1000

    accuracy                           0.90      3161
   macro avg       0.90      0.86      0.88      3161
weighted avg       0.90      0.90      0.90      3161

Validation :                precision    recall  f1-score   support

           0       0.88      0.95      0.92       541
           1       0.87      0.73      0.79       250

    accuracy                           0.88       791
   macro avg       0.88      0.84      0.85       791
weighted avg       0.88      0.88      0.88       791



In [12]:
from sklearn.linear_model import LogisticRegression
text_clf = Pipeline([ ('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', LogisticRegression()),])
text_clf.fit(X_train,y_train)
print("Model : LogReg")
print("Train : ", classification_report(y_train,text_clf.predict(X_train)))
print("Validation : ", classification_report(y_test,text_clf.predict(X_test)))


Model : LogReg
Train :                precision    recall  f1-score   support

           0       0.89      0.97      0.93      2161
           1       0.93      0.75      0.83      1000

    accuracy                           0.90      3161
   macro avg       0.91      0.86      0.88      3161
weighted avg       0.90      0.90      0.90      3161

Validation :                precision    recall  f1-score   support

           0       0.88      0.96      0.92       541
           1       0.89      0.71      0.79       250

    accuracy                           0.88       791
   macro avg       0.88      0.83      0.85       791
weighted avg       0.88      0.88      0.87       791



# LSTM Model

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Flatten, LSTM, Dropout, Activation, Embedding, Bidirectional


In [14]:
vocab_size = 5000 
embedding_dim = 64
lstm_hidden_units = 128
max_length = 200
trunc_type = 'post'
padding_type = 'post'
oov_tok = '<OOV>' 
training_portion = .8

In [15]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(df['text'])
word_index = tokenizer.word_index

In [16]:
train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

valid_sequences = tokenizer.texts_to_sequences(X_test)
valid_padded = pad_sequences(valid_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [17]:
train_labels = tf.keras.utils.to_categorical(
    y_train, num_classes=2, dtype='float32')

valid_labels = tf.keras.utils.to_categorical(y_test,
                                            num_classes=2,
                                            dtype="float32")

In [18]:
model = Sequential()

model.add(Embedding(vocab_size, embedding_dim))
model.add(Dropout(0.3))
model.add(Bidirectional(LSTM(lstm_hidden_units)))
model.add(Dense(2, activation='sigmoid'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 64)          320000    
_________________________________________________________________
dropout (Dropout)            (None, None, 64)          0         
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               197632    
_________________________________________________________________
dense (Dense)                (None, 2)                 514       
Total params: 518,146
Trainable params: 518,146
Non-trainable params: 0
_________________________________________________________________


In [19]:
opt = tf.keras.optimizers.Adam(lr=0.001, decay=1e-6)
model.compile(
    loss='binary_crossentropy',
    optimizer=opt,
    metrics=['accuracy'],
)

In [20]:
num_epochs = 2
history = model.fit(train_padded, train_labels, 
                    epochs=num_epochs, 
                    validation_data=(valid_padded, valid_labels), 
                    verbose=2)

Epoch 1/2
99/99 - 24s - loss: 0.5036 - accuracy: 0.7574 - val_loss: 0.3024 - val_accuracy: 0.8824
Epoch 2/2
99/99 - 23s - loss: 0.2318 - accuracy: 0.9184 - val_loss: 0.2768 - val_accuracy: 0.8862


In [21]:
print(classification_report(y_train,model.predict_classes(train_padded)))

              precision    recall  f1-score   support

           0       0.96      0.97      0.96      2161
           1       0.93      0.92      0.92      1000

    accuracy                           0.95      3161
   macro avg       0.94      0.94      0.94      3161
weighted avg       0.95      0.95      0.95      3161



In [22]:
print(classification_report(y_test,model.predict_classes(valid_padded)))

              precision    recall  f1-score   support

           0       0.91      0.93      0.92       541
           1       0.83      0.80      0.82       250

    accuracy                           0.89       791
   macro avg       0.87      0.86      0.87       791
weighted avg       0.89      0.89      0.89       791

