In [33]:
import nltk
import pandas as pd
import numpy as np
import re
from bs4 import BeautifulSoup
import unicodedata
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from sklearn.model_selection import train_test_split
from keras.layers import LSTM, Dropout, Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import RMSprop
from keras.losses import mean_squared_error
from keras_tuner.tuners import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
import tensorflow as tf
from sklearn.metrics import confusion_matrix, classification_report
import os

In [34]:
def datapipeline(pathfile):
    df=pd.read_csv(pathfile)
    df.drop_duplicates(subset="headline",
                     keep='last', inplace=True)
    print(df.head())
    return df


In [35]:
filepath='/Users/rianrachmanto/pypro/project/sarcastic_detection/data/Train_Data.csv'
df=datapipeline(filepath)

                                             headline  is_sarcastic
8   report: there still time to convert to christi...             1
9                       education reform and evidence             0
15                         the new new net neutrality             0
27  confused zoo officials awkwardly celebrate aft...             1
28  lauren graham just dropped a clue about those ...             0


In [36]:
class TextPreprocessor:
    def __init__(self):
        self.stop = set(stopwords.words('english'))
        punctuation = list(string.punctuation)
        self.stop.update(punctuation)

    def strip_html(self, text):
        soup = BeautifulSoup(text, "html.parser")
        return soup.get_text()

    def remove_between_square_brackets(self, text):
        return re.sub(r'\[[^]]*\]', '', text)

    def remove_urls(self, text):
        return re.sub(r'http\S+', '', text)

    def remove_stopwords(self, text):
        final_text = []
        for word in text.split():
            if word.strip().lower() not in self.stop:
                final_text.append(word.strip())
        return " ".join(final_text)

    def remove_accented_chars(self, text):
        return unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('utf-8', 'ignore')

    def remove_punctuation(self, text):
        return re.sub(r'[^a-zA-Z0-9]', ' ', text)

    def remove_irrelevant_chars(self, text):
        return re.sub(r'[^a-zA-Z]', ' ', text)

    def remove_extra_whitespaces(self, text):
        return re.sub(r'^\s*|\s\s*', ' ', text).strip()
    
    def lemmatize_words(self,text):
        lemmatizer = WordNetLemmatizer()
        words = text.split()
        words = [lemmatizer.lemmatize(word,pos='v') for word in words]
        return ' '.join(words)

    def preprocess_text(self, text):
        text = self.strip_html(text)
        text = self.remove_between_square_brackets(text)
        text = self.remove_urls(text)
        text = self.remove_stopwords(text)
        text = self.remove_accented_chars(text)
        text = self.remove_punctuation(text)
        text = self.remove_irrelevant_chars(text)
        text = self.remove_extra_whitespaces(text)
        return text

In [37]:
preprocessor=TextPreprocessor()
df['headline']=df['headline'].apply(lambda x:preprocessor.preprocess_text(x))
print(df.head())

  soup = BeautifulSoup(text, "html.parser")


                                             headline  is_sarcastic
8   report still time convert christianity christm...             1
9                           education reform evidence             0
15                             new new net neutrality             0
27  confused zoo officials awkwardly celebrate end...             1
28  lauren graham dropped clue final gilmore girls...             0


In [38]:
#create function to tokenize, pad, split data to X_train, X_test, y_train, y_test
def tokenize_pad_split(df):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['headline'])
    X = tokenizer.texts_to_sequences(df['headline'])
    X = pad_sequences(X, maxlen=100)
    y = df['is_sarcastic']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    return X_train, X_test, y_train, y_test

In [39]:
X_train, X_test, y_train, y_test=tokenize_pad_split(df)

In [44]:
from keras.layers import LSTM, Dropout, Dense
from keras.models import Sequential
from keras.callbacks import EarlyStopping
from keras.optimizers import RMSprop
from keras.losses import mean_squared_error
from keras_tuner.tuners import RandomSearch
from keras_tuner.engine.hyperparameters import HyperParameters
from tensorflow import keras

In [48]:


class CustomModelTrainer:
    def __init__(self):
        pass

    @staticmethod
    def build_model(hp):
        with tf.device('/cpu:0'):
            model = Sequential()
            model.add(LSTM(hp.Int('units',min_value=32,max_value=512,step=32), input_shape=((X_train.shape[1], 1))))
            model.add(Dense(1))
            model.compile(loss='mse', optimizer='adam',metrics = [tf.keras.metrics.MeanSquaredError()])
            return model

    def tune_hyperparameters(self, X_train, y_train):
        tuner = RandomSearch(
            self.build_model,
            objective='mean_squared_error',
            max_trials=10,  # Adjust the number of trials as needed
            directory='keras_tuner',  # Directory to store logs and results
            project_name='custom_model'
        )

        # Define a callback to stop training early if necessary
        stop_early = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

        tuner.search(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2, callbacks=[stop_early])

        # Get the best hyperparameters
        best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]

        return best_hps

    def train_best_model(self, X_train, y_train, X_test, y_test):
        best_hps = self.tune_hyperparameters(X_train, y_train)

        # Build the best model with the tuned hyperparameters
        best_model = self.build_model(best_hps)

        # Train the best model
        best_model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

        # Evaluate the best model
        self.evaluate_model(best_model, X_test, y_test, save_path="/Users/rianrachmanto/pypro/project/sarcastic_detection/model")
    
    @staticmethod
    def evaluate_model(model, X_test, y_test, save_path=None):
        y_pred = model.predict(X_test)
        y_pred = (y_pred > 0.5)  # Adjust the threshold as needed
        print(confusion_matrix(y_test, y_pred))
        print(classification_report(y_test, y_pred))

        if save_path:
            os.makedirs(save_path, exist_ok=True)
            model.save(os.path.join(save_path, 'trained_model.h5'))


In [49]:
trainer = CustomModelTrainer()


In [50]:
best_hps = trainer.tune_hyperparameters(X_train, y_train)


Trial 10 Complete [00h 02m 14s]
mean_squared_error: 0.22470971941947937

Best mean_squared_error So Far: 0.22470971941947937
Total elapsed time: 00h 34m 59s
INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


In [51]:
trainer.train_best_model(X_train, y_train, X_test, y_test)


INFO:tensorflow:Reloading Tuner from keras_tuner/custom_model/tuner0.json


INFO:tensorflow:Reloading Tuner from keras_tuner/custom_model/tuner0.json


INFO:tensorflow:Oracle triggered exit


INFO:tensorflow:Oracle triggered exit


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
[[2449  426]
 [1741  796]]
              precision    recall  f1-score   support

           0       0.58      0.85      0.69      2875
           1       0.65      0.31      0.42      2537

    accuracy                           0.60      5412
   macro avg       0.62      0.58      0.56      5412
weighted avg       0.62      0.60      0.57      5412

