In [11]:
import json 
def fix_invalid_json(input_file, output_file):
    with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
        # Start the JSON array
        outfile.write('[')
        
        first_object = True
        for line in infile:
            line = line.strip()
            if not line:  # Skip empty lines
                continue

            if first_object:
                first_object = False
            else:
                outfile.write(',')
            
            # Ensure each line is a valid JSON object and write it
            outfile.write(line)
        
        # End the JSON array
        outfile.write(']')

    # Validate the newly created JSON file
    try:
        with open(output_file, 'r') as f:
            datastore = json.load(f)
        print("JSON file has been corrected and loaded successfully.")
        return datastore
    except json.JSONDecodeError as e:
        raise ValueError(f"Error decoding JSON: {e}")

# Usage
input_file = 'sarcasm.json'
output_file = 'sarcasm_corrected.json'
datastore = fix_invalid_json(input_file, output_file)

# Example processing of the data if it loaded successfully
if datastore:
    sentences = [item.get('headline') for item in datastore]
    labels = [item.get('is_sarcastic') for item in datastore]
    urls = [item.get('article_link') for item in datastore]

    print("Sentences:", sentences)
    print("Labels:", labels)
    print("URLs:", urls)


JSON file has been corrected and loaded successfully.
Labels: [0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0

In [38]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np

vocab_size = 50000  # Example value, adjust based on your needs
embedding_dim = 16  # Example value, adjust based on your needs
max_length = 100  # Example value, adjust based on your needs
padding_type = 'post'
trunc_type = 'post'

training_sentences, testing_sentences, training_labels, testing_labels = train_test_split(
        sentences, labels, test_size=0.2, random_state=42
    )


tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)
word_index = tokenizer.word_index

sequences = tokenizer.texts_to_sequences(sentences)
padded = pad_sequences(sequences,padding ='post')
print(padded[0])
print(padded.shape)

[  308 15115   679  3337  2298    48   382  2576 15116     6  2577  8434
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0]
(26709, 40)


In [34]:
training_labels = np.array(training_labels)
testing_labels = np.array(testing_labels)

training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding= padding_type, truncating=trunc_type)


testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding= padding_type, truncating=trunc_type)



In [35]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')

])

model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics= ['accuracy'])



In [36]:
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(training_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(testing_sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

In [42]:
num_epochs = 30

history = model.fit(
        training_padded, training_labels,
        epochs=num_epochs,
        validation_data=(testing_padded, testing_labels),
        verbose=2
    )

ValueError: Unrecognized data type: x=[[2618  393  761 ...    0    0    0]
 [ 668 3876    2 ...    0    0    0]
 [  43 9562  173 ...    0    0    0]
 ...
 [1094  331   28 ...    0    0    0]
 [1384  164 6812 ...    0    0    0]
 [1163 1801   20 ...    0    0    0]] (of type <class 'numpy.ndarray'>)

In [30]:
sentence = [ "i found that very super funny"]

sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding = padding_type, truncating=trunc_type)

print(model.predict(padded))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 541ms/step
[[0.49561256]]
