**2nd code**

In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Flatten, Concatenate, Dense, Dropout
from sklearn.model_selection import train_test_split

# Load and preprocess the dataset
def load_data(MMM):
    """Read the dataset CSV and return its text and label columns.

    Args:
        MMM: Path of the CSV file to read (anything ``pandas.read_csv``
            accepts). The file must contain a ``Text`` column and a
            ``Catchphrases`` column.

    Returns:
        A ``(texts, labels)`` tuple of two lists: the values of the ``Text``
        column and the values of the ``Catchphrases`` column, in row order.

    Raises:
        KeyError: If either expected column is missing from the file.
    """
    # Read from the path supplied by the caller instead of a hard-coded
    # file name, so the argument is actually honoured.
    data = pd.read_csv(MMM)
    texts = data['Text'].tolist()
    labels = data['Catchphrases'].tolist()
    return texts, labels

def preprocess_data(texts, labels, max_words, max_sequence_length, test_size=0.2):
    """Tokenize and pad the texts, then split them into train and test sets.

    Args:
        texts: Raw text samples to fit the tokenizer on and to encode.
        labels: One label per text; converted to a NumPy array.
        max_words: Passed to ``Tokenizer`` as ``num_words``.
        max_sequence_length: Passed to ``pad_sequences`` as ``maxlen``.
        test_size: Passed to ``train_test_split`` as ``test_size``.

    Returns:
        ``(x_train, x_test, y_train, y_test, word_index)`` where
        ``word_index`` is the fitted tokenizer's word-to-id mapping.
    """
    # Fit the vocabulary on the full corpus.
    text_tokenizer = Tokenizer(num_words=max_words)
    text_tokenizer.fit_on_texts(texts)
    vocabulary = text_tokenizer.word_index
    print('Found %s unique tokens.' % len(vocabulary))

    # Encode every text as a fixed-length sequence of word ids.
    padded = pad_sequences(
        text_tokenizer.texts_to_sequences(texts),
        maxlen=max_sequence_length,
    )
    label_array = np.asarray(labels)
    print('Shape of data tensor:', padded.shape)
    print('Shape of label tensor:', label_array.shape)

    # Fixed random_state keeps the split reproducible between runs.
    split = train_test_split(
        padded,
        label_array,
        test_size=test_size,
        random_state=42,
    )
    x_train, x_test, y_train, y_test = split
    print('Number of training samples:', len(x_train))
    print('Number of test samples:', len(x_test))

    return x_train, x_test, y_train, y_test, vocabulary

# Load and preprocess data
# Read the raw texts and their catchphrase labels from the dataset CSV.
texts, labels = load_data('MMM.csv')

# Use label encoding to convert text labels to numerical format
# LabelEncoder assigns one integer id per distinct label value; the fitted
# encoder is kept so ids can be mapped back to the original strings later.
# NOTE(review): the encoded labels are integer class ids, so the model's output
# layer and loss need to support len(label_encoder.classes_) classes - confirm.
label_encoder = LabelEncoder()
labels = label_encoder.fit_transform(labels)

# max_words is forwarded to Tokenizer(num_words=...) and max_sequence_length
# to pad_sequences(maxlen=...) inside preprocess_data.
max_words = 10000
max_sequence_length = 1000
x_train, x_test, y_train, y_test, word_index = preprocess_data(texts, labels, max_words, max_sequence_length)


# Define model parameters
# These values are passed to build_model: embedding_dim is the Embedding output
# size, num_filters the number of Conv1D filters per branch, filter_sizes the
# Conv1D kernel sizes (one convolution branch each) and dropout_rate the
# Dropout rate applied before the output layer.
embedding_dim = 100
num_filters = 128
filter_sizes = [3, 4, 5]
dropout_rate = 0.5
def build_model(word_index, max_sequence_length, embedding_dim, num_filters, filter_sizes, dropout_rate,
                num_classes=1):
    """Build and compile a multi-branch 1D-CNN text classifier.

    The input sequence is embedded, passed through one Conv1D + MaxPooling1D
    branch per entry of ``filter_sizes``, and the branch outputs are
    concatenated, flattened, regularised with dropout and fed to the output
    layer.

    Args:
        word_index: Word-to-id mapping; the embedding table has
            ``len(word_index) + 1`` rows.
        max_sequence_length: Length of every (padded) input sequence.
        embedding_dim: Size of each embedding vector.
        num_filters: Number of filters in every convolution branch.
        filter_sizes: Iterable of kernel sizes, one convolution branch each.
        dropout_rate: Dropout rate applied before the output layer.
        num_classes: Number of target classes. With the default (or any value
            up to 2) the model ends in a single sigmoid unit trained with
            binary cross-entropy, which expects 0/1 labels. For more than two
            classes it ends in a ``num_classes``-way softmax trained with
            sparse categorical cross-entropy, which expects integer class ids.

    Returns:
        The compiled Keras ``Model``.
    """
    input_layer = Input(shape=(max_sequence_length,))
    embedding_layer = Embedding(len(word_index) + 1, embedding_dim, input_length=max_sequence_length)(input_layer)

    conv_blocks = []
    for filter_size in filter_sizes:
        # 'same' padding keeps every branch at the same sequence length so the
        # branches can be concatenated along the channel axis below.
        conv = Conv1D(filters=num_filters,
                      kernel_size=filter_size,
                      padding='same',
                      activation='relu',
                      strides=1)(embedding_layer)
        conv = MaxPooling1D(pool_size=2)(conv)
        conv_blocks.append(conv)

    concat = Concatenate(axis=-1)(conv_blocks)  # Concatenate along the last dimension
    flatten = Flatten()(concat)
    dropout = Dropout(dropout_rate)(flatten)

    # Pick an output head and loss that match the label encoding: integer
    # class ids from a label encoder cannot be learned by one sigmoid unit
    # once there are more than two classes.
    if num_classes > 2:
        output_layer = Dense(num_classes, activation='softmax')(dropout)
        loss = 'sparse_categorical_crossentropy'
    else:
        output_layer = Dense(1, activation='sigmoid')(dropout)
        loss = 'binary_crossentropy'

    model = Model(inputs=input_layer, outputs=output_layer)
    model.compile(loss=loss, optimizer='adam', metrics=['accuracy'])
    return model

# Build and compile the model
# The label encoder was fitted on every label, so its class count covers both
# the training and the test split.
num_classes = len(label_encoder.classes_)
model = build_model(word_index, max_sequence_length, embedding_dim, num_filters, filter_sizes, dropout_rate,
                    num_classes=num_classes)

# Train the model
epochs = 10
batch_size = 64
model.fit(x_train, y_train, epochs=epochs, batch_size=batch_size, validation_data=(x_test, y_test))

# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test)
print("Test Accuracy:", accuracy)


Found 14442 unique tokens.
Shape of data tensor: (100, 1000)
Shape of label tensor: (100,)
Number of training samples: 80
Number of test samples: 20
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Accuracy: 0.05000000074505806


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
import pickle
import pandas as pd

# Load your dataset
data = pd.read_csv('MMM.csv')

# Extract text from "Text" column
texts = data['Text'].tolist()

# Create tokenizer
# Use the same vocabulary cap (max_words) as the tokenizer the training data
# was encoded with; without it, texts_to_sequences would emit ids for rarer
# words that were dropped from the training sequences.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)

# Save tokenizer to file
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [None]:
# Make predictions on the input text file
input_file = 'Document_4[1].txt'
with open(input_file, 'r') as f:
    # Every line of the file is one sample; strip surrounding whitespace.
    input_texts = [line.strip() for line in f]

# Encode the lines the same way the training texts were encoded.
input_sequences = tokenizer.texts_to_sequences(input_texts)
input_data = pad_sequences(input_sequences, maxlen=max_sequence_length)

# Make predictions
predictions = model.predict(input_data)

# Convert the raw model outputs to class decisions
if predictions.shape[-1] == 1:
    # Single-probability output: round to the nearest of 0. and 1.
    predictions = np.round(predictions)
else:
    # One probability per class: take the most likely class id and map it
    # back to the original label string through the fitted label encoder.
    predictions = label_encoder.inverse_transform(np.argmax(predictions, axis=-1))

print('Predictions:', predictions)

Predictions: [[1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1.]
 [1