In [6]:
import tensorflow as tf
from tensorflow.keras import layers,Sequential

In [None]:
model = tf.keras.Sequential(
    layers.Embedding(input_dim=1000,output_dim=64),
    layers.LSTM(128),
    layers.Dense(2,activation='softmax')
)

In [7]:
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import conll2000

# Download NLTK resources (if not already downloaded)
import nltk
nltk.download('punkt')
nltk.download('conll2000')

# Load CoNLL-2012 Shared Task dataset (this is just an example, use your dataset)
def load_conll_data():
    # Load the CoNLL-2012 Shared Task dataset
    dataset = conll2000.chunked_sents()

    # Preprocess the data (convert to features and labels)
    sentences = []
    labels = []
    for sentence in dataset:
        words, tags = zip(*[(word, tag) for word, tag in tree2conlltags(sentence)])
        sentences.append(" ".join(words))
        labels.append(list(tags))

    return sentences, labels

# Tokenize and preprocess text data
def preprocess_data(sentences, labels):
    tokenized_sentences = [word_tokenize(sentence) for sentence in sentences]
    pos_tags = [pos_tag(sentence) for sentence in tokenized_sentences]
    
    # Encode POS tags using LabelEncoder
    pos_encoder = LabelEncoder()
    encoded_pos_tags = [pos_encoder.fit_transform([tag for _, tag in pos_sentence]) for pos_sentence in pos_tags]
    
    # Flatten the labels (assuming you have a binary classification task)
    flat_labels = [label for sublist in labels for label in sublist]
    
    # Encode labels using LabelEncoder
    label_encoder = LabelEncoder()
    encoded_labels = label_encoder.fit_transform(flat_labels)
    one_hot_labels = to_categorical(encoded_labels)

    return tokenized_sentences, encoded_pos_tags, one_hot_labels

# Split the data into training and testing sets
def split_data(sentences, pos_tags, labels):
    X_train, X_test, pos_train, pos_test, y_train, y_test = train_test_split(
        sentences, pos_tags, labels, test_size=0.2, random_state=42
    )
    return X_train, X_test, pos_train, pos_test, y_train, y_test

# Build the model
def build_model(input_dim, pos_dim, output_dim):
    model = tf.keras.Sequential([
        layers.Embedding(input_dim=input_dim, output_dim=64),
        layers.LSTM(128, return_sequences=True),
        layers.Dropout(0.5),
        layers.TimeDistributed(layers.Dense(64, activation='relu')),
        layers.Flatten(),
        layers.Dense(32, activation='relu'),
        layers.Dense(output_dim, activation='softmax')
    ])

    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    return model

# Load and preprocess data
sentences, labels = load_conll_data()
tokenized_sentences, encoded_pos_tags, one_hot_labels = preprocess_data(sentences, labels)

# Split the data
X_train, X_test, pos_train, pos_test, y_train, y_test = split_data(
    tokenized_sentences, encoded_pos_tags, one_hot_labels
)

# Build the model
input_dim = 1000  # Choose an appropriate vocabulary size based on your dataset
pos_dim = len(set([pos for sublist in encoded_pos_tags for pos in sublist]))
output_dim = len(set([label for sublist in labels for label in sublist]))

model = build_model(input_dim, pos_dim, output_dim)

# Train the model
model.fit([X_train, pos_train], y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
evaluation = model.evaluate([X_test, pos_test], y_test)
print("Test Accuracy:", evaluation[1])


[nltk_data] Downloading package punkt to C:\Users\Srikrishna U
[nltk_data]     N\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package conll2000 to C:\Users\Srikrishna U
[nltk_data]     N\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\conll2000.zip.


NameError: name 'tree2conlltags' is not defined