# Dependencies

In [None]:
 import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from keras.regularizers import l2
import nltk

# Read File

In [None]:
# Load the dataset into a DataFrame; 'your file name' is a placeholder for the CSV path.
df = pd.read_csv(filepath_or_buffer='your file name')

# Model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D, BatchNormalization, GlobalMaxPooling1D, Conv1D, MaxPooling1D
from keras.optimizers import Adam

# Feature column (X) and multi-label targets (y).
# Both column names below are placeholders to be replaced with the real ones.
X = df[' Your X column values']
# Each target cell is expected to hold its labels separated by ', ';
# split every cell into a list of label strings.
y = [cell.split(', ') for cell in df['Your Y column values']]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the label binarizer on the training labels only, then encode both
# splits as binary indicator matrices with that same fitted label set.
mlb1 = MultiLabelBinarizer()
y_train_bin = mlb1.fit_transform(y_train)
y_test_bin = mlb1.transform(y_test)

# Vocabulary cap and fixed input length for the text model (both are tunable).
max_words = 10000
max_sequence_length = 200

# Build the word index from the training texts only.
tokenizer1 = Tokenizer(num_words=max_words)
tokenizer1.fit_on_texts(X_train)

# Turn each split into integer sequences using the fitted tokenizer.
X_train_seq, X_test_seq = (
    tokenizer1.texts_to_sequences(texts) for texts in (X_train, X_test)
)

# Pad/truncate every sequence to max_sequence_length so the model sees a fixed shape.
X_train_padded, X_test_padded = (
    pad_sequences(sequences, maxlen=max_sequence_length)
    for sequences in (X_train_seq, X_test_seq)
)

# Stacked Conv1D + bidirectional-LSTM text classifier ending in one sigmoid
# unit per label known to the fitted MultiLabelBinarizer.
model1 = Sequential([
    Embedding(max_words, 256, input_length=max_sequence_length),
    SpatialDropout1D(0.2),  # dropout applied to the embedding output
    Conv1D(128, 5, activation='relu'),
    MaxPooling1D(5),
    Bidirectional(LSTM(128, return_sequences=True)),
    SpatialDropout1D(0.2),
    Conv1D(64, 5, activation='relu'),
    MaxPooling1D(5),
    Bidirectional(LSTM(64, return_sequences=True)),
    GlobalMaxPooling1D(),  # reduce the remaining sequence to a single vector
    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.5),
    Dense(len(mlb1.classes_), activation='sigmoid'),
])

# Binary cross-entropy treats every label as an independent yes/no decision.
optimizer = Adam(learning_rate=0.005)
model1.compile(
    optimizer=optimizer,
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

# Fit on the padded training sequences against the binarized labels.
model1.fit(
    X_train_padded,
    y_train_bin,
    batch_size=64,
    epochs=300,
)

# Predict per-label scores for the held-out texts.
y_pred_bin = model1.predict(X_test_padded)

# A label counts as predicted only when its score exceeds the threshold.
threshold = 0.7
y_pred_bin = (y_pred_bin > threshold).astype(int)

# On multi-label indicator matrices accuracy_score is subset accuracy:
# a sample is correct only if every one of its labels matches.
accuracy = accuracy_score(y_test_bin, y_pred_bin)
print(f"Accuracy: {accuracy}")

# Precision, recall and F1, averaged over labels weighted by label support.
precision = precision_score(y_test_bin, y_pred_bin, average='weighted')
recall = recall_score(y_test_bin, y_pred_bin, average='weighted')
f1 = f1_score(y_test_bin, y_pred_bin, average='weighted')
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {f1}")


# Per-label report, computed once with the real label names as row headers.
classification_rep = classification_report(y_test_bin, y_pred_bin, target_names=mlb1.classes_)
print("Classification Report:")
print(classification_rep)


# Saving the model

In [None]:
# Imported here so this cell runs on its own: joblib is otherwise only
# imported by the later testing cell.
import pickle

import joblib

# Save the trained model (architecture + weights) to an HDF5 file.
model1.save("model5.h5")

# Save the fitted tokenizer so inference can reuse the same word index.
with open("tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(tokenizer1, tokenizer_file)


# Save MultiLabelBinarizer
joblib.dump(mlb1, "mlb5.pkl")

# Testing the model

In [None]:
import re  # regular expressions used by text_preprocessing
import string
import pickle  # needed below to restore the tokenizer
import nltk
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import model_from_json
import json
import joblib
from keras.models import load_model

# Load the saved model
loaded_model = load_model("model5.h5")

# Load the fitted tokenizer.
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer.pkl
# that was produced by a trusted run of the saving cell.
with open("tokenizer.pkl", "rb") as tokenizer_file:
    loaded_tokenizer = pickle.load(tokenizer_file)

# Load MultiLabelBinarizer (if needed)
loaded_mlb = joblib.load("mlb5.pkl")

def text_preprocessing(text):
    """Normalize raw text into a space-joined string of lowercase, non-stopword tokens.

    Steps, in order: delete typographic quotes, blank out any other non-ASCII
    character, delete uppercase Roman-numeral words, lowercase, turn newlines
    and hyphens into spaces, delete digits and ASCII punctuation, then
    tokenize and drop English stopwords.

    Args:
        text: Raw input string.

    Returns:
        The remaining tokens joined by single spaces.
    """
    # Delete curly quotes/apostrophes first. They are non-ASCII, so they have
    # to be handled before the ASCII filter below; otherwise they would become
    # spaces and split words, unlike the ASCII apostrophe which is simply
    # removed with the rest of the punctuation.
    for quote in ('“', '”', '’'):
        text = text.replace(quote, '')

    # Replace every remaining non-ASCII character with a space.
    text = ''.join(c if ord(c) < 128 else ' ' for c in text)

    # Remove uppercase Roman numerals (optionally followed by whitespace and a
    # lowercase run of the same letters). Must run before lowercasing.
    # NOTE(review): this matches any word spelled only with I, V, X, L, C, D, M,
    # so it also deletes words such as "I", "MIX" or "DID" - confirm intended.
    text = re.sub(r'\b[IVXLCDM]+(?:\s+[ivxlcdm]+)?\b', '', text)

    # Convert to lowercase
    text = text.lower()
    text = text.replace('\n', ' ').replace('-', ' ')
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)

    # Tokenize the text and remove stopwords. The stopword list is loaded once
    # into a set so each membership test is O(1) instead of re-reading the
    # list for every token.
    stop_words = set(stopwords.words('english'))
    words = [word for word in word_tokenize(text) if word not in stop_words]

    return ' '.join(words)

def predict_labels_for_text(user_text, threshold=0.7, max_sequence_length=200):
    """Predict the labels of a single piece of text with the loaded model.

    Args:
        user_text: Raw text to classify.
        threshold: A label is returned only when the model's score for it is
            strictly greater than this value.
        max_sequence_length: Length the token sequence is padded/truncated to.
            It has to match the length the model was trained with (200 in the
            training cell of this file).

    Returns:
        The result of ``MultiLabelBinarizer.inverse_transform`` for the one
        input row: a list containing a single tuple of predicted label names
        (an empty tuple when no score exceeds the threshold).
    """
    # NOTE(review): the training cell in this file tokenizes the raw column
    # without calling text_preprocessing - confirm the training data was
    # cleaned the same way, otherwise inference inputs differ from training.
    preprocessed_text = text_preprocessing(user_text)
    text_sequence = loaded_tokenizer.texts_to_sequences([preprocessed_text])
    padded_sequence = pad_sequences(text_sequence, maxlen=max_sequence_length)

    # Score every label, then binarize to a 0/1 indicator row (same
    # thresholding as the evaluation cell) before mapping back to label names.
    text_labels_bin = loaded_model.predict(padded_sequence)
    text_labels = loaded_mlb.inverse_transform((text_labels_bin > threshold).astype(int))

    return text_labels

# Ask the user for a text and run it through the prediction helper.
user_input_text = input("Enter text: \n")
predicted_labels = predict_labels_for_text(user_input_text)

# Each prediction is a tuple of label names: join every tuple into one
# comma-separated string, then print all of them on a single line.
predicted_label_strings = list(map(', '.join, predicted_labels))
print(f"Predicted Labels: {', '.join(predicted_label_strings)}")
