<a href="https://colab.research.google.com/github/Sanjeda039/Biodata_Laravel/blob/main/svm-lstm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Embedding, Bidirectional, LSTM, Dense, Input
from keras.callbacks import ModelCheckpoint, EarlyStopping
from keras.utils import to_categorical
from imblearn.over_sampling import RandomOverSampler
from sklearn.preprocessing import LabelEncoder  # Import the LabelEncoder

def clean_str(string):
    """Normalise a piece of raw text for tokenisation.

    Every backslash, single quote and double quote is removed, surrounding
    whitespace is trimmed, and the result is lower-cased.

    Args:
        string: The text to clean.

    Returns:
        The cleaned, lower-cased text.
    """
    cleaned = string
    # Apply the removals one pattern at a time: backslash, then ', then ".
    for pattern in (r"\\", r"\'", r"\""):
        cleaned = re.sub(pattern, "", cleaned)
    cleaned = cleaned.strip()
    return cleaned.lower()

# Configuration constants.
MAX_SEQUENCE_LENGTH = 1000  # length every token sequence is padded/truncated to
MAX_NB_WORDS = 20000        # vocabulary cap passed to the Tokenizer
EMBEDDING_DIM = 100         # width of one embedding vector (glove.6B.100d)
VALIDATION_SPLIT = 0.2

# Load the preprocessed dataset
df = pd.read_csv('normalized_dataset.csv')

# Drop incomplete rows BEFORE oversampling. Dropping afterwards (as was done
# previously) lets the sampler duplicate rows with a missing text and then
# removes them again, which leaves the classes unbalanced. Only the two
# columns that are actually used are checked; `df` itself is left untouched.
df_clean = (
    df.dropna(subset=['Type', 'normalized_requirements'])
      .reset_index(drop=True)
)

# Oversample the minority classes
# NOTE(review): resampling happens before any train/test split, so copies of
# the same original row can land on both sides of a later split and inflate
# the reported scores. Consider oversampling only the training portion.
oversampler = RandomOverSampler(sampling_strategy='auto', random_state=42)
X_resampled, y_resampled = oversampler.fit_resample(
    df_clean['normalized_requirements'].values.reshape(-1, 1),
    df_clean['Type'],
)

# Create a new DataFrame with balanced data
df_balanced = pd.DataFrame({'Type': y_resampled, 'normalized_requirements': X_resampled.flatten()})
df_balanced = df_balanced.reset_index(drop=True)

print('Shape of dataset ', df_balanced.shape)
print(df_balanced.columns)
print('No. of unique classes', len(set(df_balanced['Type'])))

# Sorted class names and the class-name -> integer-id lookup built from them.
macronum = sorted(set(df_balanced['Type']))
macro_to_id = {note: number for number, note in enumerate(macronum)}

def fun(i):
    """Look up the integer id of class label ``i`` in the module-level ``macro_to_id`` table.

    Raises:
        KeyError: If ``i`` is not a key of ``macro_to_id``.
    """
    class_id = macro_to_id[i]
    return class_id

# Replace the class names in 'Type' with their integer ids.
df_balanced['Type'] = df_balanced['Type'].apply(fun)

# Strip markup from every requirement and normalise the text.
# The decoded text is passed straight to clean_str: wrapping it in
# str(<bytes>) would yield the repr "b'...'", which (once the quotes are
# removed) glues a stray "b" onto the first word of every document and turns
# non-ASCII characters into "xNN" debris.
# The parser is named explicitly so the result does not depend on which
# optional parsers happen to be installed.
texts = [
    clean_str(BeautifulSoup(raw_text, 'html.parser').get_text())
    for raw_text in df_balanced['normalized_requirements']
]
labels = df_balanced['Type'].tolist()

# Build the vocabulary and turn each text into a sequence of word indices.
tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index

print('Number of Unique Tokens', len(word_index))
data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# One-hot encode the integer class ids.
labels = to_categorical(np.asarray(labels))
print('Shape of Data Tensor:', data.shape)
print('Shape of Label Tensor:', labels.shape)

# Initialize LabelEncoder
label_encoder = LabelEncoder()

# Fit the encoder on the same sorted class list that defines the integer ids
# (macronum / macro_to_id), so inverse_transform() is guaranteed to map an id
# back to the class it was derived from even if the raw frame contains
# values that never reached the balanced data.
original_labels = df['Type']
label_encoder.fit(macronum)

skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# The pre-trained vectors are the same for every fold, so read the file once
# instead of once per fold.
embeddings_index = {}
with open('glove.6B.100d.txt', encoding='utf8') as f:
    for line in f:
        # NOTE(review): this also discards any vector whose token itself
        # starts with '#' or '%'.
        if line.startswith('#') or line.startswith('%'):
            continue  # Skip header lines
        if not line.strip():
            continue  # Skip empty lines
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Total %s word vectors in Glove 6B 100d.' % len(embeddings_index))

# Integer class id per sample, used for stratification.
class_ids = np.argmax(labels, axis=1)

# Test accuracy of every fold, summarised after the loop.
fold_test_accuracies = []

fold_count = 0
for fold_count, (train_index, test_index) in enumerate(skf.split(data, class_ids), start=1):
    print(f"\nFold {fold_count}")

    # Use the indices produced by StratifiedKFold. Calling train_test_split
    # on the full data with a fixed seed here would give every fold exactly
    # the same partition, so the ten "folds" would all be the same experiment.
    x_train_val, y_train_val = data[train_index], labels[train_index]
    x_test, y_test = data[test_index], labels[test_index]

    # Carve a validation set (25% of the fold's training portion) out for
    # checkpointing and early stopping.
    x_train, x_val, y_train, y_val = train_test_split(
        x_train_val, y_train_val,
        test_size=0.25,
        stratify=class_ids[train_index],
        random_state=42,
    )

    # Start from uniform random values in [0, 1) and overwrite the rows of
    # words that have a pre-trained vector.
    embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in the embedding index keep their random
            # initialisation (they are NOT zeroed).
            embedding_matrix[i] = embedding_vector

    embedding_layer = Embedding(len(word_index) + 1,
                                EMBEDDING_DIM,
                                weights=[embedding_matrix],
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=True)

    sequence_input = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
    embedded_sequences = embedding_layer(sequence_input)
    l_lstm = Bidirectional(LSTM(100))(embedded_sequences)
    dense_layer = Dense(50, activation='relu')(l_lstm)  # LSTM output connected to a dense layer
    svm_output = Dense(len(macronum), activation='linear')(dense_layer)  # SVM output
    model = Model(sequence_input, svm_output)
    model.compile(loss='hinge',  # Hinge loss for SVM
                  optimizer='rmsprop',
                  metrics=['accuracy'])

    print("Hybrid LSTM-SVM Model")
    model.summary()

    # Add EarlyStopping callback
    early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, restore_best_weights=True)

    cp = ModelCheckpoint(f'model_rnn_fold_{fold_count}.hdf5', monitor='val_accuracy', verbose=1, save_best_only=True)
    history = model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=15, batch_size=2, callbacks=[cp, early_stopping])

    # Use the trained model to make predictions on the test set
    y_pred = model.predict(x_test)
    y_pred_class = np.argmax(y_pred, axis=1)
    y_test_class = np.argmax(y_test, axis=1)

    # Convert class indices back to original labels
    y_test_original = label_encoder.inverse_transform(y_test_class)
    y_pred_original = label_encoder.inverse_transform(y_pred_class)

    # Evaluate the performance on the test set with original labels
    print("\nFold Results:")
    print(classification_report(y_test_original, y_pred_original))

    # Print training, validation, and testing accuracy.
    # model.evaluate returns [loss, accuracy] for the metrics compiled above.
    train_metrics = model.evaluate(x_train, y_train, verbose=0)
    val_metrics = model.evaluate(x_val, y_val, verbose=0)
    test_metrics = model.evaluate(x_test, y_test, verbose=0)
    fold_test_accuracies.append(test_metrics[1])

    print(f"\nTraining Metrics: Accuracy: {train_metrics[1]}")
    print(f"Validation Metrics: Accuracy: {val_metrics[1]}")
    print(f"Testing Metrics: Accuracy: {test_metrics[1]}")

    print("\nTraining Confusion Matrix:")
    print(confusion_matrix(np.argmax(y_train, axis=1), np.argmax(model.predict(x_train), axis=1)))

    print("\nValidation Confusion Matrix:")
    print(confusion_matrix(np.argmax(y_val, axis=1), np.argmax(model.predict(x_val), axis=1)))

    print("\nTesting Confusion Matrix:")
    print(confusion_matrix(y_test_class, y_pred_class))

# Cross-validation summary: mean and spread of the per-fold test accuracy.
if fold_test_accuracies:
    print(f"\nMean test accuracy over {len(fold_test_accuracies)} folds: "
          f"{np.mean(fold_test_accuracies):.4f} (+/- {np.std(fold_test_accuracies):.4f})")


Shape of dataset  (6252, 2)
Index(['Type', 'normalized_requirements'], dtype='object')
No. of unique classes 12
Number of Unique Tokens 1554
Shape of Data Tensor: (6252, 1000)
Shape of Label Tensor: (6252, 12)

Fold 1
Total 4894 word vectors in Glove 6B 100d.
Hybrid LSTM-SVM Model
Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 1000)]            0         
                                                                 
 embedding (Embedding)       (None, 1000, 100)         155500    
                                                                 
 bidirectional (Bidirection  (None, 200)               160800    
 al)                                                             
                                                                 
 dense (Dense)               (None, 50)                10050     
                                           

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.77058 to 0.90967, saving model to model_rnn_fold_1.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.90967 to 0.94005, saving model to model_rnn_fold_1.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.94005 to 0.97362, saving model to model_rnn_fold_1.hdf5
Epoch 5/15
Epoch 5: val_accuracy did not improve from 0.97362
Epoch 6/15
Epoch 6: val_accuracy improved from 0.97362 to 0.98082, saving model to model_rnn_fold_1.hdf5
Epoch 7/15
Epoch 7: val_accuracy improved from 0.98082 to 0.98161, saving model to model_rnn_fold_1.hdf5
Epoch 8/15
Epoch 8: val_accuracy did not improve from 0.98161
Epoch 9/15
Epoch 9: val_accuracy improved from 0.98161 to 0.98241, saving model to model_rnn_fold_1.hdf5
Epoch 10/15
Epoch 10: val_accuracy did not improve from 0.98241
Epoch 11/15
Epoch 11: val_accuracy improved from 0.98241 to 0.98481, saving model to model_rnn_fold_1.hdf5
Epoch 12/15
Epoch 12: val_accuracy did not improve from 0.98481
Epoch 13/15
Epoch 13: 

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.83693 to 0.95124, saving model to model_rnn_fold_2.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.95124 to 0.96723, saving model to model_rnn_fold_2.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.96723 to 0.98161, saving model to model_rnn_fold_2.hdf5
Epoch 5/15
Epoch 5: val_accuracy improved from 0.98161 to 0.98241, saving model to model_rnn_fold_2.hdf5
Epoch 6/15
Epoch 6: val_accuracy improved from 0.98241 to 0.98961, saving model to model_rnn_fold_2.hdf5
Epoch 7/15
Epoch 7: val_accuracy did not improve from 0.98961
Epoch 8/15
Epoch 8: val_accuracy did not improve from 0.98961
Epoch 9/15
Epoch 9: val_accuracy did not improve from 0.98961

Fold Results:
              precision    recall  f1-score   support

           A       0.99      1.00      1.00       104
          FR       1.00      0.84      0.91       104
          FT       1.00      1.00      1.00       104
           L       1.00      1.00      1.00       104
          LF  

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.82414 to 0.94085, saving model to model_rnn_fold_3.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.94085 to 0.95923, saving model to model_rnn_fold_3.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.95923 to 0.98002, saving model to model_rnn_fold_3.hdf5
Epoch 5/15
Epoch 5: val_accuracy improved from 0.98002 to 0.98481, saving model to model_rnn_fold_3.hdf5
Epoch 6/15
Epoch 6: val_accuracy improved from 0.98481 to 0.99041, saving model to model_rnn_fold_3.hdf5
Epoch 7/15
Epoch 7: val_accuracy did not improve from 0.99041
Epoch 8/15
Epoch 8: val_accuracy did not improve from 0.99041
Epoch 9/15
Epoch 9: val_accuracy improved from 0.99041 to 0.99121, saving model to model_rnn_fold_3.hdf5
Epoch 10/15
Epoch 10: val_accuracy did not improve from 0.99121
Epoch 11/15
Epoch 11: val_accuracy improved from 0.99121 to 0.99440, saving model to model_rnn_fold_3.hdf5
Epoch 12/15
Epoch 12: val_accuracy did not improve from 0.99440
Epoch 13/15
Epoch 13: 

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.74021 to 0.94005, saving model to model_rnn_fold_4.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.94005 to 0.94884, saving model to model_rnn_fold_4.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.94884 to 0.97842, saving model to model_rnn_fold_4.hdf5
Epoch 5/15
Epoch 5: val_accuracy did not improve from 0.97842
Epoch 6/15
Epoch 6: val_accuracy improved from 0.97842 to 0.98241, saving model to model_rnn_fold_4.hdf5
Epoch 7/15
Epoch 7: val_accuracy improved from 0.98241 to 0.98481, saving model to model_rnn_fold_4.hdf5
Epoch 8/15
Epoch 8: val_accuracy improved from 0.98481 to 0.98561, saving model to model_rnn_fold_4.hdf5
Epoch 9/15
Epoch 9: val_accuracy improved from 0.98561 to 0.98801, saving model to model_rnn_fold_4.hdf5
Epoch 10/15
Epoch 10: val_accuracy improved from 0.98801 to 0.98961, saving model to model_rnn_fold_4.hdf5
Epoch 11/15
Epoch 11: val_accuracy improved from 0.98961 to 0.99281, saving model to model_rnn_fold_4.hdf5


  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.83613 to 0.92886, saving model to model_rnn_fold_5.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.92886 to 0.97522, saving model to model_rnn_fold_5.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.97522 to 0.98161, saving model to model_rnn_fold_5.hdf5
Epoch 5/15
Epoch 5: val_accuracy improved from 0.98161 to 0.98641, saving model to model_rnn_fold_5.hdf5
Epoch 6/15
Epoch 6: val_accuracy did not improve from 0.98641
Epoch 7/15
Epoch 7: val_accuracy did not improve from 0.98641
Epoch 8/15
Epoch 8: val_accuracy did not improve from 0.98641

Fold Results:
              precision    recall  f1-score   support

           A       1.00      1.00      1.00       104
          FR       0.97      0.87      0.91       104
          FT       1.00      1.00      1.00       104
           L       1.00      1.00      1.00       104
          LF       0.99      1.00      1.00       105
          MN       1.00      1.00      1.00       105
           

  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.81135 to 0.93685, saving model to model_rnn_fold_6.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.93685 to 0.95763, saving model to model_rnn_fold_6.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.95763 to 0.98241, saving model to model_rnn_fold_6.hdf5
Epoch 5/15
Epoch 5: val_accuracy did not improve from 0.98241
Epoch 6/15
Epoch 6: val_accuracy did not improve from 0.98241
Epoch 7/15
Epoch 7: val_accuracy improved from 0.98241 to 0.98481, saving model to model_rnn_fold_6.hdf5
Epoch 8/15
Epoch 8: val_accuracy did not improve from 0.98481
Epoch 9/15
Epoch 9: val_accuracy improved from 0.98481 to 0.98801, saving model to model_rnn_fold_6.hdf5
Epoch 10/15
Epoch 10: val_accuracy improved from 0.98801 to 0.99121, saving model to model_rnn_fold_6.hdf5
Epoch 11/15
Epoch 11: val_accuracy did not improve from 0.99121
Epoch 12/15
Epoch 12: val_accuracy did not improve from 0.99121
Epoch 13/15
Epoch 13: val_accuracy did not improve from 0.99121



  saving_api.save_model(


Epoch 2: val_accuracy improved from 0.84093 to 0.95204, saving model to model_rnn_fold_7.hdf5
Epoch 3/15
Epoch 3: val_accuracy improved from 0.95204 to 0.95683, saving model to model_rnn_fold_7.hdf5
Epoch 4/15
Epoch 4: val_accuracy improved from 0.95683 to 0.98082, saving model to model_rnn_fold_7.hdf5
Epoch 5/15