## Dataset Overview

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
import pandas as pd

In [3]:
df=pd.read_csv("/content/drive/MyDrive/MiniProject/overview-of-recordings.csv")

In [4]:
df_text = df[['phrase', 'prompt']]
df_text

Unnamed: 0,phrase,prompt
0,When I remember her I feel down,Emotional pain
1,When I carry heavy things I feel like breaking...,Hair falling out
2,there is too much pain when i move my arm,Heart hurts
3,My son had his lip pierced and it is swollen a...,Infected wound
4,My muscles in my lower back are aching,Infected wound
...,...,...
6656,I feel a burning sensation in my guts about 2 ...,Stomach ache
6657,I have a split on my thumb that will not heal.,Open wound
6658,I feel a lot of pain in the joints.,Joint pain
6659,The area around my heart doesn't feel good.,Heart hurts


In [5]:
# Check the distribution of phrases and prompts
print("Unique phrases:", df['phrase'].nunique())
print("Unique prompts:", df['prompt'].nunique())

Unique phrases: 706
Unique prompts: 25


## Pandas profiling

In [6]:
#!pip install https://github.com/pandas-profiling/pandas-profiling/archive/master.zip

In [7]:
from pandas_profiling import ProfileReport
from ydata_profiling import ProfileReport
profile=ProfileReport(df)
profile.to_file("pandas_profile.html")

  from pandas_profiling import ProfileReport


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## NLP EDA

In [None]:
!pip install nltk



In [None]:
import nltk
nltk.download('wordnet')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import string

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Function to preprocess text
def preprocess_text(text):
    # Convert text to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))

    # Tokenize text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    # Lemmatize tokens
    lemmatizer = WordNetLemmatizer()
    tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Join tokens back into a string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

In [None]:
# Apply preprocessing to 'phrase' and 'prompt' columns
df['clean_phrase'] = df['phrase'].apply(preprocess_text)
df['clean_prompt'] = df['prompt'].apply(preprocess_text)

## Applying LSTM on initial preprocessed data

>Import ANN Libraries

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

In [None]:
# Example dataset (replace it with your actual dataset)
phrases = df['phrase'].tolist()
prompts = df['prompt'].tolist()

# Tokenize text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(phrases)
X = tokenizer.texts_to_sequences(phrases)

# Padding sequences
max_length = max([len(seq) for seq in X])
print("Max_length=",max_length)
X_padded = pad_sequences(X, maxlen=max_length, padding='post')

# Convert labels to one-hot encoding
label_to_index = {label: i for i, label in enumerate(set(prompts))}
y = [label_to_index[label] for label in prompts]
y_one_hot = np.zeros((len(y), len(label_to_index)))
for i, label_index in enumerate(y):
    y_one_hot[i, label_index] = 1

Max_length= 30


>Training model using LSTM

In [None]:
from keras.callbacks import EarlyStopping

# Split dataset into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y_one_hot, test_size=0.2, random_state=42)

# Define model architecture
embedding_dim = 100
vocab_size = len(tokenizer.word_index) + 1

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_length))
model.add(LSTM(units=128))
model.add(Dense(units=len(label_to_index), activation='softmax'))

# Compile model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Define early stopping
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train model with early stopping
history = model.fit(X_train, y_train, epochs=25, batch_size=32, validation_split=0.2, callbacks=[early_stopping])

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Test Loss: {loss}, Test Accuracy: {accuracy}')


Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Test Loss: 0.12321972101926804, Test Accuracy: 0.9759939908981323
