# NECESSARY IMPORTS

In [None]:
import json
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from gensim.models import Word2Vec
from keras.layers import Dense
from keras.models import Sequential, load_model
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    confusion_matrix,
    roc_curve,
    auc,
)
from sklearn.model_selection import train_test_split

# NOTE(review): the data-collection cell also uses `extract_pages` and
# `LTTextContainer`, which are not imported anywhere in this file. They are
# presumably pdfminer.six names (pdfminer.high_level / pdfminer.layout) --
# confirm and import them before running that cell.

# DATA COLLECTION

In [None]:
def clean_text(text):
    """Return *text* with backslashes and newlines replaced by spaces.

    Each backslash and each newline character becomes a single space, after
    which leading and trailing whitespace is removed.
    """
    flattened = text.translate(str.maketrans({'\\': ' ', '\n': ' '}))
    return flattened.strip()

def extract_text_by_chapter(pdf_path):
    """Group the sentences of a PDF under the heading that precedes them.

    Every text container yielded by ``extract_pages`` for *pdf_path* is
    cleaned with ``clean_text``. A container whose cleaned text is entirely
    upper case and has more than one word is treated as a chapter heading;
    any other container is split into sentences, which are appended to the
    most recently seen heading. Text that appears before the first heading
    is discarded, and containers that are empty after cleaning are skipped.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``extract_pages``.

    Returns:
        A dict mapping each heading to the list of sentences collected
        under it, in document order.
    """
    # Split after '.', '!' or '?' when followed by one or more spaces.
    sentence_boundary = re.compile(r'(?<=[.!?]) +')
    chapters = {}
    current_chapter = ""

    for page_layout in extract_pages(pdf_path):
        for element in page_layout:
            if not isinstance(element, LTTextContainer):
                continue

            cleaned_text = clean_text(element.get_text())
            if not cleaned_text:
                # Nothing left after cleaning; avoid storing "" as a sentence.
                continue

            if cleaned_text.isupper() and len(cleaned_text.split()) > 1:
                current_chapter = cleaned_text
                # setdefault (not "= []"): if the same upper-case line shows
                # up again, keep the sentences already collected under it
                # instead of silently throwing them away.
                chapters.setdefault(current_chapter, [])
            elif current_chapter:
                chapters[current_chapter].extend(
                    sentence_boundary.split(cleaned_text)
                )

    return chapters
# Extract the book chapter by chapter and store the raw result as JSON so it
# can be inspected and cleaned before it is used for training.
pdf_path = r"Sigmund-Freud-The-Complete-Works.pdf"
chapters = extract_text_by_chapter(pdf_path)

# Single top-level key "Sentences" holding the heading -> sentences mapping.
structured_data = dict(Sentences=chapters)
json_data = json.dumps(structured_data, indent=4)

with open('Uncleaned_text.json', mode='w') as file:
    file.write(json_data)

# DATA CLEANING

In [None]:
# Load the cleaned JSON, which is iterated as book -> chapter -> sentences.
with open("Cleaned_data.json") as file:
    train_data = json.load(file)

dict_data = {
    "sentence": [],
    "class": []
}

# Flatten every chapter of every book into one list of sentences.
for chapters_in_book in train_data.values():
    for chapter_sentences in chapters_in_book.values():
        dict_data["sentence"].extend(chapter_sentences)

# Every sentence coming from this corpus is labelled as class 1.
dict_data["class"] = [1] * len(dict_data["sentence"])

train_df = pd.DataFrame(dict_data)
print(train_df.head())
# The dataframe index is written too, so it becomes the first CSV column.
train_df.to_csv("dataset.csv")

def drop_rows_with_integer_in_first_column(csv_file):
    """Read *csv_file* and drop the rows whose tested cell parses as an int.

    The column that is tested is the one at position 1 (``df.iloc[:, 1]``),
    i.e. the second column of the CSV as read by ``pd.read_csv``. A row is
    removed when ``int(cell)`` succeeds; it is kept when ``int(cell)`` raises
    ``ValueError``.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the CSV file.

    Returns:
        The filtered dataframe; the original row index labels are kept.
    """
    def _parses_as_int(cell):
        try:
            int(cell)
        except ValueError:
            return False
        return True

    frame = pd.read_csv(csv_file)
    keep_mask = ~frame.iloc[:, 1].apply(_parses_as_int)
    return frame[keep_mask]

# Class-1 sentences, minus the rows whose sentence cell is just an integer.
filtered_df = drop_rows_with_integer_in_first_column("dataset.csv")

# Class-0 examples: keep only the tweet text, label it 0 and rename the text
# column so it lines up with the "sentence" column of filtered_df.
no_df = (
    pd.read_csv("tweet_emotions.csv")
    .assign(**{"class": 0})
    .loc[:, ["content", "class"]]
    .rename(columns={'content': 'sentence'})
)

# Stack both classes and renumber the rows from 0.
final_df = pd.concat([filtered_df, no_df], ignore_index=True)

final_df.to_csv("final_dataset.csv")

# READING DATA

In [None]:
# Fraction of the dataset rows kept when sub-sampling (used as `frac` in df.sample).
SIZE_OF_DATA = 0.33
# Number of training epochs passed to model.fit.
EPOCHS = 10
# Mini-batch size passed to model.fit.
BATCH_SIZE = 32
# Predicted scores strictly greater than this value are labelled as class 1.
CLASSIFICATION_THRESHOLD = 0.5

In [None]:
# Load the merged dataset. The cleaning step writes it as "final_dataset.csv"
# (lower-case "f"); use exactly that name so the read also works on
# case-sensitive filesystems.
df = pd.read_csv(r"final_dataset.csv")
df

In [None]:

# Keep a random SIZE_OF_DATA fraction of the rows and renumber them from 0.
# The sample is seeded (same seed as the train/test split) so that re-running
# the notebook selects the same rows, which keeps embeddings cached on disk
# aligned with the rows they were computed from.
df = df.sample(frac=SIZE_OF_DATA, random_state=42).reset_index(drop=True)
len(df)


# Computing Raw Sentence Embeddings

## SBERT with all-MiniLM-L6-v2

In [None]:
sbert_encoder = SentenceTransformer('all-MiniLM-L6-v2')

# encode() is given a plain list of strings rather than a pandas Series.
# Missing cells (NaN floats after read_csv) are turned into empty strings so
# that every row still gets exactly one embedding.
sbert_sentences = df["sentence"].fillna("").astype(str).tolist()
sbert_embeddings = sbert_encoder.encode(sbert_sentences)

# Cache the embeddings on disk so they do not have to be recomputed.
np.save("sbert_Sentence_embeddings.npy", sbert_embeddings)

# Last expression of the cell so the notebook actually displays the shape.
sbert_embeddings.shape

In [None]:
# Sanity check: one embedding per dataframe row. The comparison is made
# against `df` because the `labels` array is only created in a later cell and
# would raise NameError when the notebook is run from top to bottom.
assert len(sbert_embeddings) == len(df), "embedding count does not match row count"

## Word2Vec

In [None]:
# Width of the Word2Vec vectors. 384 matches the width the zero-vector
# fallback used before and the `input_dim` of the classifier, so averaged
# vectors and fallback vectors always have the same length.
W2V_DIM = 384

# Whitespace-tokenise every sentence; non-string cells (e.g. NaN) yield an
# empty token list instead of raising.
df['tokenized_sentences'] = df['sentence'].apply(
    lambda x: x.split() if isinstance(x, str) else []
)

data = df['tokenized_sentences'].tolist()
w2v_model = Word2Vec(data, vector_size=W2V_DIM, window=5, min_count=1, workers=4)
# Backward-compatible alias. `model` is rebound to the Keras classifier later
# in the notebook, so sentence_embedding() deliberately uses `w2v_model`.
model = w2v_model


def sentence_embedding(sentence):
    """Return the mean Word2Vec vector of the words in *sentence*.

    Args:
        sentence: Text to embed. It is split on whitespace; words unknown to
            the Word2Vec model are ignored.

    Returns:
        A 1-D array of length ``w2v_model.vector_size``. A zero vector is
        returned when *sentence* is not a string (e.g. NaN) or when none of
        its words is in the model's vocabulary.
    """
    if not isinstance(sentence, str):
        return np.zeros(w2v_model.vector_size)

    word_embeddings = [
        w2v_model.wv[word] for word in sentence.split() if word in w2v_model.wv
    ]
    if not word_embeddings:
        return np.zeros(w2v_model.vector_size)

    # Averaging the word vectors to create a sentence vector
    return np.mean(word_embeddings, axis=0)

# Alignment Classification with Neural Networks

In [None]:
# Target vector taken from the "class" column.
labels = np.array(list(df["class"]))

# Change the first argument to test different embeddings
X_train, X_test, y_train, y_test = train_test_split(
    sbert_embeddings, labels, test_size=0.2, random_state=42
)

# Rebuild both feature arrays from nested Python lists.
X_train, X_test = (np.array(part.tolist()) for part in (X_train, X_test))

# Now, print the shapes and types again to confirm they are correctly formatted
print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print(*map(type, (X_train, X_test, y_train, y_test)), sep="\n")
print(X_train.shape, X_test.shape, len(y_train), len(y_test), sep="\n")

y_train, y_test = np.array(y_train), np.array(y_test)

In [None]:
input_dim = 384  # Size of embeddings

# One hidden ReLU layer of 128 units followed by a single sigmoid output unit.
model = Sequential([
    Dense(128, input_dim=input_dim, activation='relu'),
    Dense(1, activation='sigmoid'),
])

In [None]:
# Show the layer-by-layer overview of the classifier.
model.summary()

In [None]:
# Binary cross-entropy loss with the Adam optimizer; accuracy is tracked as a metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train on the training split only (no validation data is passed), using the
# EPOCHS and BATCH_SIZE constants.
model.fit(X_train, y_train, epochs=EPOCHS, batch_size=BATCH_SIZE)

# Evaluation & Metrics

In [None]:
# Raw model scores for the held-out split, then hard 0/1 labels: a score
# strictly above CLASSIFICATION_THRESHOLD counts as class 1.
y_pred = model.predict(X_test)
y_pred_bin = (y_pred > CLASSIFICATION_THRESHOLD).astype(int)

# Precision, recall and F1 are macro-averaged over the classes.
accuracy = accuracy_score(y_test, y_pred_bin)
precision = precision_score(y_test, y_pred_bin, average='macro')
recall = recall_score(y_test, y_pred_bin, average='macro')
f1 = f1_score(y_test, y_pred_bin, average='macro')
conf_matrix = confusion_matrix(y_test, y_pred_bin)

print(f'Accuracy: {accuracy:.3f}')
print(f'Precision: {precision:.3f}')
print(f'Recall: {recall:.3f}')
print(f'F1 Score: {f1:.3f}')
print('Confusion Matrix:')
# The header used to be printed without the matrix itself.
print(conf_matrix)

  

In [None]:

def draw_conf(conf_matrix):
    """Show *conf_matrix* as an annotated heatmap.

    Args:
        conf_matrix: 2x2 matrix of integer counts; rows are labelled as the
            actual class and columns as the predicted class.
    """
    predicted_names = ['Predicted Negative', 'Predicted Positive']
    actual_names = ['Actual Negative', 'Actual Positive']

    _, axes = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="magma",
        cbar=False,
        xticklabels=predicted_names,
        yticklabels=actual_names,
        ax=axes,
    )
    axes.set_xlabel('Predicted Label')
    axes.set_ylabel('True Label')
    axes.set_title('Confusion Matrix')
    plt.show()


draw_conf(conf_matrix)

In [None]:

def draw_roc(y_test, y_pred):
    """Plot the ROC curve for *y_pred* against *y_test* and show it.

    Args:
        y_test: True binary labels.
        y_pred: Predicted scores for the positive class (not thresholded).
    """
    fpr, tpr, _ = roc_curve(y_test, y_pred)
    roc_auc = auc(fpr, tpr)

    plt.figure()
    plt.plot(fpr, tpr, color='darkorange', lw=2, label='ROC curve (area = %0.2f)' % roc_auc)
    # Diagonal reference line from (0, 0) to (1, 1).
    plt.plot([0, 1], [0, 1], color='purple', lw=2, linestyle='--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()


# The call belongs at top level: indented inside the function it was never
# executed by the cell and would have recursed without end once called.
draw_roc(y_test, y_pred)