In [1]:
import os
import glob
import pandas as pd

def break_into_paragraphs(filepath, max_length=3000):
    """Reads a text file, cleans it, and breaks it into paragraphs of approximately max_length characters."""
    with open(filepath, 'r', encoding='utf-8') as file:
        text = file.read()
    
    # Remove newline characters
    text = text.replace('\n', ' ')

    # Remove unnecessary backslashes
    text = text.replace('\\', '')

    # Remove excessive spaces by splitting and rejoining the text
    text = ' '.join(text.split())
        
    paragraphs = []
    current_paragraph = ""
    
    # Split the text into sentences using a simple split by period followed by space
    sentences = text.split('. ')
    for sentence in sentences:
        if len(current_paragraph) + len(sentence) < max_length:
            current_paragraph += sentence + '. '
        else:
            # If the current paragraph is full, add it to the list and start a new one
            paragraphs.append(current_paragraph)
            current_paragraph = sentence + '. '
    
    # Add the last paragraph if it contains any text
    if current_paragraph:
        paragraphs.append(current_paragraph)
    
    return paragraphs

# Specify the path to the directory containing the text files
directory_path = 'C:\CSCI2470_Final_Project_AutherDetective\Books'

# Create an empty DataFrame
book_paragraphs_df = pd.DataFrame(columns=['text', 'book_name'])

# Iterate over each text file in the directory
for filepath in glob.glob(os.path.join(directory_path, '*.txt')):
    book_name = os.path.basename(filepath).replace('.txt', '')
    print(f'Processing {filepath}')
    paragraphs = break_into_paragraphs(filepath)

    # Create a DataFrame from paragraphs
    temp_df = pd.DataFrame({'text': paragraphs, 'book_name': [book_name] * len(paragraphs)})
    book_paragraphs_df = pd.concat([book_paragraphs_df, temp_df], ignore_index=True)

# Display the DataFrame structure
print(book_paragraphs_df.head())


Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Anna_Kerinena.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Anne_of_Avonlea.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Anne_of_Green_Gables.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Around_the_World_in_Eighty_Days.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\At_the_Mountains_of_Madness.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\A_General_History_of_the_Pyrates.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\A_Little_Princess.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\A_Tale_of_Two_Cities.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Dracula.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Emma.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Frankenstein.txt
Processing C:\CSCI2470_Final_Project_AutherDetective\Books\Gone_with_the_Wind.txt
Processing C:\CS

In [2]:
book_paragraphs_df

Unnamed: 0,text,book_name
0,Happy families are all alike; every unhappy fa...,Anna_Kerinena
1,"""Ah, ah, ah! Oo!..."" he muttered, recalling ev...",Anna_Kerinena
2,But he felt all the difficulty of his position...,Anna_Kerinena
3,"Stepan Arkadyevitch's eyes asked: ""Why do you ...",Anna_Kerinena
4,"Beg her forgiveness, sir. There's no help for ...",Anna_Kerinena
...,...,...
13149,"He returned after midnight, and, instead of go...",Wuthering_Heights
13150,“Strange happiness! If you would hear me witho...,Wuthering_Heights
13151,"The following evening was very wet: indeed, it...",Wuthering_Heights
13152,"Hareton, with a streaming face, dug green sods...",Wuthering_Heights


In [3]:
book_paragraphs_df['book_name'].unique()

array(['Anna_Kerinena', 'Anne_of_Avonlea', 'Anne_of_Green_Gables',
       'Around_the_World_in_Eighty_Days', 'At_the_Mountains_of_Madness',
       'A_General_History_of_the_Pyrates', 'A_Little_Princess',
       'A_Tale_of_Two_Cities', 'Dracula', 'Emma', 'Frankenstein',
       'Gone_with_the_Wind', 'Great_Expectations',
       "Harry_Potter_and_the_Sorcerer's_Stone",
       'Hound_of_the_Baskervilles', 'Huckleberry_Finn', 'Jane_Eyre',
       'Kidnapped', 'Les_Misérables', 'Little_Women', 'lovecraft',
       'Moby_Dick', 'Pride_and_Prejudice', 'Sign_of_the_Four',
       'Study_in_Scarlet', 'Tender_is_the_Night', 'The_Black_Arrow',
       'The_Call_of_Cthulhu', 'The_Case_of_Charles_Dexter_Ward',
       'The_Chamber_Of_Secrets', 'The_Dunwich_Horror',
       'The_Fortunes_and_Misfortunes_of_the_Famous_Moll_Flanders',
       'The_Great_Gatsby', 'The_Haunter_of_the_Dark',
       'The_History_of_the_Devil', 'The_Last_Man',
       'The_Life_and_Adventures_of_Robinson_Crusoe',
       'The_Murder

In [4]:
books_authors = {
    "Anna_Kerinena": "Leo Tolstoy",
    "Anne_of_Avonlea": "Montgomery",
    "Anne_of_Green_Gables": "Montgomery",
    "Around_the_World_in_Eighty_Days": "Jules Verne",
    "At_the_Mountains_of_Madness": "H.P. Lovecraft",
    "A_General_History_of_the_Pyrates": "Daniel Defoe",
    "A_Little_Princess": "Frances Hodgson Burnett",
    "A_Tale_of_Two_Cities": "Dickens",
    "Don_Quixote": "Cervantes",
    "Dracula": "Stoker",
    "Emma": "Austen",
    "Frankenstein": "Shelley",
    "Gone_with_the_Wind": "Mitchell",
    "Great_Expectations": "Dickens",
    "Harry_Potter_and_the_Sorcerer's_Stone": "J.K. Rowling",
    "Hound_of_the_Baskervilles": "Arthur Conan Doyle",
    "Huckleberry_Finn": "Mark Twain",
    "Jane_Eyre": "Emily Bronte",
    "Kidnapped": "Robert Louis Stevenson",
    "Les_Misérables": "Victor Hugo",
    "Little_Women": "Louisa May Alcott",
    "lovecraft": "H.P. Lovecraft",
    "Moby_Dick": "Herman Melville",
    "Pride_and_Prejudice": "Austen",
    "Sign_of_the_Four": "Arthur Conan Doyle",
    "Study_in_Scarlet": "Arthur Conan Doyle",
    "Tender_is_the_Night": "Fitzgerald",
    "The_Black_Arrow": "Robert Louis Stevenson",
    "The_Call_of_Cthulhu": "H.P. Lovecraft",
    "The_Case_of_Charles_Dexter_Ward": "H.P. Lovecraft",
    "The_Chamber_Of_Secrets": "J.K. Rowling",
    "The_Dunwich_Horror": "H.P. Lovecraft",
    "The_Fortunes_and_Misfortunes_of_the_Famous_Moll_Flanders": "Daniel Defoe",
    "The_Great_Gatsby": "Fitzgerald",
    "The_Haunter_of_the_Dark": "H.P. Lovecraft",
    "The_Haunting_of_Hill_House": "Shirley Jackson",
    "The_History_of_the_Devil": "Daniel Defoe",
    "The_Last_Man": "Shelley",
    "The_Life_and_Adventures_of_Robinson_Crusoe": "Daniel Defoe",
    "The_Monkey's_Paw": "a",
    "The_Murder_on_the_Links": "Agatha Christie",
    "The_Mysterious_Affair_at_Styles": "Agatha Christie",
    "The_Mystery_of_the_Yellow_Room": "Gaston Leroux",
    "The_Phantom_of_the_Opera": "Gaston Leroux",
    "The_Secret_Garden": "Frances Hodgson Burnett",
    "The_Secret_of_the_Night": "Gaston Leroux",
    "The_shadow_over_Innsmouth": "H.P. Lovecraft",
    "The_Strange_Case_of_Dr._Jekyll_and_Mr._Hyde": "Robert Louis Stevenson",
    "The_Thing_on_the_Door-Step": "H.P. Lovecraft",
    "The_Trap": "b",
    "This_Side_of_Paradise": "Fitzgerald",
    "Tom_Sawyer": "Mark Twain",
    "Treasure_Island": "Robert Louis Stevenson",
    "Twenty_Thousand_Leagues_under_the_Sea": "Jules Verne",
    "War_and_Peace": "Leo Tolstoy",
    "We_Have_Always_Lived_In_The_Castle": "Shirley Jackson",
    "Woman_in_White": "Wilkie Collins",
    "Wuthering_Heights": "Emily Bronte"
}

In [5]:
book_paragraphs_df

Unnamed: 0,text,book_name
0,Happy families are all alike; every unhappy fa...,Anna_Kerinena
1,"""Ah, ah, ah! Oo!..."" he muttered, recalling ev...",Anna_Kerinena
2,But he felt all the difficulty of his position...,Anna_Kerinena
3,"Stepan Arkadyevitch's eyes asked: ""Why do you ...",Anna_Kerinena
4,"Beg her forgiveness, sir. There's no help for ...",Anna_Kerinena
...,...,...
13149,"He returned after midnight, and, instead of go...",Wuthering_Heights
13150,“Strange happiness! If you would hear me witho...,Wuthering_Heights
13151,"The following evening was very wet: indeed, it...",Wuthering_Heights
13152,"Hareton, with a streaming face, dug green sods...",Wuthering_Heights


In [6]:
book_paragraphs_df['author'] = book_paragraphs_df['book_name'].map(books_authors)
book_paragraphs_df

Unnamed: 0,text,book_name,author
0,Happy families are all alike; every unhappy fa...,Anna_Kerinena,Leo Tolstoy
1,"""Ah, ah, ah! Oo!..."" he muttered, recalling ev...",Anna_Kerinena,Leo Tolstoy
2,But he felt all the difficulty of his position...,Anna_Kerinena,Leo Tolstoy
3,"Stepan Arkadyevitch's eyes asked: ""Why do you ...",Anna_Kerinena,Leo Tolstoy
4,"Beg her forgiveness, sir. There's no help for ...",Anna_Kerinena,Leo Tolstoy
...,...,...,...
13149,"He returned after midnight, and, instead of go...",Wuthering_Heights,Emily Bronte
13150,“Strange happiness! If you would hear me witho...,Wuthering_Heights,Emily Bronte
13151,"The following evening was very wet: indeed, it...",Wuthering_Heights,Emily Bronte
13152,"Hareton, with a streaming face, dug green sods...",Wuthering_Heights,Emily Bronte


In [7]:
book_paragraphs_df['author'].unique()

array(['Leo Tolstoy', 'Montgomery', 'Jules Verne', 'H.P. Lovecraft',
       'Daniel Defoe', 'Frances Hodgson Burnett', 'Dickens', 'Stoker',
       'Austen', 'Shelley', 'Mitchell', 'J.K. Rowling',
       'Arthur Conan Doyle', 'Mark Twain', 'Emily Bronte',
       'Robert Louis Stevenson', 'Victor Hugo', 'Louisa May Alcott',
       'Herman Melville', 'Fitzgerald', 'Agatha Christie',
       'Gaston Leroux', 'Wilkie Collins'], dtype=object)

In [8]:
def sample_or_all(group):
    n = min(len(group), 100)  # Choose 100 or the group size if less than 100
    return group.sample(n, replace=False)  # Randomly sample without replacement

# Apply the function to each group
df_subset = book_paragraphs_df.groupby('author').apply(sample_or_all).reset_index(drop=True)

df_subset

Unnamed: 0,text,book_name,author
0,“Did you know all the time that it was—the oth...,The_Murder_on_the_Links,Agatha Christie
1,Read for yourself.” The letter was written on ...,The_Murder_on_the_Links,Agatha Christie
2,Monsieur Beroldy was a junior partner in a fir...,The_Murder_on_the_Links,Agatha Christie
3,"I rather fancy—” “Yes?” “I may be mistaken, bu...",The_Murder_on_the_Links,Agatha Christie
4,“(3) Marthe Daubreuil was the daughter of the ...,The_Murder_on_the_Links,Agatha Christie
...,...,...,...
2295,"Hartright?"" asked Miss Halcombe, with her eyes...",Woman_in_White,Wilkie Collins
2296,"There was serious weight in this objection, an...",Woman_in_White,Wilkie Collins
2297,I saw that my best chance of winning her confi...,Woman_in_White,Wilkie Collins
2298,"""What do you see there to laugh at?"" I asked, ...",Woman_in_White,Wilkie Collins


In [9]:
from sklearn.model_selection import train_test_split

In [10]:
train_df, test_df = train_test_split(df_subset, test_size=0.3, stratify=df_subset['author'], random_state=42)

In [11]:
# Saving the train_df and test_df to CSV files with corrected paths
train_df.to_csv(r'C:\Deep-Learning-based-Authorship-Identification\data\train_data.csv', index=False)
test_df.to_csv(r'C:\Deep-Learning-based-Authorship-Identification\data\test_data.csv', index=False)


In [12]:
df_subset.where

<bound method DataFrame.where of                                                    text  \
0     “Did you know all the time that it was—the oth...   
1     Read for yourself.” The letter was written on ...   
2     Monsieur Beroldy was a junior partner in a fir...   
3     I rather fancy—” “Yes?” “I may be mistaken, bu...   
4     “(3) Marthe Daubreuil was the daughter of the ...   
...                                                 ...   
2295  Hartright?" asked Miss Halcombe, with her eyes...   
2296  There was serious weight in this objection, an...   
2297  I saw that my best chance of winning her confi...   
2298  "What do you see there to laugh at?" I asked, ...   
2299  This done, I reminded my audience of the date ...   

                    book_name           author  
0     The_Murder_on_the_Links  Agatha Christie  
1     The_Murder_on_the_Links  Agatha Christie  
2     The_Murder_on_the_Links  Agatha Christie  
3     The_Murder_on_the_Links  Agatha Christie  
4     The_Mur

In [13]:
df_subset[df_subset['author'] == 'Shirley Jackson']

Unnamed: 0,text,book_name,author
