# Handle 'hobby' data

### First of all, have a look at the data

In [1]:
import re
import os

import pandas as pd


# Sample log file that the cells below parse
PATH = "test.txt"
# Full yearly logs; uncomment one of these lines to inspect that file instead
# PATH = "C:/Users/San/Documents/inf/time monitoring/book, movies, and other stuff I completed by year/2023 - What I read, watched and played.txt"
# PATH = "C:/Users/San/Documents/inf/time monitoring/book, movies, and other stuff I completed by year/2022 - What I read, watched and played.txt"
# PATH = "C:/Users/San/Documents/inf/time monitoring/book, movies, and other stuff I completed by year/2021 - What I read, watched and played.txt"
# Load the whole file as one UTF-8 string and show it
with open(PATH, mode="r", encoding="utf-8") as text_file:
    content = text_file.read()
print(content)

Books:
1. "The Republic of Thieves: Gentleman Bastards" by Scott Lynch | audiobook on January 5, 2023
2. "Hyperion" by Dan Simmons | audiobook on January 13, 2023
3. "Propaganda" by Edward Bernays | audiobook on Jan 17, 2023
4. "The Fall of Hyperion" by Dan Simmons | audiobook on Jan 23rd, 2023
5. "Джури-характерники" by Володимир Рутківський on January 23, 2023
6. "Zen in the Art of Writing" by Ray Bradbury | audiobook on January 25, 2023
7. "The Final Empire: Mistborn, Book 1" by Brandon Sanderson | audiobook on February 2, 2023
8. "The Well of Ascension: Mistborn, Book 2" by Brandon Sanderson | audiobook on February 4, 2023
9. "The Hero of Ages: Mistborn: Book 3" by Brandon Sanderson | audiobook on Feb 6, 2023
10. "Factotum" by Charles Bukowski | audiobook on February 9, 2023
11. "Ham on Rye" by Charles Bukowski | audiobook on February 12, 2023
12. "Warbreaker" by Brandon Sanderson on February 13, 2023
13. "Hollywood" by Charles Bukowski | audiobook on February 16, 2023
14. "Pulp" b

### Next, use regular expressions to extract movies and games data

Get movie df

In [2]:
# Locate the 'Movies:' section: everything after the header up to the first
# blank line (or up to the end of the text when no blank line follows)
movies_match = re.search(r"Movies:(.*?)(?=\n\n|$)", content, re.DOTALL)
if movies_match is None:
    # re.search returns None when the header is missing; fail with a clear
    # message instead of an AttributeError on '.group'
    raise ValueError("No 'Movies:' section found in the content")
movies_section = movies_match.group(1).strip()
# One entry per line: '<number>. <name> on <Month> <day>[st|nd|rd|th], <year>'
# The greedy name group keeps titles that themselves contain ' on ' intact
movies_pattern = re.compile(
    r"\d+\. ([^\n]+) on ([A-Za-z]+) (\d{1,2})(?:st|nd|rd|th)?, (\d{4})"
)
# Each match is a (name, month, day, year) tuple of strings
movies_data = movies_pattern.findall(movies_section)
cols = ["Name", "Month", "Day", "Year"]
# Build the DataFrame in one expression so that re-running the cell always
# yields the same result
movies_df = (
    pd.DataFrame(movies_data, columns=cols)
    # Shorten the month to the first three letters
    .assign(Month=lambda df: df["Month"].str[:3])
    # The regex captures text; store day and year as integers so they sort
    # and compare as numbers
    .astype({"Day": int, "Year": int})
)
movies_df

Unnamed: 0,Name,Month,Day,Year
0,Romancing the Stone,Jan,6,2023
1,Star Trek: The Wrath of Khan,Jan,15,2023
2,Greyhound,Jan,16,2023
3,The Thing,Jan,19,2023
4,"Lock, Stock and Two Smoking Barrels",Jan,6,2022
5,Revolver,Jan,6,2022
6,Wrath of Man,Jan,8,2022


Get games df

In [3]:
# Locate the 'Games:' section: everything after the header up to the end of
# the text
games_match = re.search(r"Games:(.*?)$", content, re.DOTALL)
if games_match is None:
    # re.search returns None when the header is missing; fail with a clear
    # message instead of an AttributeError on '.group'
    raise ValueError("No 'Games:' section found in the content")
games_section = games_match.group(1).strip()
# One entry per line:
# '<number>. <name> on <Month> <day>[st|nd|rd|th], <year> (<N> hours)'
# 'hours?' also accepts the singular '(1 hour)', which would otherwise be
# dropped without any warning
games_pattern = re.compile(
    r"\d+\. ([^\n]+) on ([A-Za-z]+) (\d{1,2})(?:st|nd|rd|th)?, (\d{4}) \((\d+) hours?\)"
)
# Each match is a (name, month, day, year, hours) tuple of strings
games_data = games_pattern.findall(games_section)
# 'PTH' stands for 'Play Time in Hours'
cols = ["Name", "Month", "Day", "Year", "PTH"]
# Build the DataFrame in one expression so that re-running the cell always
# yields the same result
games_df = (
    pd.DataFrame(games_data, columns=cols)
    # Shorten the month to the first three letters
    .assign(Month=lambda df: df["Month"].str[:3])
    # The regex captures text; store the numeric fields as integers so that
    # play time can be summed and dates sort as numbers
    .astype({"Day": int, "Year": int, "PTH": int})
)
games_df

Unnamed: 0,Name,Month,Day,Year,PTH
0,Marvel's Guardians of the Galaxy,Jan,15,2023,25
1,Titanfall 2,Feb,19,2023,6
2,Xcom2 WotC,Feb,28,2022,50
3,Ghostrunner,Mar,15,2022,11
4,Star Wars Jedi Fallen Order,Apr,6,2022,22
5,Kingdom New Lands,Dec,29,2021,14


### Having gained some experience with regular expressions, let's write a function that extracts the more complex book data

In [4]:
def _parse_book_line(line):
    """
    Summary:
        Parses one numbered book entry into a dict of book fields.
    Args:
        line (str): A single entry such as
            '3. "Propaganda" by Edward Bernays | audiobook on Jan 17, 2023'.
    Returns:
        dict: The keys 'Name', 'Author', 'Audiobook', 'Month', 'Day', 'Year'.
    Raises:
        ValueError: If the entry has no quoted title, no '" by ' separator,
            no ' on ' date separator, or a date that cannot be parsed.
    """
    audiobook_marker = "| audiobook"
    # Drop the list numbering: keep only what follows the opening double quote
    _, opening_quote, entry = line.partition('"')
    if not opening_quote:
        raise ValueError(f"Book entry has no quoted title: {line!r}")
    is_audiobook = audiobook_marker in entry
    if is_audiobook:
        # '<name>" by <author> | audiobook on <date>'
        name_author, _, date = entry.partition(audiobook_marker)
    else:
        # '<name>" by <author> on <date>'
        # Split on the LAST ' on ' so that titles containing ' on '
        # (e.g., 'Essays on the Theory of Numbers') stay intact
        name_author, date_separator, date = entry.rpartition(" on ")
        if not date_separator:
            raise ValueError(f"Book entry has no ' on <date>' part: {line!r}")
    # The title ends at the closing quote that is followed by ' by '
    name, author_separator, author = name_author.partition('" by ')
    if not author_separator:
        raise ValueError(f"Book entry has no '\" by <author>' part: {line!r}")
    # Strip the leading whitespace first: re.match only matches at the very
    # start of the string. Audiobook dates still carry a leading 'on '.
    match = re.match(
        r"(?:on )?([A-Za-z]+) (\d{1,2})(?:st|nd|rd|th)?, (\d{4})", date.strip()
    )
    if match is None:
        raise ValueError(f"Cannot parse the completion date in: {line!r}")
    return {
        "Name": name.strip(),
        "Author": author.strip(),
        "Audiobook": is_audiobook,
        # Abbreviate the month to its first three letters
        "Month": match.group(1)[:3],
        "Day": int(match.group(2)),
        "Year": int(match.group(3)),
    }


def get_books_df(content):
    """
    Summary:
        Extracts book information from a provided text content and creates a DataFrame.
    Args:
        content (str): The text content containing book information.
    Returns:
        pandas.DataFrame: A DataFrame containing book details with columns:
            'Name', 'Author', 'Audiobook', 'Month', 'Day', and 'Year'.
            It is empty (but has these columns) when the section lists no books.
    Raises:
        ValueError: If the content has no 'Books:' section or one of its
            entries does not follow the expected format.
    What function does in more detail:
        The function finds the 'Books:' section, which ends at the 'Movies:'
        or 'Games:' header or, when neither follows, at the end of the
        content. Every non-blank line of the section is parsed into the book
        title, author, audiobook status, and completion date.

        Columns of the df and what they mean:
        - 'Name': The title of the book
        - 'Author': The author's name
        - 'Audiobook': A boolean indicating whether it's an audiobook (True) or not (False)
        - 'Month': The completion month (abbreviated to the first three letters)
        - 'Day': The completion day
        - 'Year': The completion year
        The last three cols ('Month', 'Day', and 'Year') show when you finished reading the book
    """
    # Find the 'Books:' section using a regular expression; '\Z' lets the
    # section run to the end of the content when it is the last one
    section_match = re.search(
        r"Books:(.*?)(?=Movies:|Games:|\Z)", content, re.DOTALL
    )
    if section_match is None:
        raise ValueError("No 'Books:' section found in the content")
    books_section = section_match.group(1).strip()
    # Parse every entry, skipping blank lines inside the section
    books_data = [
        _parse_book_line(line)
        for line in books_section.splitlines()
        if line.strip()
    ]
    # Create a pandas DataFrame for books and return it
    return pd.DataFrame(
        books_data, columns=["Name", "Author", "Audiobook", "Month", "Day", "Year"]
    )


# Parse the books from the text that was read into 'content' in the first cell
books_df = get_books_df(content)
# Preview the first 10 rows
books_df.head(10)

Unnamed: 0,Name,Author,Audiobook,Month,Day,Year
0,The Republic of Thieves: Gentleman Bastards,Scott Lynch,True,Jan,5,2023
1,Hyperion,Dan Simmons,True,Jan,13,2023
2,Propaganda,Edward Bernays,True,Jan,17,2023
3,The Fall of Hyperion,Dan Simmons,True,Jan,23,2023
4,Джури-характерники,Володимир Рутківський,False,Jan,23,2023
5,Zen in the Art of Writing,Ray Bradbury,True,Jan,25,2023
6,"The Final Empire: Mistborn, Book 1",Brandon Sanderson,True,Feb,2,2023
7,"The Well of Ascension: Mistborn, Book 2",Brandon Sanderson,True,Feb,4,2023
8,The Hero of Ages: Mistborn: Book 3,Brandon Sanderson,True,Feb,6,2023
9,Factotum,Charles Bukowski,True,Feb,9,2023


### Write a function that goes through every file in the folder and puts book data into one df

In [5]:
def process_files_in_folder(folder_path):
    """
    Summary:
        Processes text files in a folder, extracts book information,
        and creates a combined DataFrame.
    Args:
        folder_path (str):
            The path to the folder containing text files
            (of the same structure!) with book information.
    Returns:
        pandas.DataFrame:
            A combined DataFrame containing book details
            from all files with columns:
                'Name', 'Author', 'Audiobook', 'Month', 'Day', and 'Year'.
            Files are processed in sorted filename order. The DataFrame is
            empty (but has these columns) when the folder has no '.txt' files.
    """
    # os.listdir returns the entries in arbitrary order, so sort the names to
    # make the row order of the result reproducible
    txt_filenames = sorted(
        filename
        for filename in os.listdir(folder_path)
        if filename.endswith(".txt")
    )
    # Collect one DataFrame per file
    data_frames = []
    for filename in txt_filenames:
        file_path = os.path.join(folder_path, filename)
        # Read the content of the file
        with open(file_path, "r", encoding="utf-8") as file:
            content = file.read()
        # Apply the get_books_df function to the file's content
        data_frames.append(get_books_df(content))
    # pd.concat raises ValueError on an empty list, so return an empty
    # DataFrame with the expected columns instead
    if not data_frames:
        return pd.DataFrame(
            columns=["Name", "Author", "Audiobook", "Month", "Day", "Year"]
        )
    # Concatenate all DataFrames into a single DataFrame and return it
    return pd.concat(data_frames, ignore_index=True)


# Folder holding the '.txt' log files; this is an absolute local path, so it
# has to be adjusted before the notebook can run on another machine
FOLDER_PATH = "C:/Users/San/Documents/inf/time monitoring/book, movies, and other stuff I completed by year/"
# Build one DataFrame with the books from every '.txt' file in that folder
all_books_df = process_files_in_folder(FOLDER_PATH)

In [6]:
# Display the combined DataFrame built from the files in FOLDER_PATH
all_books_df

Unnamed: 0,Name,Author,Audiobook,Month,Day,Year
0,The Prince,Nicolo Machiavelli,False,Jan,4,2021
1,Descriptions of the methods adopted by the...,Nicolo Machiavelli,False,Jan,4,2021
2,The life of Castruccio Castracani of Lucca,Nicolo Machiavelli,False,Jan,4,2021
3,Alice's Adventures in Wonderland,Lewis Carroll,False,Jan,10,2021
4,Statistics Done Wrong,Alex Reinhart,False,Jan,13,2021
...,...,...,...,...,...,...
300,The Blue Carbuncle,Arthur Conan Doyle,True,Dec,4,2023
301,Огненне коло. Повість про трагедію під Бродами,Іван Багряний,False,Dec,7,2023
302,A Christmas Carol,Charles Dickens,True,Dec,18,2023
303,Лісова Пісня,Леся Українка,False,Dec,19,2023


### Save the data into a .csv file that you'll later use in Books Dashboard

In [7]:
# The export is currently disabled; uncomment the line below to write the CSV.
# NOTE(review): to_csv also writes the row index as an extra unnamed column
# unless index=False is passed - confirm which layout the dashboard expects.
# all_books_df.to_csv("books_data.csv")

### Explore the data a little

In [8]:
# Count the completed books per author, most frequent author first
all_books_df["Author"].value_counts()

Joe Abercrombie                           33
Robert E. Howard                          18
Seneca                                    11
Roald Dahl                                10
J.R.R. Tolkien                             9
                                          ..
Kahlil Gibran                              1
Andriy Burkov                              1
C. S. Lewis | The Chronicles of Narnia     1
Howard Pyle                                1
Леся Українка                              1
Name: Author, Length: 137, dtype: int64

In [9]:
# Compare how many entries are audiobooks (True) and how many are not (False)
all_books_df["Audiobook"].value_counts()

True     251
False     54
Name: Audiobook, dtype: int64