In [1]:
import pandas
import re

In [2]:
# Load genius data 
genius_data = pandas.read_csv("./genius_2956.csv").dropna().astype("string")

genius_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2956 entries, 0 to 2955
Data columns (total 1 columns):
 #   Column                  Non-Null Count  Dtype 
---  ------                  --------------  ----- 
 0   genius_smart_collector  2956 non-null   string
dtypes: string(1)
memory usage: 23.2 KB


In [3]:
# Load short stories data
short_stories_data = pandas.read_csv("./scraper_600_600_25.csv").astype("string")

short_stories_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6871 entries, 0 to 6870
Data columns (total 3 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   dutch_short_stories  556 non-null    string
 1   1001_stories         805 non-null    string
 2   1001_gedichten       6871 non-null   string
dtypes: string(3)
memory usage: 161.2 KB


In [4]:
# Split into dutch_short_stories, 1001_stories and 1001_poems datasets
dutch_short_stories = short_stories_data["dutch_short_stories"]
stories_1001 = short_stories_data["1001_stories"]
poems_1001 = short_stories_data["1001_gedichten"]

In [5]:
def capitalize_after_empty_or_newline(text):
    lines = re.split(r'(\n)', text)
    capitalized_lines = [line.capitalize() if line.strip() else line for line in lines]
    
    return ''.join(capitalized_lines)


In [6]:
# Capitalize first letter of each line in genius data
genius_data["genius_smart_collector"] = genius_data["genius_smart_collector"].apply(capitalize_after_empty_or_newline)

genius_data.head()

Unnamed: 0,genius_smart_collector
0,Kleine jongen\nJe bent op deze wereld\nDus zal...
1,Ze lag te slapen\nK vroeg haar gisteravond\nWa...
2,Ik heb het goed gedaan\nMaar ook zo fout gedaa...
3,Zeg maar niets meer\nIk ga wel weg als je dat ...
4,Ik sluit mn ogen en denk na\nEn alles gaat dan...


In [7]:
# Remove short stories shorter than 200 characters for every short story dataset
dutch_short_stories = dutch_short_stories[dutch_short_stories.str.len() > 200]
stories_1001 = stories_1001[stories_1001.str.len() > 200]
poems_1001 = poems_1001[poems_1001.str.len() > 200]

print(f"{dutch_short_stories.info()}\n")
print(f"{stories_1001.info()}\n")
print(poems_1001.info())

<class 'pandas.core.series.Series'>
Int64Index: 548 entries, 0 to 555
Series name: dutch_short_stories
Non-Null Count  Dtype 
--------------  ----- 
548 non-null    string
dtypes: string(1)
memory usage: 8.6 KB
None

<class 'pandas.core.series.Series'>
Int64Index: 799 entries, 0 to 804
Series name: 1001_stories
Non-Null Count  Dtype 
--------------  ----- 
799 non-null    string
dtypes: string(1)
memory usage: 12.5 KB
None

<class 'pandas.core.series.Series'>
Int64Index: 6161 entries, 0 to 6869
Series name: 1001_gedichten
Non-Null Count  Dtype 
--------------  ----- 
6161 non-null   string
dtypes: string(1)
memory usage: 96.3 KB
None


In [8]:
def shorten_story(story):
    if len(story) <= 2400:
        return story
    else:
        truncated_story = story[:2400]
        if truncated_story[-1] == '\n':
            return truncated_story
        else:
            last_newline_index = truncated_story.rfind('\n')
            if last_newline_index != -1:
                return truncated_story[:last_newline_index]
            else:
                return truncated_story


In [9]:
# Print average length of each dataset
print(f"Average length of dutch_short_stories: {dutch_short_stories.str.len().mean()}")
print(f"Average length of stories_1001: {stories_1001.str.len().mean()}")
print(f"Average length of poems_1001: {poems_1001.str.len().mean()}")
print(f"Average length of genius_smart_collector: {genius_data['genius_smart_collector'].str.len().mean()}\n")

# Print max length of each dataset
print(f"Max length of dutch_short_stories: {dutch_short_stories.str.len().max()}")
print(f"Max length of stories_1001: {stories_1001.str.len().max()}")
print(f"Max length of poems_1001: {poems_1001.str.len().max()}")
print(f"Max length of genius_smart_collector: {genius_data['genius_smart_collector'].str.len().max()}")

Average length of dutch_short_stories: 1781.1259124087592
Average length of stories_1001: 5514.879849812265
Average length of poems_1001: 521.729751663691
Average length of genius_smart_collector: 1183.3132611637348

Max length of dutch_short_stories: 6554
Max length of stories_1001: 64542
Max length of poems_1001: 7314
Max length of genius_smart_collector: 8438


In [10]:
# Apply shorten_story to all short story datasets
dutch_short_stories = dutch_short_stories.apply(shorten_story)
stories_1001 = stories_1001.apply(shorten_story)
poems_1001 = poems_1001.apply(shorten_story)

# Print max length of each dataset
print(f"Max length dutch_short_stories: {dutch_short_stories.str.len().max()}")
print(f"Max length stories_1001: {stories_1001.str.len().max()}")
print(f"Max length poems_1001: {poems_1001.str.len().max()}")

Max length dutch_short_stories: 2400
Max length stories_1001: 2400
Max length poems_1001: 2400


In [11]:
# Remove all numbers from the stories and poems
dutch_short_stories = dutch_short_stories.str.replace('\d+', '')
stories_1001 = stories_1001.str.replace('\d+', '')
poems_1001 = poems_1001.str.replace('\d+', '')

# Capitalize first letter of each line in all short story datasets
dutch_short_stories = dutch_short_stories.apply(capitalize_after_empty_or_newline)
stories_1001 = stories_1001.apply(capitalize_after_empty_or_newline)
poems_1001 = poems_1001.apply(capitalize_after_empty_or_newline)

  dutch_short_stories = dutch_short_stories.str.replace('\d+', '')
  stories_1001 = stories_1001.str.replace('\d+', '')
  poems_1001 = poems_1001.str.replace('\d+', '')


In [12]:
# Concat all short story datasets into one dataset
short_stories = pandas.concat([dutch_short_stories, stories_1001]).astype("string")

short_stories.info()

<class 'pandas.core.series.Series'>
Int64Index: 1347 entries, 0 to 804
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
1347 non-null   string
dtypes: string(1)
memory usage: 21.0 KB


In [13]:
# Save short stories dataset to csv
short_stories.to_csv("./short_stories_1347.csv", index=False)

# Save poems dataset to csv
poems_1001.to_csv("./gedichten_6161.csv", index=False)

# Save genius dataset to csv
genius_data.to_csv("./genius_data_2956.csv", index=False)

In [14]:
# Save as one big dataset
all_data = pandas.concat([short_stories, poems_1001, genius_data["genius_smart_collector"]]).astype("string")

all_data.info()

<class 'pandas.core.series.Series'>
Int64Index: 10464 entries, 0 to 2955
Series name: None
Non-Null Count  Dtype 
--------------  ----- 
10464 non-null  string
dtypes: string(1)
memory usage: 163.5 KB


In [15]:
# Save all data to csv
all_data.to_csv("./all_data_10464.csv", index=False)