Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add helper functions for accessing the dataset.
- Loading branch information
Showing 2 changed files with 80 additions and 0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
"""Helper functions for loading data into multiple different representations.""" | ||
from ast import literal_eval | ||
from collections import Counter | ||
from pathlib import Path | ||
|
||
import pandas as pd | ||
|
||
|
||
def get_df():
    """Load ``haikus.csv`` from this module's directory as a pandas.DataFrame.

    The first CSV column is used as the index, and every cell of the
    ``haiku`` column is parsed with ``ast.literal_eval``.
    """
    csv_path = Path(__file__).parent / "haikus.csv"
    # The "haiku" column holds a list of lines written out as text; parse it
    # so callers receive a real list instead of its string form.
    parsers = {"haiku": literal_eval}
    return pd.read_csv(csv_path, index_col=0, converters=parsers)
|
||
|
||
def get_bag_of_words(df=None):
    """Get the dataset in a bag of words representation.

    Args:
        df: Optional DataFrame with a "haiku" column whose cells are
            iterables of line strings. When omitted, the dataset is loaded
            with ``get_df()``, so existing zero-argument callers are
            unaffected.

    Returns:
        A ``collections.Counter`` mapping each whitespace-separated word to
        the number of times it occurs across all lines of all haikus.
    """
    # Only hit the disk when the caller did not supply already-loaded data.
    if df is None:
        df = get_df()
    return Counter(
        word
        for haiku in df["haiku"]
        for line in haiku
        for word in line.split()
    )
|
||
|
||
def get_bag_of_lines(df=None):
    """Get the dataset in a bag of lines representation.

    Args:
        df: Optional DataFrame with a "haiku" column whose cells are
            iterables of line strings. When omitted, the dataset is loaded
            with ``get_df()``, so existing zero-argument callers are
            unaffected.

    Returns:
        A ``collections.Counter`` mapping each distinct line to the number
        of times it occurs across all haikus.
    """
    # Only hit the disk when the caller did not supply already-loaded data.
    if df is None:
        df = get_df()
    # Count straight from a generator; no intermediate list of lines needed.
    return Counter(line for haiku in df["haiku"] for line in haiku)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters