Skip to content

Commit

Permalink
Add helper functions for accessing the dataset.
Browse files Browse the repository at this point in the history
  • Loading branch information
Notgnoshi committed Mar 16, 2019
1 parent b5d77f8 commit 7635d3d
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 0 deletions.
37 changes: 37 additions & 0 deletions data/__init__.py
@@ -0,0 +1,37 @@
"""Helper functions for loading data into multiple different representations."""
from ast import literal_eval
from collections import Counter
from pathlib import Path

import pandas as pd


def get_df():
    """Load the haiku dataset, unmodified, into a pandas.DataFrame.

    Reads ``haikus.csv`` from the directory that contains this module and
    uses the first CSV column as the index.
    """
    csv_path = Path(__file__).parent / "haikus.csv"
    # The "haiku" column holds each list of lines written out as text;
    # literal_eval turns that text back into a real list instead of a string.
    column_parsers = {"haiku": literal_eval}
    return pd.read_csv(csv_path, index_col=0, converters=column_parsers)


def get_bag_of_words():
    """Return the dataset as a bag of words.

    Every line of every haiku is split on whitespace, and the resulting
    words are tallied in a collections.Counter (word -> occurrence count).
    """
    haikus = get_df()["haiku"]
    return Counter(
        word
        for lines in haikus
        for text in lines
        for word in text.split()
    )


def get_bag_of_lines():
    """Return the dataset as a bag of lines.

    Each individual line of each haiku is tallied as-is in a
    collections.Counter (line -> occurrence count).
    """
    haikus = get_df()["haiku"]
    return Counter(text for lines in haikus for text in lines)
43 changes: 43 additions & 0 deletions experiments/eda/words.ipynb
Expand Up @@ -12,6 +12,49 @@
"* Potentially use to clean the dataset\n",
"* British spellings"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The autoreload extension is already loaded. To reload it, use:\n",
" %reload_ext autoreload\n"
]
}
],
"source": [
"%load_ext autoreload\n",
"%autoreload 2\n",
"%aimport data\n",
"\n",
"%config InlineBackend.figure_format = 'svg'\n",
"%matplotlib inline\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"sns.set()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"bag = data.get_bag_of_words()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
Expand Down

0 comments on commit 7635d3d

Please sign in to comment.