"""Helper functions for loading data into multiple different representations."""
from ast import literal_eval
from collections import Counter
from itertools import chain
from pathlib import Path
from typing import Iterator, Optional, Union

import pandas as pd

# Default dataset location: "haikus.csv" sitting next to this module.
_DEFAULT_CSV = Path(__file__).parent / "haikus.csv"


def get_df(path: Optional[Union[str, Path]] = None) -> pd.DataFrame:
    """Get the dataset unmodified in a pandas.DataFrame.

    Args:
        path: CSV file to load. When omitted, the "haikus.csv" file located
            in the same directory as this module is used.

    Returns:
        The CSV contents, indexed by the file's first column, with every
        cell of the "haiku" column parsed by ``ast.literal_eval``.
    """
    return pd.read_csv(
        _DEFAULT_CSV if path is None else path,
        index_col=0,
        # Ensure that the list of lines is interpreted as a list, not a string...
        converters={"haiku": literal_eval},
    )


def _iter_lines(path: Optional[Union[str, Path]] = None) -> Iterator[str]:
    """Return an iterator over every line of every haiku, in dataset order."""
    return chain.from_iterable(get_df(path)["haiku"])


def get_bag_of_words(path: Optional[Union[str, Path]] = None) -> Counter:
    """Get the dataset in a bag of words representation.

    Args:
        path: CSV file to load; defaults to the same file as ``get_df``.

    Returns:
        A Counter mapping each whitespace-separated token to the number of
        times it occurs across all haiku lines.
    """
    bag: Counter = Counter()
    for line in _iter_lines(path):
        # str.split() with no argument splits on any run of whitespace.
        bag.update(line.split())
    return bag


def get_bag_of_lines(path: Optional[Union[str, Path]] = None) -> Counter:
    """Get the dataset in a bag of lines representation.

    Args:
        path: CSV file to load; defaults to the same file as ``get_df``.

    Returns:
        A Counter mapping each complete haiku line to the number of times it
        occurs in the dataset.
    """
    # Count straight from the flattened iterator; no intermediate list needed.
    return Counter(_iter_lines(path))
To reload it, use:\n", + " %reload_ext autoreload\n" + ] + } + ], + "source": [ + "%load_ext autoreload\n", + "%autoreload 2\n", + "%aimport data\n", + "\n", + "%config InlineBackend.figure_format = 'svg'\n", + "%matplotlib inline\n", + "\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "sns.set()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "bag = data.get_bag_of_words()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": {