In [1]:
import pandas as pd

# Data Ingestion

First, let us import and briefly summarize the dataset.

In [2]:
# Location of the raw dataset CSV, relative to this notebook.
PATH_TO_DATA = "../data/raw_data.csv"
raw_df = pd.read_csv(PATH_TO_DATA)

# Print dataset metadata: row count, column names, and the number of
# distinct values in the author and school columns.
print(f"Length: {len(raw_df)}")
print(f"Columns: {raw_df.columns.values}")
for label, column in (("Authors", "author"), ("Schools", "school")):
    print(f"# of {label}: {len(raw_df[column].unique())}")

Length: 360808
Columns: ['title' 'author' 'school' 'sentence_spacy' 'sentence_str'
 'original_publication_date' 'corpus_edition_date' 'sentence_length'
 'sentence_lowered' 'tokenized_txt' 'lemmatized_str']
# of Authors: 36
# of Schools: 13


# Preprocessing

Let us consider a subset of the data containing the columns: title, author, school, original_publication_date, corpus_edition_date, sentence_str, tokenized_txt.

We should omit columns such as lemmatized_str. Instead, we should remove stop words and lemmatize the sentences ourselves to ensure data integrity.

In [3]:
# Columns carried forward into preprocessing.
relevant_cols = [
    "title",
    "author",
    "school",
    "original_publication_date",
    "corpus_edition_date",
    "sentence_str",
    "tokenized_txt",
]

# Work on an independent copy (DataFrame.copy is deep by default) so that
# later changes to `df` do not propagate back to `raw_df`.
df = raw_df[relevant_cols].copy()

In [None]:
import ast
from lib.build_features import get_clean_sentence, get_vectorized_sentences

def _clean_tokenized_cell(serialized_tokens):
    """Parse one `tokenized_txt` cell and pass the result to `get_clean_sentence`.

    `ast.literal_eval` converts the cell's Python-literal text back into an
    object (presumably a list of tokens -- TODO confirm against the CSV).
    """
    return get_clean_sentence(ast.literal_eval(serialized_tokens))


# Derive our own `lemmatized_str` column from the tokenized text, one cell
# at a time, instead of reusing the precomputed column from the raw data.
df["lemmatized_str"] = df["tokenized_txt"].apply(_clean_tokenized_cell)