### Week6: Digital Humanities - Vector Space Model


In [1]:
import os
import lxml.etree
import tarfile
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import nltk
import nltk.tokenize
import math
import random


tf = tarfile.open('theatre-classique.tar.gz','r')
tf.extractall('data')

In [2]:
subgenres = ('Comédie', 'Tragédie', 'Tragi-comédie')
#print(subgenres)
plays, titles, genres = [], [], []
authors, years = [],[]

In [3]:
for fn in os.scandir('data/theatre-classique'):
    # Only include XML files
    if not fn.name.endswith('.xml'):
        continue
    tree   = lxml.etree.parse(fn.path)
    genre  = tree.find('//genre')
    title  = tree.find('//title')
    author = tree.find('//author')
    year   = tree.find('//date')
    if genre is not None and genre.text in subgenres:
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays.append(text)
        genres.append(genre.text)
        titles.append(title.text)
        authors.append(author.text)
        if year is not None:
            years.append(year.text)

plays = np.array(plays)
genres = np.array(genres)
titles = np.array(titles)
authors = np.array(authors)
years = np.array(years)

In [4]:
print(len(plays), len(genres), len(titles), len(authors), len(years))


498 498 498 498 208


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Veda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def is_punct(string):
    """Check if STRING is a punctuation marker or a sequence of
       punctuation markers.
    """
    return PUNCT_RE.match(string) is not None

PUNCT_RE = re.compile(r'[^\w\s]+$')

In [7]:
def preprocess_text(text, language='French', lowercase=True):
    if lowercase:
        text = text.lower()
    if (language == 'French'):
        text = re.sub("-", " ", text)
        text = re.sub("l'", "le ", text)
        text = re.sub("d'", "de ", text)
        text = re.sub("c'", "ce ", text)
        text = re.sub("j'", "je ", text)
        text = re.sub("m'", "me ", text)
        text = re.sub("qu'", "que ", text)
        text = re.sub("'", " ' ", text)
        text = re.sub("quelqu'", "quelque ", text)
        text = re.sub("aujourd'hui", "aujourdhui", text)
    tokens = nltk.tokenize.word_tokenize(text, language=language)
    tokens = [token for token in tokens if not is_punct(token)]
    return tokens

In [8]:
plays_tok = [preprocess_text(play, 'French') for play in plays]


In [9]:
def extract_vocabulary(tokenized_corpus, min_count=1, max_count=float('inf')):
    vocabulary = collections.Counter()
    for document in tokenized_corpus:
        vocabulary.update(document)
    vocabulary = {word for word, count in vocabulary.items()
                  if count >= min_count and count <= max_count}
    return sorted(vocabulary)

vocabulary = extract_vocabulary(plays_tok, min_count=2)
print("Length of vocabulary: ",len(vocabulary))


Length of vocabulary:  38410


#### 1. Represent each play by a vector with only the tf component. You can apply some preprocessing before generating this vector representation.

In [10]:
def corpus2dtm(tokenized_corpus, vocabulary):
    document_term_matrix = []
    for document in tokenized_corpus:
        document_counts = collections.Counter(document)
        row = [document_counts[word] for word in vocabulary]
        document_term_matrix.append(row)
    return np.array(document_term_matrix)

In [11]:
document_term_matrix = np.array(corpus2dtm(plays_tok, vocabulary))
print(f"document-term matrix with "
      f"|D| = {document_term_matrix.shape[0]} documents and "
      f"|V| = {document_term_matrix.shape[1]} words.")

document-term matrix with |D| = 498 documents and |V| = 38410 words.


In [12]:
print("Converted doc into vectors :\n",document_term_matrix)
print("\nLength of matrix: \n", len(document_term_matrix))
print("\nSize of matrix: \n", document_term_matrix.shape)

Converted doc into vectors :
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Length of matrix: 
 498

Size of matrix: 
 (498, 38410)


#### 2. For each genre, it is possible to generate a “profile”, in the form of a single vector representing the entire set of plays corresponding to this genre. Build such a profile for each of the three genres (Comedy, Tragedy and Tragicomedy).

In [13]:
tr_means = document_term_matrix[np.array(genres) == 'Tragédie'].mean(axis=0)
co_means = document_term_matrix[genres == 'Comédie'].mean(axis=0)
tc_means = document_term_matrix[genres == 'Tragi-comédie'].mean(axis=0)

#### 3. How many terms with a weight strictly larger than 0 do you have in each text genre profile?