### Week6: Digital Humanities - Vector Space Model


In [1]:
import os
import lxml.etree
import tarfile
import collections
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import nltk
import nltk.tokenize
import math
import random


tf = tarfile.open('theatre-classique.tar.gz','r')
tf.extractall('data')

In [2]:
subgenres = ('Comédie', 'Tragédie', 'Tragi-comédie')
#print(subgenres)
plays, titles, genres = [], [], []
authors, years = [],[]

In [3]:
for fn in os.scandir('data/theatre-classique'):
    # Only include XML files
    if not fn.name.endswith('.xml'):
        continue
    tree   = lxml.etree.parse(fn.path)
    genre  = tree.find('//genre')
    title  = tree.find('//title')
    author = tree.find('//author')
    year   = tree.find('//date')
    if genre is not None and genre.text in subgenres:
        lines = []
        for line in tree.xpath('//l|//p'):
            lines.append(' '.join(line.itertext()))
        text = '\n'.join(lines)
        plays.append(text)
        genres.append(genre.text)
        titles.append(title.text)
        authors.append(author.text)
        if year is not None:
            years.append(year.text)

plays = np.array(plays)
genres = np.array(genres)
titles = np.array(titles)
authors = np.array(authors)
years = np.array(years)

In [4]:
print(len(plays), len(genres), len(titles), len(authors), len(years))


498 498 498 498 208


In [5]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Veda\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
def is_punct(string):
    """Check if STRING is a punctuation marker or a sequence of
       punctuation markers.
    """
    return PUNCT_RE.match(string) is not None

PUNCT_RE = re.compile(r'[^\w\s]+$')

In [7]:
def preprocess_text(text, language='French', lowercase=True):
    if lowercase:
        text = text.lower()
    if (language == 'French'):
        text = re.sub("-", " ", text)
        text = re.sub("l'", "le ", text)
        text = re.sub("d'", "de ", text)
        text = re.sub("c'", "ce ", text)
        text = re.sub("j'", "je ", text)
        text = re.sub("m'", "me ", text)
        text = re.sub("qu'", "que ", text)
        text = re.sub("'", " ' ", text)
        text = re.sub("quelqu'", "quelque ", text)
        text = re.sub("aujourd'hui", "aujourdhui", text)
    tokens = nltk.tokenize.word_tokenize(text, language=language)
    tokens = [token for token in tokens if not is_punct(token)]
    return tokens

In [8]:
plays_tok = [preprocess_text(play, 'French') for play in plays]


In [9]:
def extract_vocabulary(tokenized_corpus, min_count=1, max_count=float('inf')):
    vocabulary = collections.Counter()
    for document in tokenized_corpus:
        vocabulary.update(document)
    vocabulary = {word for word, count in vocabulary.items()
                  if count >= min_count and count <= max_count}
    return sorted(vocabulary)

vocabulary = extract_vocabulary(plays_tok, min_count=2)
print("Length of vocabulary: ",len(vocabulary))


Length of vocabulary:  38410


#### 1. Represent each play by a vector with only the tf component. You can apply some preprocessing before generating this vector representation.

In [10]:
def corpus2dtm(tokenized_corpus, vocabulary):
    document_term_matrix = []
    for document in tokenized_corpus:
        document_counts = collections.Counter(document)
        row = [document_counts[word] for word in vocabulary]
        document_term_matrix.append(row)
    return np.array(document_term_matrix)

In [11]:
document_term_matrix = np.array(corpus2dtm(plays_tok, vocabulary))
print(f"document-term matrix with "
      f"|D| = {document_term_matrix.shape[0]} documents and "
      f"|V| = {document_term_matrix.shape[1]} words.")

document-term matrix with |D| = 498 documents and |V| = 38410 words.


In [12]:
print("Converted doc into vectors :\n",document_term_matrix)
print("\nLength of matrix: \n", len(document_term_matrix))
print("\nSize of matrix: \n", document_term_matrix.shape)

Converted doc into vectors :
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]

Length of matrix: 
 498

Size of matrix: 
 (498, 38410)


#### 2. For each genre, it is possible to generate a “profile”, in the form of a single vector representing the entire set of plays corresponding to this genre. Build such a profile for each of the three genres (Comedy, Tragedy and Tragicomedy).

In [13]:
tr_means = document_term_matrix[np.array(genres) == 'Tragédie'].mean(axis=0)
co_means = document_term_matrix[genres == 'Comédie'].mean(axis=0)
tc_means = document_term_matrix[genres == 'Tragi-comédie'].mean(axis=0)

#### 3. How many terms with a weight strictly larger than 0 do you have in each text genre profile?

In [14]:
for genre_ in subgenres:
    mins = document_term_matrix[np.array(genres) == genre_].mean(axis=0)
    nonzero_terms = np.sum(mins > 0)
    print(f"Number of non-zero terms in {genre_} is {nonzero_terms}")


Number of non-zero terms in Comédie is 33990
Number of non-zero terms in Tragédie is 25642
Number of non-zero terms in Tragi-comédie is 16401


#### 4. Select randomly 10 plays for each text genre. Represent each play by a vector.

In [23]:
for genre_ in subgenres:
    genre_plays = document_term_matrix[np.array(genres) == genre_]
    genre_plays = genre_plays[np.random.randint(0, len(genre_plays), 10)]
    print(f"Each play in {genre_} is a dimension of {len(genre_plays[0])} with size {genre_plays.shape}")


Each play in Comédie is a dimension of 38410 with size (10, 38410)
Each play in Tragédie is a dimension of 38410 with size (10, 38410)
Each play in Tragi-comédie is a dimension of 38410 with size (10, 38410)


#### 5. For each text genre and play, how many terms with a weight strictly larger than 0 do you have in the vector?

In [24]:
for genre_ in subgenres:
    genre_plays = document_term_matrix[np.array(genres) == genre_]
    for index, play_ in enumerate(genre_plays):
        nonzero_terms = np.sum(play_ > 0)
        print(f"Number of terms > 0 in play {index}/{len(genre_plays)} of {genre_} is {nonzero_terms}")
    
    print("\n")


Number of terms > 0 in play 0/310 of Comédie is 1262
Number of terms > 0 in play 1/310 of Comédie is 1608
Number of terms > 0 in play 2/310 of Comédie is 610
Number of terms > 0 in play 3/310 of Comédie is 2675
Number of terms > 0 in play 4/310 of Comédie is 1305
Number of terms > 0 in play 5/310 of Comédie is 1148
Number of terms > 0 in play 6/310 of Comédie is 1394
Number of terms > 0 in play 7/310 of Comédie is 1346
Number of terms > 0 in play 8/310 of Comédie is 2670
Number of terms > 0 in play 9/310 of Comédie is 1245
Number of terms > 0 in play 10/310 of Comédie is 1512
Number of terms > 0 in play 11/310 of Comédie is 1622
Number of terms > 0 in play 12/310 of Comédie is 938
Number of terms > 0 in play 13/310 of Comédie is 943
Number of terms > 0 in play 14/310 of Comédie is 1205
Number of terms > 0 in play 15/310 of Comédie is 1350
Number of terms > 0 in play 16/310 of Comédie is 790
Number of terms > 0 in play 17/310 of Comédie is 1639
Number of terms > 0 in play 18/310 of Comé

Number of terms > 0 in play 146/150 of Tragédie is 2223
Number of terms > 0 in play 147/150 of Tragédie is 2302
Number of terms > 0 in play 148/150 of Tragédie is 2013
Number of terms > 0 in play 149/150 of Tragédie is 2247


Number of terms > 0 in play 0/38 of Tragi-comédie is 2690
Number of terms > 0 in play 1/38 of Tragi-comédie is 2665
Number of terms > 0 in play 2/38 of Tragi-comédie is 2552
Number of terms > 0 in play 3/38 of Tragi-comédie is 2662
Number of terms > 0 in play 4/38 of Tragi-comédie is 2475
Number of terms > 0 in play 5/38 of Tragi-comédie is 2189
Number of terms > 0 in play 6/38 of Tragi-comédie is 2734
Number of terms > 0 in play 7/38 of Tragi-comédie is 2073
Number of terms > 0 in play 8/38 of Tragi-comédie is 1939
Number of terms > 0 in play 9/38 of Tragi-comédie is 2686
Number of terms > 0 in play 10/38 of Tragi-comédie is 2230
Number of terms > 0 in play 11/38 of Tragi-comédie is 2411
Number of terms > 0 in play 12/38 of Tragi-comédie is 2478
Number of terms >

#### 6. For each text genre and play, how many terms with a weight strictly equal to 1 do you have in the vector?


In [25]:
for genre_ in subgenres:
    genre_plays = document_term_matrix[np.array(genres) == genre_]
    for index, play_ in enumerate(genre_plays):
        nonzero_terms = np.sum(play_ == 1)
        print(f"Number of terms == 1 in play {index}/{len(genre_plays)} of {genre_} is {nonzero_terms}")
    print("\n")


Number of terms == 1 in play 0/310 of Comédie is 758
Number of terms == 1 in play 1/310 of Comédie is 1026
Number of terms == 1 in play 2/310 of Comédie is 479
Number of terms == 1 in play 3/310 of Comédie is 1603
Number of terms == 1 in play 4/310 of Comédie is 809
Number of terms == 1 in play 5/310 of Comédie is 687
Number of terms == 1 in play 6/310 of Comédie is 841
Number of terms == 1 in play 7/310 of Comédie is 798
Number of terms == 1 in play 8/310 of Comédie is 1436
Number of terms == 1 in play 9/310 of Comédie is 731
Number of terms == 1 in play 10/310 of Comédie is 1067
Number of terms == 1 in play 11/310 of Comédie is 969
Number of terms == 1 in play 12/310 of Comédie is 610
Number of terms == 1 in play 13/310 of Comédie is 651
Number of terms == 1 in play 14/310 of Comédie is 684
Number of terms == 1 in play 15/310 of Comédie is 683
Number of terms == 1 in play 16/310 of Comédie is 460
Number of terms == 1 in play 17/310 of Comédie is 1005
Number of terms == 1 in play 18/3

Number of terms == 1 in play 140/150 of Tragédie is 1212
Number of terms == 1 in play 141/150 of Tragédie is 1165
Number of terms == 1 in play 142/150 of Tragédie is 1275
Number of terms == 1 in play 143/150 of Tragédie is 1183
Number of terms == 1 in play 144/150 of Tragédie is 1503
Number of terms == 1 in play 145/150 of Tragédie is 1161
Number of terms == 1 in play 146/150 of Tragédie is 1216
Number of terms == 1 in play 147/150 of Tragédie is 1234
Number of terms == 1 in play 148/150 of Tragédie is 1087
Number of terms == 1 in play 149/150 of Tragédie is 1165


Number of terms == 1 in play 0/38 of Tragi-comédie is 1419
Number of terms == 1 in play 1/38 of Tragi-comédie is 1401
Number of terms == 1 in play 2/38 of Tragi-comédie is 1497
Number of terms == 1 in play 3/38 of Tragi-comédie is 1366
Number of terms == 1 in play 4/38 of Tragi-comédie is 1249
Number of terms == 1 in play 5/38 of Tragi-comédie is 1097
Number of terms == 1 in play 6/38 of Tragi-comédie is 1454
Number of terms