# Goal: 
To obtain a txt file for each work, with metadata such as title and year of publication.

## Load the pickled pre-corpus

In [None]:
import pickle
import re

In [None]:
# Load the pre-corpus; later cells iterate it as (title, text) pairs.
# The context manager closes the file handle even if unpickling fails.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
with open("corpus.pickle", "rb") as corpus_file:
    corpus = pickle.load(corpus_file)

## Remove unwanted characters

* °syl-lables
* ae
* pages

In [None]:
# (regex, replacement) pairs, applied in this order to every text.
# Raw strings keep regex escapes such as \s away from Python's own
# string-escape processing (a non-raw "\s" is an invalid escape sequence).
patterns = [
    (r"°(.*)\n", "\n"),        # drop everything from a "°" up to the end of its line
    ("æ", "ae"),
    ("Æ", "Ae"),
    (r"―(\s[0-9]+\s)―", " "),  # "― 12 ―" markers (presumably page numbers) -> one space
]
clean_corpus = []

for title, text in corpus:
    for old, new in patterns:
        text = re.sub(old, new, text)
    # Keep the [title, text] list layout that the later cells unpack.
    clean_corpus.append([title, text])

In [None]:
import pickle
# Persist the cleaned corpus; the context manager flushes and closes the file.
with open("clean_corpus.pickle", "wb") as clean_file:
    pickle.dump(clean_corpus, clean_file)

# Split and assign dates

The goal is to assign to each work its year of publication.

final_corpus = \[ {"title": title, "year": year_of_publication, "content": text}, ... \]

Some works are already isolated, others are not.

* understand which works are isolated and which are not
* understand how to split the ones with more works inside

#### Considerations: 
For the aim of this project it does not matter whether each work is split with full precision; what really matters is that each text is assigned the correct year, especially since only secondary works are not already split. It is also worth noting that the year of publication is just a proxy for the period in which a work was written: some earlier works may have been published later because of publication issues, but for such a large corpus this should not be a problem. Later on it may be interesting to ask an expert whether some works took longer to write than others, and to see whether this has an impact or can be detected by the analysis.

In [None]:
# Reload the cleaned corpus; the context manager closes the file handle.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
with open("clean_corpus.pickle", "rb") as clean_file:
    corpus = pickle.load(clean_file)

In [None]:
# Collect the title of every (title, text) entry, in corpus order.
titles = [work_title for work_title, _work_text in corpus]

In [None]:
len(titles)

In [None]:
titles[10]

In [None]:
# Pair every title with its position in the corpus. enumerate() yields the
# true position in a single pass; list.index() would rescan the corpus
# (comparing whole texts) for each entry and report the first position when
# two entries are equal.
titles_indexed = [(work_title, position) for position, (work_title, _work_text) in enumerate(corpus)]
titles_indexed[:5]

In [None]:
# Trivial information retrieval system: find works by a substring of the title.
title = "STUDIES IN LOGICAL THEORY"
# enumerate() gives each entry's own position; list.index() would return the
# first equal entry and rescan the whole corpus for every hit.
indexes = [position for position, (work_title, _work_text) in enumerate(corpus) if title in work_title]
indexes

In [None]:
#given the index take the info
def info(i):
    """Break the title stored at position ``i`` of ``titles_indexed`` into its parts.

    The title is cut on ':'. The first piece is the volume, the third piece is
    the work title (stripped and capitalized), and the date is the part of the
    second piece that precedes its first ','.

    Returns a dict with the keys 'Volume: ', 'Title: ', 'Date: ' and 'Index'.
    Raises IndexError when ``i`` is out of range or the title has fewer than
    three ':'-separated pieces.
    """
    pieces = titles_indexed[i][0].split(':')
    return {
        'Volume: ': pieces[0].strip(),
        "Title: ": pieces[2].strip().capitalize(),
        'Date: ': pieces[1].split(',', 1)[0].strip(),
        'Index': i,
    }

In [None]:
info(1)

In [None]:
# One info() record per corpus entry, in corpus order.
infos = [info(position) for position, _entry in enumerate(corpus)]

In [None]:
infos[:3]

### Testing the regex pattern matching and splitting

Not many works have a unique year, but what if I cannot trust the date indicated by the titles?
Should I really check every text?

Please note that \[ is used as an essay delimiter, but not only for that

Or maybe better, [First published ... ]

\[First published(.|\n[^»*])*

### Splitting function and testing its behaviour

Need a function that just takes a text and splits it into a list of (title, info on first publication, text)

In [None]:
# "[... First published ...]" note that follows an essay title. The match runs
# on until a newline that is followed by "»" or "*" (or the end of the text).
# Raw strings avoid the invalid "\[" string escape.
_FIRST_PUBLISHED = re.compile(r"(\[(.[^»*])*First published(.|\n[^»*])*)")
# Last line of a chunk: it holds the title of the essay that comes next.
_LAST_LINE = re.compile(r"((.+)$)")


def needed_split(text):
    """Split ``text`` on its "[... First published ...]" notes.

    Returns a flat list. When no note is found the list holds the text alone.
    Otherwise it is laid out as
    ``[title_1, note_1, body_1, title_2, note_2, body_2, ...]`` where
    ``title_1`` is everything before the first note (untouched), each later
    title is the stripped last line of the chunk preceding its note, and each
    body is the rest of that chunk. The title cut from the final chunk belongs
    to no essay and is dropped.

    A chunk without a usable last line (for example an empty chunk when the
    text ends right after a note) yields an empty title instead of raising
    IndexError.
    """
    pieces = _FIRST_PUBLISHED.split(text)
    # re.split also returns the two inner groups of every match; they show up
    # as None, ']' or '. ' and are dropped here. The groups and this filter are
    # kept as they are because later cells address the result by position.
    pieces = [piece for piece in pieces
              if piece is not None and piece != ']' and piece != '. ']
    if len(pieces) <= 1:  # no note found
        return pieces

    result = []
    for position, piece in enumerate(pieces):
        if position % 2 == 0 and position > 1:
            # Text that follows a note: body of one essay + title of the next.
            parts = _LAST_LINE.split(piece)
            result.append(parts[0])
            result.append(parts[1].strip() if len(parts) > 1 else '')
        else:
            # Leading text (first title) or a note: keep as is.
            result.append(piece)
    return result[:-1]  # drop the title cut from the final chunk

In [None]:
splitted=needed_split(corpus[11][1]) #1, 6, 7, 10

In [None]:
len(splitted)

In [None]:
print(splitted[0])

In [None]:
print(splitted[1])

In [None]:
print(splitted[2][:500])

In [None]:
print(splitted[3])

In [None]:
print(splitted[4])

In [None]:
print(splitted[5])

In [None]:
print(splitted[-1])

In [None]:
print(splitted[-2])

In [None]:
print(splitted[-3])

In [None]:
print(splitted[-4])

### One possible solution

#### Define a function that use the needed_split() to obtain a raw final corpus

[volume title: ... , title: ... , publication info: ... , text: ... ]

#### Problem:

a single text has a different format

#### Solution:

add a new level..

[{'volume title': ... , content: [text]} , {'volume title': ... , content: [(title, info, text), ... ]}]

#### Obs:

This structure preserves all the information but is over-articulated

### KISS SOLUTION

#### Just obtain the corpus that I need

* extract the year
* stay faithful to: final_corpus = \[ {"title": title, "year": year_of_publication, "content": text}, ... \]
* title may be a concatenation of volume and real title

#### extracting the year experiment

* first case: a unique text
* second case: set of essays

In [None]:
#first case
splitted=needed_split(corpus[10][1])
len(splitted)

In [None]:
print(splitted[0][:500])

In [None]:
# Every run of four digits in the single text (raw string: "\d" is a regex
# escape, not a Python string escape).
date = re.findall(r'(\d{4})', splitted[0])
print(date)
#take the first date

In [None]:
#second case
splitted=needed_split(corpus[2][1])
len(splitted)

In [None]:
# Every run of four digits in the first publication note (raw string: "\d" is
# a regex escape, not a Python string escape).
date = re.findall(r'(\d{4})', splitted[1])
print(date)
#take the last date, does exists other strange cases other than the first essays of the eighth work?

## Finally

In [None]:
# Accepted range for a publication year; 4-digit numbers outside it are ignored.
YEAR_MIN, YEAR_MAX = 1870, 1952


def _first_plausible_year(passage):
    """Return the first 4-digit number of ``passage`` within [YEAR_MIN, YEAR_MAX], or None."""
    for digits in re.findall(r'(\d{4})', passage):
        year = int(digits)
        if YEAR_MIN <= year <= YEAR_MAX:
            return year
    return None


# Build one {'title: ', 'year: ', 'text: '} record per work.
final_corpus = []
for j, entry in enumerate(corpus):
    content = entry[1]
    list_of_works = needed_split(content)
    base_title = infos[j]['Volume: '] + ' / ' + infos[j]['Title: ']
    if len(list_of_works) == 1:
        # The volume is a single work: look for the year in its own text.
        title = base_title
        date = _first_plausible_year(list_of_works[0])
        if date is None:
            # No usable year: fall back to the date of the general title
            # rather than silently reusing the previous work's year.
            date = infos[j]['Date: ']
        text = content
        final_corpus.append({'title: ': title, 'year: ': date, 'text: ': text})
    elif len(list_of_works) > 1:
        # Entries come in (title, publication note, text) triples; i is the
        # index of the title. Stopping at len - 2 keeps i + 2 inside the list
        # even when the length is not a multiple of three.
        for i in range(0, len(list_of_works) - 2, 3):
            title = base_title + ' / ' + list_of_works[i]
            date = _first_plausible_year(list_of_works[i + 1])
            if date is None:
                # if no year is clearly indicated then take the year from the
                # general title as a proxy, STUDIES IN LOGICAL THEORY  3,6,9
                date = infos[j]['Date: ']
            text = list_of_works[i + 2]
            final_corpus.append({'title: ': title, 'year: ': date, 'text: ': text})

In [None]:
len(final_corpus)

In [None]:
final_corpus[144]

In [None]:
print(final_corpus[144]['text: '][:400])

In [None]:
# Remove every "»" and "*" from the text of each work.
patterns = [("[»*]", '')]
for work in final_corpus:
    cleaned = work['text: ']
    for old, new in patterns:
        cleaned = re.sub(old, new, cleaned)
    work['text: '] = cleaned

In [None]:
# Persist the split corpus; the context manager flushes and closes the file.
with open("splitted_corpus.pickle", "wb") as splitted_file:
    pickle.dump(final_corpus, splitted_file)

### Problems:

* use of a heuristic that is not completely safe
* how to solve the range of years problem?

In [None]:
# Reload the split corpus; the context manager closes the file handle.
# NOTE(review): unpickling can execute arbitrary code - only load a trusted file.
with open("splitted_corpus.pickle", "rb") as splitted_file:
    final_corpus = pickle.load(splitted_file)

In [None]:
final_corpus[1035]

In [None]:
final_corpus[173]

In [None]:
# ad hoc solution
# Overwrite the year of the entries at positions 141-144 with the string '1903'.
# NOTE(review): the positions are hard-coded, so they are only right for the
# exact ordering of final_corpus loaded above - re-check them if the split changes.
for i in range(141,145):
    final_corpus[i]['year: ']='1903'

In [None]:
# Remove the entry at position 171; every later entry moves down by one.
# NOTE(review): not idempotent - running this cell again removes another entry.
final_corpus.pop(171)

In [None]:
# ad hoc solution
# Overwrite the year of the entries at positions 1035-1038 with the string '1918'.
# NOTE(review): the positions are hard-coded and refer to the list as it is
# when this cell runs (i.e. after any earlier removal) - re-check if the split changes.
for i in range(1035,1039):
    final_corpus[i]['year: ']='1918'

In [None]:
#problem solved
# (position, year) of every entry whose year is not exactly four characters
# long. enumerate() reports each entry's own position; list.index() would
# compare whole records and return the first equal one.
[(position, work['year: ']) for position, work in enumerate(final_corpus) if len(str(work['year: '])) != 4]

# PROBLEMS 2.0

### YEARS DISTRIBUTION

In [None]:
# Count how many works fall in each year. The dict is filled in one pass and
# keeps the years in order of first appearance; `years` and `count` are the
# matching parallel lists.
years_count = {}
for work in final_corpus:
    y = int(work['year: '])
    years_count[y] = years_count.get(y, 0) + 1

years = list(years_count)
count = list(years_count.values())

In [None]:
years_count

## MISSED SPLIT

In [None]:
# Titles longer than 500 characters, in corpus order.
problems = [work['title: '] for work in final_corpus if len(work['title: ']) > 500]

In [None]:
len(problems)

In [None]:
problems[12]

In [None]:
problems[2]

#### Some titles had ingested text because the pattern was missed; as they seem to be an irrelevant part of the whole corpus, I just get rid of them

In [None]:
# Keep every work whose title is not over-long. The bound is inclusive so that
# this is the exact complement of the "longer than 500 characters" check:
# a title of exactly 500 characters is kept.
final_corpus = [work for work in final_corpus if len(work['title: ']) <= 500]

In [None]:
# Persist the filtered corpus; the context manager flushes and closes the file.
with open("splitted_corpus.pickle", "wb") as splitted_file:
    pickle.dump(final_corpus, splitted_file)