# Notebook Setup

In [None]:
# Mount Google Drive inside the Colab runtime.
from google.colab import drive
ROOT = "/content/drive"  # mount point handed to drive.mount below
print(ROOT)
drive.mount(ROOT)

# Change the working directory to the notebook folder on the mounted drive.
%cd /content/drive/My Drive/CompHumanities/Data/Notebooks

In [None]:
# Print the current working directory via the shell.
!pwd

# Importing Libraries

In [None]:
##NLTK##
import nltk
#nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 


##STOPWORDS##
import spacy
from gensim.parsing.preprocessing import remove_stopwords
# import en_core_web_sm
# nlp = en_core_web_sm.load()


##TRANSFORMERS##
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

##HATHI##
#!pip install htrc-feature-reader
import glob
import random
from htrc_features import FeatureReader, Volume, Page

##WIKI##
#!pip install wikipedia
import wikipedia
import sys
from gensim.corpora import WikiCorpus

##OTHERS##
import numpy as np
import pandas as pd
from os import listdir, path
from os.path import isfile, join
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from scipy import sparse
import operator
from tqdm import tqdm
from collections import Counter
from nltk import FreqDist
# Seed the stdlib and NumPy global random generators with a fixed value;
# the random.sample calls later in the notebook draw from the stdlib one.
random.seed(0)
np.random.seed(0)

In [None]:
from sklearn.model_selection import train_test_split

## Quick PKL IMPORTS

In [None]:
#FictionDF = pd.read_pickle('FictionDF.pkl')

In [None]:
#NonFicDF = pd.read_pickle('NonFicDF.pkl')

# Data Pipeline

## Hathi Data

The data we'll use comes from the [HathiTrust Extracted Features dataset](https://analytics.hathitrust.org/datasets).

We've sampled **500 fiction volumes** from Ted Underwood's [metadata](https://github.com/tedunderwood/hathimetadata). This fiction data will act as the non-dream data.

In [None]:
def get_feature_reader(path):
    """Build a FeatureReader over the ``bz2`` files found directly in ``path``.

    Parameters
    ----------
    path : str
        Directory whose entries ending in ``bz2`` are passed to the reader.

    Returns
    -------
    FeatureReader
        Reader constructed from the matched file paths, in sorted order.
    """
    # glob yields matches in arbitrary, filesystem-dependent order. Sorting
    # keeps the volume order (and so the row order of anything built from it)
    # identical between runs and machines.
    # ``recursive=True`` is retained for compatibility; it only has an effect
    # when ``path`` itself contains a ``**`` component.
    paths = sorted(glob.glob(path+"/*bz2", recursive=True))
    return FeatureReader(paths)

In [None]:
def get_data(vols, label, total=500):
    """Collect the chunked token lists of every volume into one DataFrame.

    Parameters
    ----------
    vols : object
        Must expose ``volumes()``; each yielded volume must provide
        ``tokenlist(chunk=True, chunk_target=...)`` returning a DataFrame.
    label : any
        Accepted for call-site compatibility but not used by this function.
        NOTE(review): presumably meant to become a class-label column - confirm.
    total : int, optional
        Expected number of volumes; only sets the progress-bar length.
        Defaults to 500, the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation of each volume's token list after
        ``reset_index()``.

    Raises
    ------
    ValueError
        If ``vols`` yields no volumes.
    """
    frames = []

    for vol in tqdm(vols.volumes(), total=total):
        chunks = vol.tokenlist(chunk=True, chunk_target=250)
        frames.append(chunks.reset_index())

    if not frames:
        # pd.concat would raise the same exception type with a message that
        # does not point at the likely cause (an empty or wrong input folder).
        raise ValueError("get_data: no volumes to read; check the input path")

    # NOTE(review): frames are concatenated without a volume identifier. If
    # chunk numbers restart for each volume, a later groupby('chunk') merges
    # chunks from different volumes - TODO confirm.
    return pd.concat(frames)

**Fiction Data**, label = 1

In [None]:
# Read every volume under data/fiction into one token table (label 1).
FictionDF = get_data(vols=get_feature_reader(path="data/fiction"), label=1)

100%|██████████| 500/500 [02:43<00:00,  3.05it/s]


In [None]:
# Drop the 'section' column and rename 'count' to 'counts'.
# errors='ignore' keeps the cell re-runnable: FictionDF is overwritten in
# place, so on a second run 'section' is already gone and a plain drop
# would raise KeyError.
FictionDF = FictionDF.drop(columns='section', errors='ignore').rename(columns={'count':'counts'})
FictionDF.head()

Unnamed: 0,chunk,token,pos,counts
0,1,'','',1
1,1,",",",",17
2,1,.,.,8
3,1,.,UNK,2
4,1,1989,CD,3


In [None]:
#Only pick those chunks which are having less than or equal to 250 tokens
FictionDFGROUP  = FictionDF.groupby('chunk').count()
FictionDFGROUP = FictionDFGROUP[FictionDFGROUP['token']<=250]
FictionDFGROUP = FictionDFGROUP.reset_index()

In [None]:
#Randomly select 300 chunks for our sample
NumChunks = 300 if len(FictionDFGROUP) > 300 else len(FictionDFGROUP)
randChunks = random.sample(list(FictionDFGROUP['chunk'].unique()), NumChunks)
FicDFSample = FictionDF[FictionDF.chunk.isin(randChunks)]

In [None]:
#Sanity Check
FicDFSample.groupby('chunk').count()

Unnamed: 0_level_0,token,pos,counts
chunk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
744,241,241,241
758,189,189,189
761,170,170,170
763,178,178,178
764,180,180,180
...,...,...,...
1385,192,192,192
1386,185,185,185
1390,202,202,202
1392,191,191,191


In [None]:
# Persist the full fiction token table.
FictionDF.to_pickle(path='FictionDF.pkl')

In [None]:
# Persist the sampled fiction chunks.
FicDFSample.to_pickle(path='FicDFSample.pkl')

**Non-Fiction Data**, label = 0

In [None]:
# Read every volume under data/nonfiction into one token table (label 0).
NonFicDF = get_data(vols=get_feature_reader(path="data/nonfiction"), label=0)

100%|██████████| 500/500 [03:17<00:00,  2.53it/s]


In [None]:
# Drop the 'section' column and rename 'count' to 'counts'.
# errors='ignore' keeps the cell re-runnable: NonFicDF is overwritten in
# place, so on a second run 'section' is already gone and a plain drop
# would raise KeyError.
NonFicDF = NonFicDF.drop(columns='section', errors='ignore').rename(columns={'count':'counts'})
NonFicDF.head()

Unnamed: 0,chunk,token,pos,counts
0,1,""".:***",UNK,1
1,1,','',2
2,1,',POS,1
3,1,",",",",13
4,1,-,:,1


In [None]:
#Only pick those chunks which are having less than or equal to 250 tokens
NonFicDFGROUP  = NonFicDF.groupby('chunk').count()
NonFicDFGROUP = NonFicDFGROUP[NonFicDFGROUP['token']<=250]
NonFicDFGROUP = NonFicDFGROUP.reset_index()

In [None]:
#Randomly select 300 chunks for our sample
NumChunks = 300 if len(NonFicDFGROUP) > 300 else len(NonFicDFGROUP)
randChunks = random.sample(list(NonFicDFGROUP['chunk'].unique()), NumChunks)
NonFicDFSample = NonFicDF[NonFicDF.chunk.isin(randChunks)]

In [None]:
#Sanity Check
NonFicDFSample.groupby('chunk').count()

Unnamed: 0_level_0,token,pos,counts
chunk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1043,118,118,118
1044,206,206,206
1045,231,231,231
1046,245,245,245
1048,250,250,250
1051,222,222,222
1054,243,243,243
1055,182,182,182
1056,190,190,190
1057,207,207,207


In [None]:
# Persist the full non-fiction token table.
NonFicDF.to_pickle(path='NonFicDF.pkl')

In [None]:
# Persist the sampled non-fiction chunks.
NonFicDFSample.to_pickle(path='NonFicDFSample.pkl')

## DreamBank Data

In [None]:
# Load the DreamBank export from the working directory.
dreams = pd.read_csv(filepath_or_buffer="DreamBank.csv")

In [None]:
# Lower-case each 'content' entry and split it into NLTK word tokens.
dreams['split'] = dreams['content'].map(lambda text: nltk.word_tokenize(text.lower()))

In [None]:
# Token -> frequency mapping for each tokenised entry.
dreams['counts'] = dreams['split'].apply(lambda tokens: Counter(tokens))

In [None]:
# Keep only the token list and the per-entry counts; every other column
# (including 'content') is dropped from `dreams` here.
dreams = dreams[['split', 'counts']]

In [None]:
# Expand the per-entry counters into a wide table (one column per token),
# then stack it into long form with columns chunk / tokens / counts, where
# 'chunk' is the positional index of the entry.
dreamsDF = (
    pd.DataFrame.from_records(dreams['counts'].tolist())
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'chunk', 'level_1': 'tokens', 0: 'counts'})
)

In [None]:
# Tag every token on its own: pos_tag receives a one-element list, so the
# tagger sees no neighbouring words.
dreamsDF['pos'] = dreamsDF['tokens'].map(lambda tok: nltk.pos_tag([tok])[0][1])

In [None]:
#Only pick those chunks which are having less than or equal to 250 tokens
dreamsDFGROUP  = dreamsDF.groupby('chunk').count()
dreamsDFGROUP = dreamsDFGROUP[dreamsDFGROUP['tokens']<=250]
dreamsDFGROUP = dreamsDFGROUP.reset_index()

In [None]:
#Randomly select 300 chunks for our sample
NumChunks = 300 if len(dreamsDFGROUP) > 300 else len(dreamsDFGROUP)
randChunks = random.sample(list(dreamsDFGROUP['chunk'].unique()), NumChunks)
dreamsDFSample = dreamsDF[dreamsDF.chunk.isin(randChunks)]

In [None]:
#Sanity Check
dreamsDFSample.groupby('chunk').count()

Unnamed: 0_level_0,tokens,counts,pos
chunk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
44,73,73,73
46,177,177,177
78,74,74,74
176,149,149,149
237,56,56,56
...,...,...,...
25966,33,33,33
26104,144,144,144
26184,124,124,124
26262,136,136,136


In [None]:
#dreamsDF = pd.read_pickle('dreamsDF.pkl')

In [None]:
#dreamsDF.to_pickle('dreamsDF.pkl')

In [None]:
# Persist the sampled dream chunks.
dreamsDFSample.to_pickle(path='dreamsDFSample.pkl')

## Wiki Data

In [None]:
# def make_corpus(in_f, out_f):

#     """Convert Wikipedia xml dump file to text corpus"""

#     output = open(out_f, 'w')
#     wiki = WikiCorpus(in_f)

#     i = 0
#     for text in wiki.get_texts():
#         output.write(bytes(' '.join(text), 'utf-8').decode('utf-8') + '\n')
#         i = i + 1
#         if (i % 10000 == 0):
#             print('Processed ' + str(i) + ' articles')
#     output.close()
#     print('Processing complete!')

##Reading corpus##

# in_f = "enwiki-latest-pages-articles.xml.bz2"
# out_f = "wiki_en.txt"
# make_corpus(in_f, out_f)

In [None]:
# Read the text dump with no header row, then name column 0 'content'.
wiki = pd.read_csv("wiki_en.txt", header=None)
wiki = wiki.rename(columns={0: 'content'})

In [None]:
def _split_into_word_chunks(text, size=250):
    """Split ``text`` on whitespace and re-join it in runs of ``size`` words.

    Returns a list of strings; the last one may hold fewer than ``size`` words.
    """
    words = text.split()  # split once instead of once per slice
    return [' '.join(words[i:i + size]) for i in range(0, len(words), size)]


# The slice end previously used an undefined name `n`, which raises NameError
# on a fresh kernel. The window now uses the same 250 as the range step.
wiki['content'] = wiki['content'].apply(_split_into_word_chunks)

In [None]:
# One row per list element of 'content'.
wiki = wiki.explode(column='content')

In [None]:
# Lower-case each 'content' entry and split it into NLTK word tokens.
wiki['split'] = wiki['content'].map(lambda text: nltk.word_tokenize(text.lower()))

In [None]:
# Token -> frequency mapping for each tokenised entry.
wiki['counts'] = wiki['split'].apply(lambda tokens: Counter(tokens))

In [None]:
# Keep only the token list and the per-entry counts.
wiki = wiki[['split', 'counts']]

In [None]:
# Expand the per-entry counters into a wide table (one column per token),
# then stack it into long form with columns chunk / tokens / counts, where
# 'chunk' is the positional index of the entry.
wikiDF = (
    pd.DataFrame.from_records(wiki['counts'].tolist())
    .stack()
    .reset_index()
    .rename(columns={'level_0': 'chunk', 'level_1': 'tokens', 0: 'counts'})
)

In [None]:
# Tag every token on its own: pos_tag receives a one-element list, so the
# tagger sees no neighbouring words.
wikiDF['pos'] = wikiDF['tokens'].map(lambda tok: nltk.pos_tag([tok])[0][1])

In [None]:
# Preview the first five rows of the long-form wiki table.
wikiDF.head(n=5)

Unnamed: 0,chunk,tokens,counts,pos
0,0,anarchism,5.0,NN
1,0,is,4.0,VBZ
2,0,political,2.0,JJ
3,0,philosophy,1.0,NN
4,0,and,10.0,CC


In [None]:
# Persist the full wiki token table.
wikiDF.to_pickle(path='wikiDF.pkl')

In [None]:
#Only pick those chunks which are having less than or equal to 250 tokens
wikiDFGROUP  = wikiDF.groupby('chunk').count()
wikiDFGROUP = wikiDFGROUP[wikiDFGROUP['tokens']<=250]
wikiDFGROUP = wikiDFGROUP.reset_index()

In [None]:
#Randomly select 300 chunks for our sample
NumChunks = 300 if len(wikiDFGROUP) > 300 else len(wikiDFGROUP)
randChunks = random.sample(list(wikiDFGROUP['chunk'].unique()), NumChunks)
wikiDFSample = wikiDF[wikiDF.chunk.isin(randChunks)]

In [None]:
#Sanity Check
wikiDFSample.groupby('chunk').count()

Unnamed: 0_level_0,tokens,counts,pos
chunk,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
24,151,151,151
116,130,130,130
298,151,151,151
353,156,156,156
381,147,147,147
...,...,...,...
11932,165,165,165
11946,147,147,147
11976,154,154,154
12010,149,149,149


In [None]:
# Persist the sampled wiki chunks.
wikiDFSample.to_pickle(path='wikiDFSample.pkl')

# REMOVING STOPWORDS

In [None]:
def stopword_gensim(text):
    """Return ``text`` after passing it through gensim's ``remove_stopwords``."""
    return remove_stopwords(text)

# PIPELINE

In [None]:
# Strip stopwords from every entry of dreamsDF.content.
# NOTE(review): dreamsDF as built earlier in this notebook has the columns
# chunk / tokens / counts / pos and no 'content' column, so on a fresh kernel
# this line fails. Presumably it expects a differently shaped dreamsDF (e.g.
# one loaded from a pickle) - confirm the intended input.
dreamsDF.content = dreamsDF.content.apply(stopword_gensim)

In [None]:
# Bag-of-words counts -> TF-IDF weighting -> multinomial naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# NOTE(review): dreamsDF as built earlier in this notebook has neither a
# 'content' nor a 'target' column - confirm where these are meant to come from.
text_clf = text_clf.fit(dreamsDF.content, dreamsDF.target)
text_clf

Pipeline(steps=[('vect', CountVectorizer()), ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())])