In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import brown, stopwords

In [2]:
# import corpus here
# nltk.download() 

In [3]:
print(brown.categories())


['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


## I chose to only use 5 categories in my work here because I did not want overload my kernel as I am on a MacBookAir and do not have unlimited space in memory. 

In [39]:
raw_adventure = brown.sents(categories='adventure')
raw_lore = brown.raw(categories='lore')
raw_mystery = brown.raw(categories='mystery')
raw_romance = brown.raw(categories='romance')
raw_science_fiction = brown.raw(categories='science_fiction')



In [114]:
rawraw_adventure = brown.words(categories='adventure')
rawraw_adventure[:30]

['Dan',
 'Morgan',
 'told',
 'himself',
 'he',
 'would',
 'forget',
 'Ann',
 'Turner',
 '.',
 'He',
 'was',
 'well',
 'rid',
 'of',
 'her',
 '.',
 'He',
 'certainly',
 "didn't",
 'want',
 'a',
 'wife',
 'who',
 'was',
 'fickle',
 'as',
 'Ann',
 '.',
 'If']

In [67]:
joined_adventures = [" ".join(sent) for sent in raw_adventure]
joined_lore = [" ".join(sent) for sent in raw_lore]
joined_mystery = [" ".join(sent) for sent in raw_mystery]
joined_romance = [" ".join(sent) for sent in raw_romance]
joined_science_fiction = [" ".join(sent) for sent in raw_science_fiction]

In [115]:
joined_adventures[:20]

['Dan Morgan told himself he would forget Ann Turner .',
 'He was well rid of her .',
 "He certainly didn't want a wife who was fickle as Ann .",
 "If he had married her , he'd have been asking for trouble .",
 'But all of this was rationalization .',
 'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .',
 'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing .',
 "The easiest thing would be to sell out to Al Budd and leave the country , but there was a stubborn streak in him that wouldn't allow it .",
 'The best antidote for the bitterness and disappointment that poisoned him was hard work .',
 'He found that if he was tired enough at night , he went to sleep simply because he was too exhausted to stay awake .',
 'Each day he found himself thinking less often of Ann ; ;',
 'each day the hurt was a little duller , a little less poignant .',
 'He had plenty of work to do .',
 'Bec

In [70]:
from string import punctuation
table = str.maketrans({key:None for key in punctuation})
cleaned_adventure = [sent.translate(table) for sent in joined_adventures]
cleaned_lore = [sent.translate(table) for sent in joined_lore]
cleaned_mystery = [sent.translate(table) for sent in joined_mystery]
cleaned_romance = [sent.translate(table) for sent in joined_romance]
cleaned_science_fiction = [sent.translate(table) for sent in joined_science_fiction]


In [98]:
cleaned_adventure[:10]

['Dan Morgan told himself he would forget Ann Turner ',
 'He was well rid of her ',
 'He certainly didnt want a wife who was fickle as Ann ',
 'If he had married her  hed have been asking for trouble ',
 'But all of this was rationalization ',
 'Sometimes he woke up in the middle of the night thinking of Ann  and then could not get back to sleep ',
 'His plans and dreams had revolved around her so much and for so long that now he felt as if he had nothing ',
 'The easiest thing would be to sell out to Al Budd and leave the country  but there was a stubborn streak in him that wouldnt allow it ',
 'The best antidote for the bitterness and disappointment that poisoned him was hard work ',
 'He found that if he was tired enough at night  he went to sleep simply because he was too exhausted to stay awake ']

In [48]:
def text_cleaner(text):
    # Visual inspection identifies a form of punctuation spaCy does not
    # recognize: the double dash '--'.  Better get rid of it now!
    text = str(text)
    text = ' '.join(text.split())
    text = re.sub('[\',]','', text)
    text = re.sub("[\[]*?![\]]", "", text)
    
    return text
    
#clean_adventure = text_cleaner(raw_adventure)
#clean_lore = text_cleaner(raw_lore)
#clean_mystery = text_cleaner(raw_mystery)
#clean_romance = text_cleaner(raw_romance)
#clean_science_fiction = text_cleaner(raw_science_fiction)


In [80]:
nlp = spacy.load('en')

adventure_doc = nlp(str(joined_adventures))
#lore_doc = nlp(raw_lore)
#mystery_doc = nlp(raw_mystery)
#romance_doc = nlp(raw_romance)
#science_fiction_doc = nlp(raw_science_fiction)

In [86]:
adventure_doc[:100]

['Dan Morgan told himself he would forget Ann Turner .', 'He was well rid of her .', "He certainly didn't want a wife who was fickle as Ann .", "If he had married her , he'd have been asking for trouble .", 'But all of this was rationalization .', 'Sometimes he woke up in the middle of the night thinking of Ann , and then could not get back to sleep .', 'His plans and dreams had revolved

In [83]:
adventure_sents = [[sent, "adventure"] for sent in joined_adventures]
#lore_sents = [[sent, "lore"] for sent in lore_doc]
#mystery_sents = [[sent, "mystery"] for sent in mystery_doc]
#romance_sents = [[sent, "romance"] for sent in romance_doc]
#science_fiction_sents = [[sent, "science_fiction"] for sent in science_fiction_doc]

sentences = pd.DataFrame(adventure_sents)

sentences.head()

Unnamed: 0,0,1
0,Dan Morgan told himself he would forget Ann Tu...,adventure
1,He was well rid of her .,adventure
2,He certainly didn't want a wife who was fickle...,adventure
3,"If he had married her , he'd have been asking ...",adventure
4,But all of this was rationalization .,adventure


In [None]:
from collections import Counter

In [117]:
# Utility function to create a list of the 3000 most common words.
# upped word cound from 2000 to 3000 because we aren't comparing works from
# two different authors but from 5 different genres.

punctuation = [',', '.', '\'', '\"']

def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = [word
                for word in text
                if not punctuation]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(3000)]
    

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and not token.is_stop
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 500 == 0:
            print("Processing row {}".format(i))
            
    return df


In [122]:
adventure_words = bag_of_words(joined_adventures)
adventure_words

[]

In [11]:
adventure_text = brown.words(categories='adventure')
lore_text = brown.words(categories='lore')
mystery_text = brown.words(categories='mystery')
romance_text = brown.words(categories='romance')
science_fiction_text = brown.words(categories='science_fiction')

common_words = set(adventure_text + lore_text + mystery_text + romance_text + science_fiction_text)

In [85]:
len(common_words)

24582

In [84]:
word_counts = bow_features(sentences, common_words)

AttributeError: 'str' object has no attribute 'is_punct'

In [None]:
# set up pipeline pieces here
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

steps = []


In [None]:
help(nltk.tokenize)

In [None]:
# set up pipeline here
from sklearn.pipeline import Pipeline


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
#(change this line) lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:5])




In [None]:
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])
