In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import brown, stopwords

In [2]:
# import corpus here
# nltk.download() 

In [3]:
print(brown.categories())


['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


## I chose to only use 5 categories in my work here because I did not want overload my kernel as I am on a MacBookAir and do not have unlimited space in memory. 

In [50]:
raw_adventure = brown.sents(categories='adventure')
raw_lore = brown.sents(categories='lore')
raw_mystery = brown.sents(categories='mystery')
raw_romance = brown.sents(categories='romance')
raw_science_fiction = brown.sents(categories='science_fiction')



In [65]:
print(len(raw_adventure))
print(len(raw_lore))
print(len(raw_mystery))
print(len(raw_romance))
print(len(raw_science_fiction))

4637
4881
3886
4431
948


In [73]:
raw_fiction = brown.sents(categories='fiction')
len(raw_fiction)

4249

In [67]:
joined_adventures = [" ".join(sent) for sent in raw_adventure]
joined_lore = [" ".join(sent) for sent in raw_lore]
joined_mystery = [" ".join(sent) for sent in raw_mystery]
joined_romance = [" ".join(sent) for sent in raw_romance]
joined_fiction = [" ".join(sent) for sent in raw_fiction]

In [79]:
joined_fiction[10]

'They ate the cafeteria food with its orange sauces and Scotty gazed without interest at his food , the teachers , the heroic baronial windows , and the bright ranks of college banners .'

In [68]:
from string import punctuation
table = str.maketrans({key:None for key in punctuation})
cleaned_adventure = [sent.translate(table) for sent in joined_adventures]
cleaned_lore = [sent.translate(table) for sent in joined_lore]
cleaned_mystery = [sent.translate(table) for sent in joined_mystery]
cleaned_romance = [sent.translate(table) for sent in joined_romance]
cleaned_fiction = [sent.translate(table) for sent in joined_fiction]

In [69]:
cleaned_fiction[0]

'Thirtythree'

In [42]:
def text_cleaner(text):
    text = str(text)
    #text = text.lower()
    text = ' '.join(text.split())
    
    
    return text
    
clean_adventure = text_cleaner(cleaned_adventure)
#clean_lore = text_cleaner(raw_lore)
#clean_mystery = text_cleaner(raw_mystery)
#clean_romance = text_cleaner(raw_romance)
#clean_science_fiction = text_cleaner(raw_science_fiction)


In [58]:
cleaned_lore[0]

'In American romance  almost nothing rates higher than what the movie men have called  meeting cute   that is  boymeetsgirl seems more adorable if it doesnt take place in an atmosphere of correct and acute boredom '

In [11]:
nlp = spacy.load('en')

In [59]:
adventure_doc = nlp(str(cleaned_adventure))

In [60]:
lore_doc = nlp(str(cleaned_lore))

In [None]:
mystery_doc = nlp(str(cleaned_mystery))

In [None]:
romance_doc = nlp(str(cleaned_romance))

In [43]:
fiction_doc = nlp(str(cleaned_fiction))

KeyboardInterrupt: 

In [None]:
adventure_sents = [[sent, "adventure"] for sent in cleaned_adventure]
lore_sents = [[sent, "lore"] for sent in cleaned_lore]
mystery_sents = [[sent, "mystery"] for sent in cleaned_mystery]
romance_sents = [[sent, "romance"] for sent in cleaned_romance]
fiction_sents = [[sent, "fiction"] for sent in cleaned_fiction]


In [41]:
adventure_sents[0]

['Dan Morgan told himself he would forget Ann Turner ', 'adventure']

In [None]:
from collections import Counter

In [149]:
# Utility function to create a list of the 3000 most common words.
# changed word cound from 2000 to 200 because we aren't comparing works from
# two different authors but from 5 different genres.

def bag_of_words(text):
    
    # Filter out punctuation and stop words.
    allwords = text.split()
    #allwords = [word
    #            for word in text]
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(200)]
    

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = [sent for (sent, label) in sentences]
    df['text_source'] = [label for (sent, label) in sentences]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        sentence = nlp(sentence)
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 1000 == 0:
            print("Processing row {}".format(i))
            
    return df


In [None]:
adventure_words = bag_of_words(adventure_doc)
len(adventure_words)

In [11]:
adventure_text = brown.words(categories='adventure')
lore_text = brown.words(categories='lore')
mystery_text = brown.words(categories='mystery')
romance_text = brown.words(categories='romance')
fiction_text = brown.words(categories='fiction')

common_words = set(adventure_text + lore_text + mystery_text + romance_text + science_fiction_text)

In [85]:
len(common_words)

24582

In [84]:
word_counts = bow_features(sentences, common_words)

AttributeError: 'str' object has no attribute 'is_punct'

In [None]:
# set up pipeline pieces here
from sklearn import ensemble
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

steps = []


In [None]:
#help(nltk.tokenize)

In [None]:
# set up pipeline here
from sklearn.pipeline import Pipeline


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(emma_paras, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.5, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice
                             stop_words='english', 
                             lowercase=True, #convert everything to lower case (since Alice in Wonderland has the HABIT of CAPITALIZING WORDS for EMPHASIS)
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
emma_paras_tfidf=vectorizer.fit_transform(emma_paras)
print("Number of features: %d" % emma_paras_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(emma_paras_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
#(change this line) lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:5])




In [None]:
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])
