In [117]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import brown, stopwords

In [2]:
# import corpus here
# nltk.download() 

In [3]:
print(brown.categories())


['adventure', 'belles_lettres', 'editorial', 'fiction', 'government', 'hobbies', 'humor', 'learned', 'lore', 'mystery', 'news', 'religion', 'reviews', 'romance', 'science_fiction']


## I chose to only use 5 categories in my work here because I did not want overload my kernel as I am on a MacBookAir and do not have unlimited space in memory. 

In [50]:
raw_adventure = brown.sents(categories='adventure')
raw_lore = brown.sents(categories='lore')
raw_mystery = brown.sents(categories='mystery')
raw_romance = brown.sents(categories='romance')
raw_science_fiction = brown.sents(categories='science_fiction')



In [65]:
print(len(raw_adventure))
print(len(raw_lore))
print(len(raw_mystery))
print(len(raw_romance))
print(len(raw_science_fiction))

4637
4881
3886
4431
948


In [88]:
joined_adventure = [" ".join(sent) for sent in raw_adventure]
joined_lore = [" ".join(sent) for sent in raw_lore]
joined_mystery = [" ".join(sent) for sent in raw_mystery]
joined_romance = [" ".join(sent) for sent in raw_romance]


In [68]:
from string import punctuation
table = str.maketrans({key:None for key in punctuation})
cleaned_adventure = [sent.translate(table) for sent in joined_adventures]
cleaned_lore = [sent.translate(table) for sent in joined_lore]
cleaned_mystery = [sent.translate(table) for sent in joined_mystery]
cleaned_romance = [sent.translate(table) for sent in joined_romance]


In [92]:
def text_cleaner(text):
    text = str(text)
    text = text.lower()
    text = ' '.join(text.split())
    
    
    return text
    
clean_adventure = text_cleaner(cleaned_adventure)
#clean_lore = text_cleaner(raw_lore)
#clean_mystery = text_cleaner(raw_mystery)
#clean_romance = text_cleaner(raw_romance)


In [107]:
cleaned_adventure[0]

'Dan Morgan told himself he would forget Ann Turner '

In [137]:
adventure_sents = [[sent, "adventure"] for sent in cleaned_adventure]
lore_sents = [[sent, "lore"] for sent in cleaned_lore]
mystery_sents = [[sent, "mystery"] for sent in cleaned_mystery]
romance_sents = [[sent, "romance"] for sent in cleaned_romance]

sentences = pd.DataFrame(adventure_sents +
                         lore_sents +
                         mystery_sents +
                         romance_sents)
sentences.head()

Unnamed: 0,0,1
0,Dan Morgan told himself he would forget Ann Tu...,adventure
1,He was well rid of her,adventure
2,He certainly didnt want a wife who was fickle ...,adventure
3,If he had married her hed have been asking fo...,adventure
4,But all of this was rationalization,adventure


In [11]:
nlp = spacy.load('en')

In [83]:
from collections import Counter

In [112]:
# Utility function to create a list of the 3000 most common words.
# changed word cound from 2000 to 200 because we aren't comparing works from
# two different authors but from 5 different genres.

def bag_of_words(text):
    
    # make text string, pull out each word
    text = str(text)
    allwords = text.split()
    
    # Return the most common words.
    return [item[0] for item in Counter(allwords).most_common(200)]
    

def bow_features(sentences, common_words):
    
    # Scaffold the data frame and initialize counts to zero.
    df = pd.DataFrame(columns=common_words)
    df['text_sentence'] = sentences[0]
    df['text_source'] = sentences[1]
    df.loc[:, common_words] = 0
    
    # Process each row, counting the occurrence of words in each sentence.
    for i, sentence in enumerate(df['text_sentence']):
        
        # Convert the sentence to lemmas, then filter out punctuation,
        # stop words, and uncommon words.
        sentence = nlp(sentence)
        words = [token.lemma_
                 for token in sentence
                 if (
                     not token.is_punct
                     and token.lemma_ in common_words
                 )]
        
        # Populate the row with word counts.
        for word in words:
            df.loc[i, word] += 1
        
        # This counter is just to make sure the kernel didn't hang.
        if i % 1000 == 0:
            print("Processing row {}".format(i))
            
    return df


In [99]:
adventure_words = bag_of_words(cleaned_adventure)
print(len(adventure_words))


200


In [None]:
test_clean = bag_of_words()

In [139]:
lore_words = bag_of_words(cleaned_lore)
mystery_words = bag_of_words(cleaned_mystery)
romance_words = bag_of_words(cleaned_romance)

common_words = set(adventure_words +
                   lore_words +
                   mystery_words +
                   romance_words)

len(common_words)

296

In [113]:
word_counts = bow_features(sentences, common_words)

Processing row 0
Processing row 1000
Processing row 2000
Processing row 3000
Processing row 4000
Processing row 5000
Processing row 6000
Processing row 7000
Processing row 8000
Processing row 9000
Processing row 10000
Processing row 11000
Processing row 12000
Processing row 13000
Processing row 14000
Processing row 15000
Processing row 16000
Processing row 17000
Processing row 18000
Processing row 19000
Processing row 20000
Processing row 21000
Processing row 22000


# make tf-idf df

In [149]:
actually_raw_adventure = brown.raw(categories='adventure')

In [150]:
actually_raw_lore = brown.raw(categories='lore')
actually_raw_mystery = brown.raw(categories='mystery')
actually_raw_romance = brown.raw(categories='romance')



In [151]:
actually_raw_adventure = actually_raw_adventure.split('\n\n\n\t')
actually_raw_lore = actually_raw_lore.split('\n\n\n\t')
actually_raw_mystery = actually_raw_mystery.split('\n\n\n\t')
actually_raw_romance = actually_raw_romance.split('\n\n\n\t')

In [189]:
# join separeted paragraphs here
print("adventure length: ", len(actually_raw_adventure))
print("lore length: ", len(actually_raw_lore))
print("mystery length: ", len(actually_raw_mystery))
print("romance length: ", len(actually_raw_romance))


adventure length:  1307
lore length:  1042
mystery length:  1080
romance length:  1154


In [192]:
print("adventure length: ", len(actually_raw_adventure))
combined = actually_raw_adventure + actually_raw_lore
print("adventure + lore length: ", len(combined))
combined = combined + actually_raw_mystery
print("with mystery length: ", len(combined))
combined = combined + actually_raw_romance
print("with romance length: ", len(combined))


adventure length:  1307
adventure + lore length:  2349
with mystery length:  3429
with romance length:  4583


In [207]:
actually_raw_adventure[0]

"\n\n\tDan/np Morgan/np told/vbd himself/ppl he/pps would/md forget/vb Ann/np Turner/np ./.\nHe/pps was/bedz well/rb rid/jj of/in her/ppo ./.\nHe/pps certainly/rb didn't/dod* want/vb a/at wife/nn who/wps was/bedz fickle/jj as/cs Ann/np ./.\nIf/cs he/pps had/hvd married/vbn her/ppo ,/, he'd/pps+md have/hv been/ben asking/vbg for/in trouble/nn ./."

In [208]:
combined[0]

"\n\n\tDan/np Morgan/np told/vbd himself/ppl he/pps would/md forget/vb Ann/np Turner/np ./.\nHe/pps was/bedz well/rb rid/jj of/in her/ppo ./.\nHe/pps certainly/rb didn't/dod* want/vb a/at wife/nn who/wps was/bedz fickle/jj as/cs Ann/np ./.\nIf/cs he/pps had/hvd married/vbn her/ppo ,/, he'd/pps+md have/hv been/ben asking/vbg for/in trouble/nn ./."

In [199]:
len(combined)

4583

In [216]:
combined[:4]

["\n\n\tDan/np Morgan/np told/vbd himself/ppl he/pps would/md forget/vb Ann/np Turner/np ./.\nHe/pps was/bedz well/rb rid/jj of/in her/ppo ./.\nHe/pps certainly/rb didn't/dod* want/vb a/at wife/nn who/wps was/bedz fickle/jj as/cs Ann/np ./.\nIf/cs he/pps had/hvd married/vbn her/ppo ,/, he'd/pps+md have/hv been/ben asking/vbg for/in trouble/nn ./.", "But/cc all/abn of/in this/dt was/bedz rationalization/nn ./.\nSometimes/rb he/pps woke/vbd up/rp in/in the/at middle/nn of/in the/at night/nn thinking/vbg of/in Ann/np ,/, and/cc then/rb could/md not/* get/vb back/rb to/in sleep/nn ./.\nHis/pp$ plans/nns and/cc dreams/nns had/hvd revolved/vbn around/in her/ppo so/ql much/rb and/cc for/in so/ql long/jj that/cs now/rb he/pps felt/vbd as/cs if/cs he/pps had/hvd nothing/pn ./.\nThe/at easiest/jjt thing/nn would/md be/be to/to sell/vb out/rp to/in Al/np Budd/np and/cc leave/vb the/at country/nn ,/, but/cc there/ex was/bedz a/at stubborn/jj streak/nn in/in him/ppo that/dt wouldn't/md* allow/vb it

In [217]:
combined = combined.split(',')

AttributeError: 'list' object has no attribute 'split'

In [202]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

X_train, X_test = train_test_split(actually_raw_adventure, test_size=0.4, random_state=0)

vectorizer = TfidfVectorizer(max_df=0.75, # drop words that occur in more than half the paragraphs
                             min_df=2, # only use words that appear at least twice, 
                             lowercase=False, #convert everything to lower case 
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm=u'l2', #Applies a correction factor so that longer paragraphs and shorter paragraphs get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies. Prevents divide-by-zero errors
                            )


#Applying the vectorizer
brown_sents_tfidf=vectorizer.fit_transform(actually_raw_adventure)
print("Number of features: %d" % brown_sents_tfidf.get_shape()[1])

#splitting into training and test sets
X_train_tfidf, X_test_tfidf= train_test_split(brown_sents_tfidf, test_size=0.4)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of paragraphs
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per paragraph
tfidf_bypara = [{} for _ in range(0,n)]
#List of features
terms = vectorizer.get_feature_names()
#for each paragraph, lists the feature words and their tf-idf scores
for i, j in zip(*X_train_tfidf_csr.nonzero()):
    tfidf_bypara[i][terms[j]] = X_train_tfidf_csr[i, j]

#Keep in mind that the log base 2 of 1 is 0, so a tf-idf score of 0 indicates that the word was present once in that sentence.
print('Original sentence:', X_train[5])
print('Tf_idf vector:', tfidf_bypara[5])

TypeError: expected string or bytes-like object

what i tried:
- sentences - cannot use a string pattern on a bytes-like object
- str(sentences) - cannot use a string pattern on a bytes-like object
- cleaned_adventure - cannot use a string pattern on a bytes-like object
- clean_adventure - cannot use a string pattern on a bytes-like object
- raw_adventure - cannot use a string pattern on a bytes-like object
- actually_raw_adventure - Iterable over raw text documents expected, string object received.
- a;/ih/IHAWFE .NAS .Iwhue ;/SDNLV/lans

## modeling section

In [None]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the feature space from 1379 to 130.
svd= TruncatedSVD(130)
#(change this line) lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained=svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of paragraphs our solution considers similar, for the first five identified topics
paras_by_component=pd.DataFrame(X_train_lsa,index=X_train)
for i in range(5):
    print('Component {}:'.format(i))
    print(paras_by_component.loc[:,i].sort_values(ascending=False)[0:5])




In [None]:
similarity = np.asarray(np.asmatrix(X_train_lsa) * np.asmatrix(X_train_lsa).T)
#Only taking the first 10 sentences
sim_matrix=pd.DataFrame(similarity,index=X_train).iloc[0:10,0:10]
#Making a plot
ax = sns.heatmap(sim_matrix,yticklabels=range(10))
plt.show()

#Generating a key for the plot.
print('Key:')
for i in range(10):
    print(i,sim_matrix.index[i])


In [131]:
from sklearn import ensemble
from sklearn.model_selection import train_test_split

rfc = ensemble.RandomForestClassifier()
Y = word_counts['text_source']
X = np.array(word_counts.drop(['text_sentence','text_source'], 1))

X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y,
                                                    test_size=0.4)
train = rfc.fit(X_train, y_train)

print('Training set score:', rfc.score(X_train, y_train))
print('\nTest set score:', rfc.score(X_test, y_test))

Training set score: 0.888

Test set score: 0.29431741000679196


In [132]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
train = lr.fit(X_train, y_train)
print(X_train.shape, y_train.shape)
print('Training set score:', lr.score(X_train, y_train))
print('\nTest set score:', lr.score(X_test, y_test))

(13250, 304) (13250,)
Training set score: 0.3772075471698113

Test set score: 0.34480416572334166


In [None]:
#help(nltk.tokenize)