In [None]:
import numpy as np 
import pandas as pd 

# Plotly imports
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import spacy

import os
# Walk the Kaggle input directory and print the full path of every file found.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# Other imports
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from matplotlib import pyplot as plt
%matplotlib inline

from wordcloud import WordCloud, STOPWORDS

In [None]:
# Load the training set from the competition's zipped file into a DataFrame.
train = pd.read_csv('/kaggle/input/spooky-author-identification/train.zip')

In [None]:
# Preview the first rows of the training frame.
train.head()

There are 3 authors: EAP (Edgar Allan Poe), HPL (H. P. Lovecraft), and MWS (Mary Shelley).

In [None]:
# Print the (rows, columns) dimensions of the training frame.
print(train.shape)

In [None]:
# Full author names keyed by the codes used in the `author` column.
n = {'EAP': 'Edgar Allan Poe', 'HPL': 'HP Lovecraft', 'MWS': 'Mary Shelley'}

# Take both the labels and the bar heights from the same value_counts() result
# so each bar is paired with its own author. unique() returns authors in order
# of first appearance, which need not match the count-sorted order that
# value_counts() uses.
author_counts = train.author.value_counts()

data = [go.Bar(
            x = author_counts.index.map(n).values,
            y = author_counts.values,
            marker = dict(colorscale = 'darkmint',
                         color = author_counts.values
                        ),
            text ='Texts per author'
    )]

layout = go.Layout(
    title = 'Distribution of target variable'
)

fig = go.Figure(data = data, layout = layout)

py.iplot(fig, filename='basic-bar')

In [None]:
# Count every whitespace-separated token across all texts, most frequent first.
words = train['text'].str.split(expand = True).unstack().value_counts()

# The chart skips the two most frequent tokens and shows ranks 3 through 80.
# The colour array must use the same slice as x and y so that every bar gets
# exactly one colour value.
data = [go.Bar(
            x = words.index.values[2:80],
            y = words.values[2:80],
            marker = dict(colorscale = 'darkmint',
                         color = words.values[2:80]
                         ),
            text = 'Word counts'
    )]

layout = go.Layout(
    title='Top 80 Words'
)

fig = go.Figure(data=data, layout = layout)

py.iplot(fig, filename='basic-bar')

WordClouds

In [None]:
# One array of raw sentences per author code.
eap = train.loc[train['author'] == 'EAP', 'text'].values
hpl = train.loc[train['author'] == 'HPL', 'text'].values
mws = train.loc[train['author'] == 'MWS', 'text'].values

In [None]:
# Word cloud built from all HP Lovecraft sentences joined into one string.
plt.figure(figsize = (16, 13))
wc = WordCloud(background_color = 'black', max_words = 10000,
              stopwords=STOPWORDS, max_font_size=40)
wc.generate(' '.join(hpl))
plt.title('HP Lovecraft word cloud', fontsize=20)
plt.imshow(wc.recolor( colormap= 'Pastel2' , random_state=17), alpha=0.98)
# Hide the pixel-coordinate axes, matching the Mary Shelley word cloud cell.
plt.axis('off')

In [None]:
# Word cloud built from all Edgar Allan Poe sentences joined into one string.
plt.figure(figsize = (16, 13))
wc = WordCloud(background_color = 'black', max_words = 10000,
              stopwords=STOPWORDS, max_font_size=40)
wc.generate(' '.join(eap))
# Same title size as the HP Lovecraft word cloud so the three figures match.
plt.title('E A Poe word cloud', fontsize=20)
plt.imshow(wc.recolor( colormap= 'Pastel2' , random_state=17), alpha=0.98)
# Hide the pixel-coordinate axes, matching the Mary Shelley word cloud cell.
plt.axis('off')

In [None]:
# Word cloud built from all Mary Shelley sentences joined into one string.
plt.figure(figsize = (16, 13))
wc = WordCloud(background_color = 'black', max_words = 10000,
              stopwords=STOPWORDS, max_font_size=40)
wc.generate(' '.join(mws))
# Title corrected to the author's actual surname; font size matches the
# HP Lovecraft word cloud.
plt.title('M Shelley word cloud', fontsize=20)
plt.imshow(wc.recolor( colormap= 'Pastel2' , random_state=17), alpha=0.98)
plt.axis('off')

**PREPROCESSING**
* Tokenization
* Stopwords
* Stemming
* Vectorization

In [None]:
import nltk

TOKENIZATION

In [None]:
# Take the first training sentence as a running example for the preprocessing steps.
first_text = train.text.values[0]

In [None]:
# Split the example sentence into tokens with NLTK and report how many there are.
first_text_token = nltk.word_tokenize(first_text)
print(first_text_token)
print('\nNumber of words: {}'.format(len(first_text_token)))

In [None]:
# Load spaCy's 'en_core_web_sm' pipeline and run it over a sample sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp('My new phone is made by Apple. Apple is company from San Francisco')
# Render the processed document with displaCy's 'ent' style.
# NOTE(review): this reaches `spacy.displacy` after a bare `import spacy`;
# confirm the submodule is loaded, or import displacy explicitly.
spacy.displacy.render(doc, style = 'ent')

STOPWORDS REMOVAL

In [None]:
# NLTK's English stop-word list; the cell output is the number of entries.
stopwords = nltk.corpus.stopwords.words('english')
len(stopwords)

In [None]:
# Keep only tokens whose lower-cased form is not in the stop-word list, so the
# comparison is case-insensitive while the kept tokens retain their casing.
first_text_clean = [word for word in first_text_token if word.lower() not in stopwords]
print(first_text_clean)
print('\nNumber of words: {}'.format(len(first_text_clean)))

STEMMING AND LEMMATIZATION


In [None]:
# Porter stemmer instance used in the stemming demo below.
stemmer = nltk.stem.PorterStemmer()

In [None]:
# Show the stems produced for two sample words.
print(stemmer.stem('working'))
print(stemmer.stem('leaves'))

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Lemmatize the same word passed to the stemmer above ('leaves') for comparison.
lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize('leaves'))

**Vectorizing Raw Text**

The Bag of Words

In [None]:
# Two toy sentences for demonstrating the bag-of-words representation.
text_test = ['I like trading stocks', 'I like reading books']
# min_df=1 keeps every term that occurs in at least one document, which is the
# same selection the integer 0 produced. Recent scikit-learn releases validate
# the parameter and reject an integer min_df below 1.
vectorizer = CountVectorizer(min_df=1)
text_transform = vectorizer.fit_transform(text_test)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides
# and display the learned vocabulary as a plain list either way.
if hasattr(vectorizer, 'get_feature_names_out'):
    vocabulary_terms = list(vectorizer.get_feature_names_out())
else:
    vocabulary_terms = vectorizer.get_feature_names()
vocabulary_terms

In [None]:
# Display the transformed toy sentences as a dense array.
text_transform.toarray()

In [None]:
# Display the repr of the object returned by fit_transform.
text_transform

**TOPIC modelling**
* Latent Dirichlet Allocation 
* Non-negative Matrix Factorization

In [None]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in a fitted model.

    Parameters
    ----------
    model : object exposing ``components_``
        ``components_`` is iterated row by row; each row must support
        ``argsort()``.
    feature_names : sequence of str
        Indexed with the positions returned by ``argsort()``.
    n_top_words : int
        Number of words to print for each topic.

    Returns
    -------
    None
        The topics are written to standard output, each followed by a
        separator line of 70 ``=`` characters.
    """
    for index, topic in enumerate(model.components_):
        # argsort() is ascending, so step backwards from the end to take the
        # n_top_words largest weights in descending order.
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        top_words = " ".join(feature_names[i] for i in top_indices)
        # The space after the colon keeps the first word from fusing with the
        # "Topic #N:" header.
        print("\nTopic #{}: {}".format(index, top_words))
        print("=" * 70)

In [None]:
# Shared WordNet lemmatizer applied to every token the vectorizer emits.
lemm = WordNetLemmatizer()


class LemmaCountVectorizer(CountVectorizer):
    """CountVectorizer whose analyzer lemmatizes each token it produces."""

    def build_analyzer(self):
        """Return the parent analyzer wrapped so each token goes through ``lemm.lemmatize``."""
        base_analyzer = super().build_analyzer()

        def lemmatizing_analyzer(doc):
            # Lazily lemmatize every token yielded by the stock analyzer.
            return (lemm.lemmatize(token) for token in base_analyzer(doc))

        return lemmatizing_analyzer

In [None]:
# Collect every training sentence into a plain Python list.
text = train.text.values.tolist()

# Fit the lemmatizing count vectorizer on the whole corpus, using the
# document-frequency bounds and built-in English stop-word list given here.
tf_vectorizer = LemmaCountVectorizer(
    max_df=0.95,
    min_df=2,
    stop_words='english',
    decode_error='ignore',
)
tf = tf_vectorizer.fit_transform(text)

In [None]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    feature_names = tf_vectorizer.get_feature_names()

# Total count of each term over the corpus: column sums of the count matrix,
# flattened to a 1-D array.
count_vec = np.asarray(tf.sum(axis=0)).ravel()
zipped = list(zip(feature_names, count_vec))

# Rank the (term, count) pairs by descending count, then split them back into
# parallel lists: x holds the terms, y the counts.
ranked = sorted(zipped, key=lambda pair: pair[1], reverse=True)
x = [term for term, _ in ranked]
y = [count for _, count in ranked]

# Top 15 and bottom 15 words. [-15:] includes the very last entry, which the
# previous [-16:-1] slice skipped.
Y = np.concatenate([y[0:15], y[-15:]])
X = np.concatenate([x[0:15], x[-15:]])

# Plotly bar chart of the 50 most frequent words
data = [go.Bar(
            x = x[0:50],
            y = y[0:50],
            marker= dict(colorscale='Jet',
                         color = y[0:50]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Top 50 Word frequencies after Preprocessing'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

# Plotly bar chart of the 100 least frequent words
data = [go.Bar(
            x = x[-100:],
            y = y[-100:],
            marker= dict(colorscale='Portland',
                         color = y[-100:]
                        ),
            text='Word counts'
    )]

layout = go.Layout(
    title='Bottom 100 Word frequencies after Preprocessing'
)

fig = go.Figure(data=data, layout=layout)

py.iplot(fig, filename='basic-bar')

In [None]:
# Configure an 11-topic LDA model; the fixed random_state makes repeated runs
# of this notebook use the same seed.
lda = LatentDirichletAllocation(n_components=11, max_iter=5,
                                learning_method = 'online',
                                learning_offset = 50.,
                                random_state = 0)

In [None]:
# Fit the LDA model on the lemmatized term-count matrix.
lda.fit(tf)

In [None]:
# Number of words to list for each topic.
n_top_words = 40
print("\nTopics in LDA model: ")
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(tf_vectorizer, 'get_feature_names_out'):
    tf_feature_names = list(tf_vectorizer.get_feature_names_out())
else:
    tf_feature_names = tf_vectorizer.get_feature_names()
print_top_words(lda, tf_feature_names, n_top_words)

In [None]:
# Pull out the weight rows of the first four fitted topics.
first_topic, second_topic, third_topic, fourth_topic = (
    lda.components_[topic_index] for topic_index in range(4)
)

Word Cloud visualizations of the topics


In [None]:
def _top_topic_words(topic, n_words=50):
    """Return the n_words terms with the largest weights in `topic`, heaviest first."""
    # argsort() is ascending; reverse it and keep the leading n_words positions.
    heaviest_first = topic.argsort()[::-1][:n_words]
    return [tf_feature_names[position] for position in heaviest_first]


first_topic_words = _top_topic_words(first_topic)
second_topic_words = _top_topic_words(second_topic)
third_topic_words = _top_topic_words(third_topic)
fourth_topic_words = _top_topic_words(fourth_topic)

In [None]:
# Word cloud of the first topic's top words, joined into one string.
first_topic_text = " ".join(first_topic_words)
firstcloud = WordCloud(stopwords=STOPWORDS, background_color='black',
                       width=2500, height=1800).generate(first_topic_text)
plt.imshow(firstcloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the second topic's top words, joined into one string.
second_topic_text = " ".join(second_topic_words)
cloud = WordCloud(stopwords=STOPWORDS, background_color='black',
                  width=2500, height=1800).generate(second_topic_text)
plt.imshow(cloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the third topic's top words, joined into one string.
third_topic_text = " ".join(third_topic_words)
cloud = WordCloud(stopwords=STOPWORDS, background_color='black',
                  width=2500, height=1800).generate(third_topic_text)
plt.imshow(cloud)
plt.axis('off')
plt.show()

In [None]:
# Word cloud of the fourth topic's top words, joined into one string.
fourth_topic_text = " ".join(fourth_topic_words)
cloud = WordCloud(stopwords=STOPWORDS, background_color='black',
                  width=2500, height=1800).generate(fourth_topic_text)
plt.imshow(cloud)
plt.axis('off')
plt.show()

Extending the CountVectorizer class with a lemmatizer