In [5]:
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import re
import os
import string
import pickle
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Fetch the NLTK data packages backing the imports above: WordNet (for the
# lemmatizer), the stop-word lists, and the Punkt models used by word_tokenize.
# Packages that are already installed are reported as up-to-date.
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to
[nltk_data]     /home/przemyslaw/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/przemyslaw/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/przemyslaw/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [6]:
# Raw posts dump of writers.stackexchange.com, located relative to this notebook.
posts_csv_path = os.path.join('..', 'data', 'writers.stackexchange.com', 'Posts.csv')
posts = pd.read_csv(posts_csv_path)

In [7]:
# Matches one HTML tag. DOTALL lets `.` cross newlines, so a tag whose
# attributes are wrapped over several lines is still removed as a whole.
_HTML_TAG_RE = re.compile(r'<.*?>', re.DOTALL)


def clean_html(text):
    """Remove HTML tags from ``text`` and flatten it to a single line.

    Parameters
    ----------
    text : str
        Raw post body containing HTML markup.

    Returns
    -------
    str
        ``text`` with every ``<...>`` tag removed (including tags that span
        several lines), each newline and tab replaced by a space, and all
        backslashes dropped.
    """
    text = _HTML_TAG_RE.sub('', text)
    return text.replace("\n", " ").replace("\t", " ").replace("\\", "")

def clean_text(text):
    """Turn a raw HTML post body into a lower-cased, space-separated token string.

    The text is stripped of HTML via ``clean_html``, lower-cased and tokenized
    with ``word_tokenize``. Tokens consisting only of punctuation are dropped
    and purely numeric tokens are replaced by the ``<NUM>`` placeholder.

    Parameters
    ----------
    text : str
        Raw post body.

    Returns
    -------
    str
        The remaining tokens joined by single spaces.
    """
    lowered = clean_html(text).lower()

    kept_tokens = []
    for token in word_tokenize(lowered):
        # Skip tokens made up solely of punctuation characters.
        if all(char in string.punctuation for char in token):
            continue
        token = token.strip()
        kept_tokens.append("<NUM>" if token.isdigit() else token)

    return " ".join(kept_tokens)

In [8]:
# Keep only posts that have a body. The explicit copy makes `posts` an
# independent frame, so the column assignments made in later cells operate on
# it unambiguously and cannot raise pandas' SettingWithCopyWarning.
posts = posts[posts.Body.notna()].copy()
texts = [clean_text(body) for body in posts.Body]  # takes a few minutes to complete
texts[:2]

["i 've always wanted to start writing in a totally amateur way but whenever i want to start something i instantly get blocked having a lot of questions and doubts are there some resources on how to start becoming a writer i 'm thinking something with tips and easy exercises to get the ball rolling",
 'what kind of story is better suited for each point of view are there advantages or disadvantages inherent to them for example writing in the first person you are always following a character while in the third person you can jump between story lines']

In [9]:
# Store the HTML-free version of every body back in the frame.
posts["Body"] = posts["Body"].map(clean_html)

In [10]:
# Bag-of-words counts: English stop words are ignored, as are terms found in
# more than 30% or in fewer than 1% of the documents.
cv = CountVectorizer(
    stop_words=list(stopwords.words('english')),
    min_df=0.01,
    max_df=0.3,
)
cv_X = cv.fit_transform(texts)
cv_X

<32130x1318 sparse matrix of type '<class 'numpy.int64'>'
	with 1462719 stored elements in Compressed Sparse Row format>

Jak czytamy w dokumentacji (http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

Convert a collection of text documents to a matrix of token counts
This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.

Przyjmuje ciąg stringów i metodą fit_transform zwraca macierz wystąpień poszczególnych tokenów.

max_df - górny próg częstości dokumentowej: pomijamy wyrazy, które występują w większej części dokumentów niż max_df (tutaj 0.3, czyli w ponad 30% dokumentów)

min_df - dolny próg częstości dokumentowej: pomijamy wyrazy, które występują w mniejszej części dokumentów niż min_df (tutaj 0.01, czyli w mniej niż 1% dokumentów)

stop_words - lista stopwordów, które chcemy ignorować

In [11]:
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
counts = pd.DataFrame({
    'word': feature_names,
    # Sum the sparse matrix column-wise directly; calling toarray() first
    # would materialise the full dense document-term matrix just to add it up.
    'count': np.asarray(cv_X.sum(axis=0)).ravel()
})
counts.nlargest(10, columns='count')

Unnamed: 0,word,count
161,character,20201
774,num,16017
1305,write,14389
123,book,14142
1261,want,13800
674,make,13789
1240,use,13323
827,people,13224
162,characters,12629
924,reader,12404


## LDA
Latent Dirichlet Allocation with online variational Bayes algorithm (http://scikit-learn.org/stable/modules/generated/sklearn.decomposition.LatentDirichletAllocation.html)

Metoda ta służy do określania tematów dokumentów tekstowych w nienadzorowany sposób. Określamy, na ile tematów chcemy podzielić nasz zbiór tekstów, a algorytm wykrywa te tematy, podając dla każdego z nich listę pasujących do niego słów. Więcej można poczytać o tej metodzie na Wikipedii: https://en.wikipedia.org/wiki/Latent_Dirichlet_allocation

n_components - liczba tematów, które chcemy przyporządkować danym

max_iter - maksymalna liczba iteracji (żeby się za długo nie liczyło)

learning_method - 'batch' lub 'online' (używane tylko w metodzie fit). Online jest szybsza, batch używa wszystkich danych treningowych w każdej iteracji EM do zaktualizowania components_.

Na obiekcie lda po wywołaniu metody fit będziemy mogli zobaczyć lda.components_, czyli tablicę wymiaru \[n_components, n_features\], gdzie element (i,j) reprezentuje jak często słowo j jest przypisywane do tematu i.

In [12]:
# Fit the topic model on the count matrix; the fixed random_state keeps the
# run repeatable.
n_components = 3
lda = LatentDirichletAllocation(
    n_components=n_components,
    learning_method='online',
    max_iter=5,
    random_state=123,
)
lda.fit(cv_X)

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='online', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=5, mean_change_tol=0.001,
             n_components=3, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=123, topic_word_prior=None,
             total_samples=1000000.0, verbose=0)

In [13]:
def print_top_words(model, feature_names, n_top_words):
    """Print the highest-weighted words of every topic in ``model``.

    Parameters
    ----------
    model : object
        Fitted model exposing ``components_``, an iterable with one weight
        vector per topic.
    feature_names : sequence of str
        Vocabulary; position ``i`` names the word behind weight ``i``.
    n_top_words : int
        Number of words to print per topic.

    Each topic is printed as ``Topic #<idx>: <words>`` followed by an empty
    line, with one more empty line after the last topic.
    """
    for topic_idx, weights in enumerate(model.components_):
        # argsort is ascending, so reverse it and keep the leading entries.
        best_indices = weights.argsort()[::-1][:n_top_words]
        top_words = [feature_names[i] for i in best_indices]
        header = 'Topic #%d: ' % topic_idx
        print(header + ' '.join(top_words) + "\n")
    print()

In [14]:
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
print_top_words(lda, feature_names, 50)

Topic #0: write book work get read people good want time know find make books num think first something need way even writer go much many also well may writers really reading try lot might novel idea could ideas things better best written start take fiction going help say see someone published

Topic #1: character characters reader make way something people think person world could know want even first plot time things scene example good might need point much life readers also different may protagonist main two see get say real feel going really show stories well end tell without another thing hero said

Topic #2: num use name word using text words used example style page also book different sentence information source author may question chapter copyright work technical two language case paragraph need could names title first want number way might version software many etc questions specific pages line document original reference content article




Można poeksperymentować z liczbą topiców. Dla n_components=3 widzimy, że zerowy temat dotyczy głównie ludzi, którzy chcą zająć się pisaniem i publikować swoje opowieści, pierwszy dotyczy bardziej książek i ich treści, natomiast drugi jest bardziej techniczny.

Możemy też podejrzeć posty należące do tych tematów.

In [15]:
# Topic weights of the first ten documents: one row per document, one column
# per topic.
lda_features = lda.transform(cv_X[:10])
lda_features

array([[0.96725055, 0.01685198, 0.01589747],
       [0.02627304, 0.94285227, 0.03087469],
       [0.90816841, 0.04653362, 0.04529797],
       [0.35094893, 0.63207076, 0.01698031],
       [0.93456541, 0.03324022, 0.03219437],
       [0.40615141, 0.04473548, 0.54911311],
       [0.44400595, 0.18601468, 0.36997937],
       [0.13195231, 0.858124  , 0.00992368],
       [0.84231466, 0.08622382, 0.07146152],
       [0.7150574 , 0.14220697, 0.14273563]])

In [16]:
# The dominant topic of a document is the column holding its largest weight.
topics = lda_features.argmax(axis=1)
posts_topics = pd.DataFrame({
    "posts": texts[:10],
    "topics": topics,
})

In [17]:
# Repeat the transform for the whole corpus and pick each document's
# highest-weighted topic.
lda_features = lda.transform(cv_X)
topics = lda_features.argmax(axis=1)

In [18]:
# Add one `topic<k>` weight column per fitted topic. Looping over the columns
# of `lda_features` keeps this cell working for any `n_components`, whereas
# unpacking into exactly three targets fails as soon as that number changes.
for topic_idx in range(lda_features.shape[1]):
    posts["topic%d" % topic_idx] = lda_features[:, topic_idx]
# Index of the highest-weighted topic of each post.
posts["topic"] = topics

In [19]:
# Preview the first rows, which now carry the per-topic weight columns and the
# dominant `topic` column.
posts.head()

Unnamed: 0.1,Unnamed: 0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,...,ParentId,PostTypeId,Score,Tags,Title,ViewCount,topic0,topic1,topic2,topic
0,0,,8.0,I've always wanted to start writing (in a tota...,,6,,2010-11-18T20:40:32.857,17.0,1,...,,1,32,<resources>,What are some online guides for starting writers?,1230.0,0.967251,0.016852,0.015897,0
1,1,16.0,7.0,What kind of story is better suited for each p...,,0,,2010-11-18T20:42:31.513,5.0,2,...,,1,20,<fiction><grammatical-person><third-person>,What is the difference between writing in the ...,8298.0,0.026273,0.942852,0.030875,1
2,2,31.0,5.0,"I finished my novel, and everyone I've talked ...",,1,,2010-11-18T20:43:28.903,10.0,3,...,,1,34,<publishing><novel><agent>,How do I find an agent?,691.0,0.908168,0.046534,0.045298,0
3,3,,7.0,When writing a short-story to highlight a cert...,,0,,2010-11-18T20:43:59.693,4.0,5,...,,1,28,<plot><short-story><planning><brainstorming>,Decide on a theme/overarching meaning before w...,3043.0,0.350949,0.632071,0.01698,1
4,4,85.0,10.0,"I keep hearing about literary fiction, and how...",,1,,2010-11-18T20:45:44.067,5.0,7,...,,1,19,<fiction><genre><categories>,What is Literary Fiction?,769.0,0.934565,0.03324,0.032194,0


## Saving useful variables

In [20]:
lda_path = os.path.join("..", "preprocessed_data", "lda")
# to_csv does not create missing parent directories, so make sure the output
# folder exists (no-op when it is already there).
os.makedirs(lda_path, exist_ok=True)
# NOTE(review): the file name says "scifi" although the posts were loaded from
# writers.stackexchange.com -- confirm the intended name before renaming.
# NOTE(review): the index is written as an extra unnamed column; consider
# index=False if downstream readers do not rely on it -- confirm.
posts.to_csv(os.path.join(lda_path, "posts_topics_scifi.csv"))

In [None]:
# Persist the fitted LDA model together with its vectorizer as a two-element
# list, so both can be restored with a single load.
model_path = os.path.join(lda_path, "lda_3components.pkl")
with open(model_path, "wb") as model_file:
    pickle.dump([lda, cv], model_file)

In [35]:
# Test to see if it works: reload the pickled [lda, cv] pair and print the
# topics again.
# NOTE: pickle.load can execute arbitrary code -- only load files that were
# written by this notebook or come from a trusted source.
with open(model_path, 'rb') as f:
    lda, cv = pickle.load(f)

# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
feature_names = (
    cv.get_feature_names_out()
    if hasattr(cv, "get_feature_names_out")
    else cv.get_feature_names()
)
print_top_words(lda, feature_names, 50)

Topic #0: write book work get read people good want time know find make books num think first something need way even writer go much many also well may writers really reading try lot might novel idea could ideas things better best written start take fiction going help say see someone published

Topic #1: character characters reader make way something people think person world could know want even first plot time things scene example good might need point much life readers also different may protagonist main two see get say real feel going really show stories well end tell without another thing hero said

Topic #2: num use name word using text words used example style page also book different sentence information source author may question chapter copyright work technical two language case paragraph need could names title first want number way might version software many etc questions specific pages line document original reference content article




In [36]:
# Show `posts` again; reloading the pickle above only rebinds `lda` and `cv`.
posts.head()

Unnamed: 0.1,Unnamed: 0,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,Id,...,ParentId,PostTypeId,Score,Tags,Title,ViewCount,topic0,topic1,topic2,topic
0,0,,8.0,I've always wanted to start writing (in a tota...,,6,,2010-11-18T20:40:32.857,17.0,1,...,,1,32,<resources>,What are some online guides for starting writers?,1230.0,0.967251,0.016852,0.015897,0
1,1,16.0,7.0,What kind of story is better suited for each p...,,0,,2010-11-18T20:42:31.513,5.0,2,...,,1,20,<fiction><grammatical-person><third-person>,What is the difference between writing in the ...,8298.0,0.026273,0.942852,0.030875,1
2,2,31.0,5.0,"I finished my novel, and everyone I've talked ...",,1,,2010-11-18T20:43:28.903,10.0,3,...,,1,34,<publishing><novel><agent>,How do I find an agent?,691.0,0.908168,0.046534,0.045298,0
3,3,,7.0,When writing a short-story to highlight a cert...,,0,,2010-11-18T20:43:59.693,4.0,5,...,,1,28,<plot><short-story><planning><brainstorming>,Decide on a theme/overarching meaning before w...,3043.0,0.350949,0.632071,0.01698,1
4,4,85.0,10.0,"I keep hearing about literary fiction, and how...",,1,,2010-11-18T20:45:44.067,5.0,7,...,,1,19,<fiction><genre><categories>,What is Literary Fiction?,769.0,0.934565,0.03324,0.032194,0


In [28]:
# Read the saved topics CSV back in to check the round trip.
# NOTE(review): in the preview of this frame, columns that were integers in
# `posts` (e.g. CommentCount, topic) come back as floats -- verify that the
# file round-trips cleanly.
lda_path = os.path.join("..", "preprocessed_data", "lda")
topics_csv_path = os.path.join(lda_path, "posts_topics_scifi.csv")
new_posts = pd.read_csv(topics_csv_path, engine="python")

In [29]:
# Preview the frame that was read back from the CSV file.
new_posts.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,AcceptedAnswerId,AnswerCount,Body,ClosedDate,CommentCount,CommunityOwnedDate,CreationDate,FavoriteCount,...,ParentId,PostTypeId,Score,Tags,Title,ViewCount,topic0,topic1,topic2,topic
0,0,0.0,,8.0,I've always wanted to start writing (in a tota...,,6.0,,2010-11-18T20:40:32.857,17.0,...,,1.0,32.0,<resources>,What are some online guides for starting writers?,1230.0,0.967251,0.016852,0.015897,0.0
1,1,1.0,16.0,7.0,What kind of story is better suited for each p...,,0.0,,2010-11-18T20:42:31.513,5.0,...,,1.0,20.0,<fiction><grammatical-person><third-person>,What is the difference between writing in the ...,8298.0,0.026273,0.942852,0.030875,1.0
2,2,2.0,31.0,5.0,"I finished my novel, and everyone I've talked ...",,1.0,,2010-11-18T20:43:28.903,10.0,...,,1.0,34.0,<publishing><novel><agent>,How do I find an agent?,691.0,0.908168,0.046534,0.045298,0.0
3,3,3.0,,7.0,When writing a short-story to highlight a cert...,,0.0,,2010-11-18T20:43:59.693,4.0,...,,1.0,28.0,<plot><short-story><planning><brainstorming>,Decide on a theme/overarching meaning before w...,3043.0,0.350949,0.632071,0.01698,1.0
4,4,4.0,85.0,10.0,"I keep hearing about literary fiction, and how...",,1.0,,2010-11-18T20:45:44.067,5.0,...,,1.0,19.0,<fiction><genre><categories>,What is Literary Fiction?,769.0,0.934565,0.03324,0.032194,0.0
