### Topic modelling on my data

#### load and preprocess the data

In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
import time
from tqdm.auto import tqdm

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # This is often necessary for the WordNet lemmatizer in newer NLTK versions.



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [19]:
# Test loading WordNet
print(wn.synsets('dog'))  # Should return a list of Synsets for "dog"

# Test loading Stopwords
print(stopwords.words('english')[:10])  # Print first 10 English stopwords

# Test loading Open Multilingual WordNet
#print(wn.__version())  # Check if OMW is loaded by printing the version or similar attribute

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [3]:
# Preprocessing 
def preprocess_text(text):
    try:
        tokenizer = RegexpTokenizer(r'\w+')
        lemmatizer = WordNetLemmatizer()
        tokens = tokenizer.tokenize(text.lower())  # Tokenize and convert to lower case
        cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]
        return " ".join(cleaned_tokens)
    except Exception as e:
        print(f"Error processing text: {text}")
        raise e

Unnamed: 0,Chunked_Content
241858,are wild animals safe to eat raw?
241859,I saw this on eat raw meat channel. I wouldn’t...
241860,You got vaxxed even after listening to aajonus...
241861,Is it possible for a detox to last 4 months or...
241862,What to do with 10L of raw milk that are alrea...


In [9]:
# import data
df = pd.read_csv('../data/clean_data/thread_chunked_conversations.csv')
df.tail()
# Apply preprocessing to your text column (assuming it's named 'Content')
t0=time.time()
df['processed_content'] = df['Chunked_Content'].apply(preprocess_text)
t1=time.time()
print((t1-t0)/60)

KeyboardInterrupt: 

In [None]:
df.to_csv('../data/clean_data/thread_chunked_conversations_processed.csv', index=False)

In [4]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel


In [16]:
# Prepare text for LDA analysis
texts = [text.split() for text in df['processed_content']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, passes=10, alpha='auto', per_word_topics=True)


In [17]:
# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.059*"juice" + 0.046*"water" + 0.040*"use" + 0.016*"celery" + 0.014*"mineral"')
(1, '0.044*"http" + 0.020*"com" + 0.017*"www" + 0.014*"lube" + 0.012*"youtu"')
(2, '0.019*"like" + 0.013*"good" + 0.013*"would" + 0.011*"think" + 0.011*"one"')
(3, '0.078*"raw" + 0.056*"meat" + 0.052*"eat" + 0.030*"diet" + 0.029*"food"')
(4, '0.023*"milk" + 0.015*"cheese" + 0.015*"day" + 0.014*"fat" + 0.012*"egg"')


- (0, '0.029*"one" + 0.024*"yes" + 0.019*"body" + 0.017*"year" + 0.016*"way"')
- (1, '0.036*"water" + 0.028*"1" + 0.028*"formula" + 0.023*"http" + 0.020*"coconut"')
- (2, '0.039*"like" + 0.023*"would" + 0.020*"think" + 0.018*"know" + 0.018*"people"')
- (3, '0.030*"raw" + 0.023*"get" + 0.022*"milk" + 0.020*"eat" + 0.020*"meat"')
- (4, '0.029*"juice" + 0.029*"aajonus" + 0.028*"diet" + 0.025*"primal" + 0.019*"use"')

#### Combined questions and answers and split long chunks

In [5]:
# import data
df = pd.read_csv('../data/clean_data/QandA_split_conversations_final.csv')
df.tail()
# Apply preprocessing to your text column (assuming it's named 'Content')
tqdm.pandas(desc='process text')
t0=time.time()
df['processed_content'] = df['Chunked_Content'].progress_apply(preprocess_text)
t1=time.time()
print((t1-t0)/60)

extract keywords:   0%|          | 0/49006 [00:00<?, ?it/s]

35.45704231262207


In [8]:
# Prepare text for LDA analysis
texts = [text.split() for text in df['processed_content']]
t0=time.time()
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in tqdm(texts)]

# LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, passes=10, alpha='auto', per_word_topics=True)
# Print the topics

topics = lda_model.print_topics(num_words=5)
for topic in tqdm(topics):
    print(topic)
t1=time.time()
total=t1-t0
print(total/60)

  0%|          | 0/49006 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

(0, '0.012*"like" + 0.012*"people" + 0.009*"know" + 0.008*"u" + 0.007*"think"')
(1, '0.044*"raw" + 0.033*"milk" + 0.025*"meat" + 0.022*"cheese" + 0.015*"butter"')
(2, '0.034*"water" + 0.026*"oil" + 0.024*"use" + 0.019*"clay" + 0.015*"coconut"')
(3, '0.060*"http" + 0.033*"com" + 0.026*"chat" + 0.024*"www" + 0.023*"primal"')
(4, '0.011*"eat" + 0.011*"like" + 0.010*"day" + 0.010*"fat" + 0.009*"juice"')
3.3264507253964744


In [7]:
(0, '0.012*"like" + 0.012*"people" + 0.009*"know" + 0.008*"u" + 0.007*"think"')
(1, '0.044*"raw" + 0.033*"milk" + 0.025*"meat" + 0.022*"cheese" + 0.015*"butter"')
(2, '0.034*"water" + 0.026*"oil" + 0.024*"use" + 0.019*"clay" + 0.015*"coconut"')
(3, '0.060*"http" + 0.033*"com" + 0.026*"chat" + 0.024*"www" + 0.023*"primal"')
(4, '0.011*"eat" + 0.011*"like" + 0.010*"day" + 0.010*"fat" + 0.009*"juice"')
0.0003110408782958984

0.0003110408782958984

### Summary
-    Topic 0: General, casual discussions involving opinions, knowledge sharing, and social interactions.
-    Topic 1: Raw food diets, particularly focusing on raw dairy products (milk, cheese, butter) and raw meat.
-    Topic 2: Natural health practices, including the use of water, oils, clay, and other natural products.
-    Topic 3: Online resources and discussions, particularly those related to the Primal Diet, with a focus on sharing links and participating in online          chats.
-    Topic 4: Dietary habits and preferences, with discussions on eating, daily routines, dietary fat, and juice.
These topics suggest that the dataset contains a variety of discussions ranging from casual social interactions to specific dietary and health practices. 
These topics suggest that the dataset contains a variety of discussions ranging from casual social interactions to specific dietary and health practices. 
Depending on your objectives, you could focus on one of these topics or combine them 
to create a comprehensive resource or tool that addresses the broader scope of health and dietary interests within your community.