### Topic modelling on my data

#### load and preprocess the data

In [58]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')  # This is often necessary for the WordNet lemmatizer in newer NLTK versions.



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [60]:
# Test loading WordNet
print(wn.synsets('dog'))  # Should return a list of Synsets for "dog"

# Test loading Stopwords
print(stopwords.words('english')[:10])  # Print first 10 English stopwords

# Test loading Open Multilingual WordNet
#print(wn.__version())  # Check if OMW is loaded by printing the version or similar attribute

[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


In [106]:
# Preprocessing 
def preprocess_text(text):
    try:
        tokenizer = RegexpTokenizer(r'\w+')
        lemmatizer = WordNetLemmatizer()
        tokens = tokenizer.tokenize(text.lower())  # Tokenize and convert to lower case
        cleaned_tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stopwords.words('english')]
        return " ".join(cleaned_tokens)
    except Exception as e:
        print(f"Error processing text: {text}")
        raise e

In [107]:
# import data
df = pd.read_csv('../data/clean_data/combo_conversations_latest.csv', parse_dates=['Date'])
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 414128 entries, 0 to 414127
Data columns (total 4 columns):
 #   Column   Non-Null Count   Dtype         
---  ------   --------------   -----         
 0   Date     414128 non-null  datetime64[ns]
 1   Time     414128 non-null  object        
 2   User     414128 non-null  object        
 3   Content  414128 non-null  object        
dtypes: datetime64[ns](1), object(3)
memory usage: 12.6+ MB


In [108]:
# Apply preprocessing to your text column (assuming it's named 'Content')
t0=time.time()
df['processed_content'] = df['Content'].apply(preprocess_text)
t1=time.time()
print((t1-t0)/60)

21.64101659456889


In [31]:
# import data
import time
t0=time.time()
df_sampled = pd.read_csv('../data/clean_data/temporally_sampled_dataset.csv')
df_sampled.info()
# Apply preprocessing to your text column (assuming it's named 'Content')
df_sampled['processed_content'] = df_sampled['Content'].apply(preprocess_text)
t1=time.time()
print((t1-t0)/60)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6800 entries, 0 to 6799
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Date     6800 non-null   object
 1   Time     6800 non-null   object
 2   User     6800 non-null   object
 3   Content  6800 non-null   object
dtypes: object(4)
memory usage: 212.6+ KB
0.9337754845619202


In [29]:
time.time()

1726590228.6581056

In [23]:
from gensim import corpora
from gensim.models.ldamodel import LdaModel


In [24]:
# Prepare text for LDA analysis
texts = [text.split() for text in df['processed_content']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, passes=10, alpha='auto', per_word_topics=True)


In [25]:
# Print the topics
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)

(0, '0.029*"one" + 0.024*"yes" + 0.019*"body" + 0.017*"year" + 0.016*"way"')
(1, '0.036*"water" + 0.028*"1" + 0.028*"formula" + 0.023*"http" + 0.020*"coconut"')
(2, '0.039*"like" + 0.023*"would" + 0.020*"think" + 0.018*"know" + 0.018*"people"')
(3, '0.030*"raw" + 0.023*"get" + 0.022*"milk" + 0.020*"eat" + 0.020*"meat"')
(4, '0.029*"juice" + 0.029*"aajonus" + 0.028*"diet" + 0.025*"primal" + 0.019*"use"')


- (0, '0.029*"one" + 0.024*"yes" + 0.019*"body" + 0.017*"year" + 0.016*"way"')
- (1, '0.036*"water" + 0.028*"1" + 0.028*"formula" + 0.023*"http" + 0.020*"coconut"')
- (2, '0.039*"like" + 0.023*"would" + 0.020*"think" + 0.018*"know" + 0.018*"people"')
- (3, '0.030*"raw" + 0.023*"get" + 0.022*"milk" + 0.020*"eat" + 0.020*"meat"')
- (4, '0.029*"juice" + 0.029*"aajonus" + 0.028*"diet" + 0.025*"primal" + 0.019*"use"')

In [33]:
# Prepare text for LDA analysis
texts = [text.split() for text in df_sampled['processed_content']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, passes=10, alpha='auto', per_word_topics=True)
# Print the topics
t0=time.time()
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)
t1=time.time()
total=t1-t0
print(total/60)

(0, '0.012*"primal" + 0.011*"diet" + 0.009*"aajonus" + 0.008*"water" + 0.007*"still"')
(1, '0.017*"http" + 0.009*"com" + 0.008*"www" + 0.006*"think" + 0.005*"oh"')
(2, '0.036*"raw" + 0.017*"cheese" + 0.013*"food" + 0.012*"juice" + 0.011*"meat"')
(3, '0.018*"like" + 0.012*"good" + 0.012*"milk" + 0.010*"fat" + 0.010*"yes"')
(4, '0.016*"get" + 0.015*"would" + 0.012*"meat" + 0.011*"time" + 0.011*"yeah"')
2.7585029602050782e-05


In [35]:
# (0, '0.012*"primal" + 0.011*"diet" + 0.009*"aajonus" + 0.008*"water" + 0.007*"still"')
# (1, '0.017*"http" + 0.009*"com" + 0.008*"www" + 0.006*"think" + 0.005*"oh"')
# (2, '0.036*"raw" + 0.017*"cheese" + 0.013*"food" + 0.012*"juice" + 0.011*"meat"')
# (3, '0.018*"like" + 0.012*"good" + 0.012*"milk" + 0.010*"fat" + 0.010*"yes"')
# (4, '0.016*"get" + 0.015*"would" + 0.012*"meat" + 0.011*"time" + 0.011*"yeah"')

In [104]:
# sampling in a difference way by getting the latest



In [109]:
# Set the 'Date' column as the index of the DataFrame
df.set_index('Date', inplace=True)
# Sort the DataFrame by the index (Date)
df.sort_index(inplace=True)
df.head()



Unnamed: 0_level_0,Time,User,Content,processed_content
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2019-02-14,08:02,Silmavi,Hello!,hello
2019-02-14,15:40,Silmavi,when you eat it alone?,eat alone
2019-02-14,15:42,Silmavi,Good idea to eat it once wuth honey and once a...,good idea eat wuth honey alone
2019-02-14,15:43,Silmavi,My kids eat it with the meal.They aren't at ho...,kid eat meal home btw meal
2019-02-14,15:43,Silmavi,I will do it during week end,week end


In [110]:
# Define a more specific time range
start_datetime = '2024-06-15'
end_datetime = '2024-09-14'

# Slice the DataFrame for this specific range
sliced_df = df.loc[start_datetime:end_datetime]
sliced_df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 14429 entries, 2024-06-15 to 2024-09-14
Data columns (total 4 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Time               14429 non-null  object
 1   User               14429 non-null  object
 2   Content            14429 non-null  object
 3   processed_content  14429 non-null  object
dtypes: object(4)
memory usage: 563.6+ KB


In [111]:
# Prepare text for LDA analysis
texts = [text.split() for text in sliced_df['processed_content']]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# LDA model
lda_model = LdaModel(corpus=corpus, id2word=dictionary, num_topics=5, random_state=100, update_every=1, passes=10, alpha='auto', per_word_topics=True)
# Print the topics
t0=time.time()
topics = lda_model.print_topics(num_words=5)
for topic in topics:
    print(topic)
t1=time.time()
total=t1-t0
print(total/60)

(0, '0.028*"day" + 0.027*"diet" + 0.023*"primal" + 0.019*"yes" + 0.018*"2"')
(1, '0.017*"good" + 0.015*"like" + 0.011*"aajonus" + 0.010*"time" + 0.010*"even"')
(2, '0.017*"like" + 0.014*"lol" + 0.011*"shit" + 0.010*"look" + 0.009*"old"')
(3, '0.021*"urine" + 0.021*"u" + 0.017*"http" + 0.016*"bath" + 0.013*"hot"')
(4, '0.035*"raw" + 0.025*"cheese" + 0.020*"eat" + 0.015*"know" + 0.015*"milk"')
4.2227904001871747e-05
