In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import spacy
import gensim
from gensim import corpora
from gensim.models import LdaModel
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models as gensimvis

In [2]:
data = pd.read_csv("fake_or_real_news_with_splits.csv")

nlp = spacy.load("en_core_web_sm")

In [3]:
def preprocess(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_punct and not token.is_stop]
    return tokens

data['processed_text'] = data['text'].apply(preprocess)

# 创建词典和语料库
dictionary = corpora.Dictionary(data['processed_text'])
corpus = [dictionary.doc2bow(text) for text in data['processed_text']]

# 执行LDA主题建模
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=42)

# 计算主题一致性得分
coherence_model_lda = CoherenceModel(model=lda_model, texts=data['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

# 可视化主题分布
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

Coherence Score: 0.48614968373474865


In [4]:
fake_data = data[data['category'] == 'FAKE']

fake_data['processed_text'] = fake_data['text'].apply(preprocess)

# 创建词典和语料库
dictionary = corpora.Dictionary(fake_data['processed_text'])
corpus = [dictionary.doc2bow(text) for text in fake_data['processed_text']]

# 执行LDA主题建模
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=42)

# 计算主题一致性得分
coherence_model_lda = CoherenceModel(model=lda_model, texts=fake_data['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

# 可视化主题分布
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fake_data['processed_text'] = fake_data['text'].apply(preprocess)


Coherence Score: 0.44401140068104433


In [5]:
real_data = data[data['category'] == 'REAL']

real_data['processed_text'] = real_data['text'].apply(preprocess)

# 创建词典和语料库
dictionary = corpora.Dictionary(real_data['processed_text'])
corpus = [dictionary.doc2bow(text) for text in real_data['processed_text']]

# 执行LDA主题建模
lda_model = LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15, random_state=42)

# 计算主题一致性得分
coherence_model_lda = CoherenceModel(model=lda_model, texts=real_data['processed_text'], dictionary=dictionary, coherence='c_v')
coherence_lda = coherence_model_lda.get_coherence()
print(f'Coherence Score: {coherence_lda}')

# 可视化主题分布
vis_data = gensimvis.prepare(lda_model, corpus, dictionary)
pyLDAvis.display(vis_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data['processed_text'] = real_data['text'].apply(preprocess)


Coherence Score: 0.50858996486739
