导入所需要的包

In [2]:
import numpy as np
import pandas as pd
import re

这里使用希拉里的邮件作为语料库

In [4]:
df = pd.read_csv('input/HillaryEmails.csv')
df = df[['Id','ExtractedBodyText']].dropna() #去掉空格
df

Unnamed: 0,Id,ExtractedBodyText
1,2,"B6\nThursday, March 3, 2011 9:45 PM\nH: Latest..."
2,3,Thx
4,5,"H <hrod17@clintonemail.com>\nFriday, March 11,..."
5,6,Pis print.\n-•-...-^\nH < hrod17@clintonernail...
7,8,"H <hrod17@clintonemail.corn>\nFriday, March 11..."
8,9,FYI
9,10,"B6\nWednesday, September 12, 2012 6:16 PM\nFwd..."
10,11,Fyi\nB6\n— —
11,12,"B6\nWednesday, September 12, 2012 6:16 PM\nFwd..."
12,13,Fyi


使用正则表达式对文本进行预处理

In [5]:
def clean_email_text(text):
    text = text.replace('\n'," ") #新行，我们是不需要的
    text = re.sub(r"-", " ", text) #把 "-" 的两个单词，分开。（比如：july-edu ==> july edu）
    text = re.sub(r"\d+/\d+/\d+", "", text) #日期，对主体模型没什么意义
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text) #时间，没意义
    text = re.sub(r"[\w]+@[\.\w]+", "", text) #邮件地址，没意义
    text = re.sub(r"/[a-zA-Z]*[:\//\]*[A-Za-z0-9\-_]+\.+[A-Za-z0-9\.\/%&=\?\-_]+/i", "", text) #网址，没意义
    pure_text = ''
    # 以防还有其他特殊字符（数字）等等，我们直接把他们loop一遍，过滤掉
    for letter in text:
        # 只留下字母和空格
        if letter.isalpha() or letter==' ':
            pure_text += letter
    # 再把那些去除特殊字符后落单的单词，直接排除。
    # 我们就只剩下有意义的单词了。
    text = ' '.join(word for word in pure_text.split() if len(word)>1)
    return text

In [6]:
docs = df['ExtractedBodyText']
docs = docs.apply(lambda s: clean_email_text(s))  
docs.head(1).values

array(['Thursday March PM Latest How Syria is aiding Qaddafi and more Sid hrc memo syria aiding libya docx hrc memo syria aiding libya docx March For Hillary'],
      dtype=object)

In [7]:
doclist = docs.values

使用Gensim进行模型构建

In [8]:
from gensim import corpora, models, similarities
import gensim
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']



In [9]:
texts = [[word for word in doc.lower().split() if word not in stoplist] for doc in doclist]
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]#建立语料库

LdaModel的参数解释：
corpus:文本，已经表示成词袋了。 
num_topics: 提取的主题数 
id2word:词典 
passes:类似于在机器学习中常见的epoch，也就是训练了多少轮。

In [11]:
lda = gensim.models.ldamodel.LdaModel(corpus=corpus, id2word=dictionary, num_topics=20,passes = 10)#建立模型

打印出所有主题，每个主题包含五个占比最高的单词

In [12]:
lda.print_topics(num_topics=20, num_words=5)

[(0,
  '0.023*"wjc" + 0.017*"sounds" + 0.017*"negotiating" + 0.016*"karzai" + 0.011*"hrc"'),
 (1,
  '0.033*"nuclear" + 0.027*"iran" + 0.020*"iranian" + 0.014*"border" + 0.014*"north"'),
 (2,
  '0.011*"vote" + 0.009*"northern" + 0.008*"uup" + 0.007*"assembly" + 0.006*"woodward"'),
 (3,
  '0.031*"pm" + 0.021*"huma" + 0.020*"sullivan" + 0.020*"abedin" + 0.015*"percent"'),
 (4,
  '0.011*"calling" + 0.010*"wilders" + 0.008*"readout" + 0.008*"got" + 0.007*"kyl"'),
 (5,
  '0.010*"think" + 0.010*"one" + 0.010*"get" + 0.010*"would" + 0.009*"like"'),
 (6,
  '0.053*"state" + 0.031*"doc" + 0.026*"benghazi" + 0.025*"subject" + 0.024*"date"'),
 (7,
  '0.008*"obama" + 0.007*"republican" + 0.006*"would" + 0.006*"said" + 0.006*"party"'),
 (8,
  '0.017*"party" + 0.016*"said" + 0.014*"mr" + 0.010*"mcchrystal" + 0.010*"last"'),
 (9,
  '0.045*"call" + 0.019*"pls" + 0.015*"print" + 0.015*"tomorrow" + 0.013*"talk"'),
 (10,
  '0.010*"please" + 0.008*"marie" + 0.008*"anne" + 0.007*"office" + 0.007*"kabul"'),
 

对新文本找出主题分布

In [24]:
data = []
for line in open("Input/post.txt","r"): #设置文件对象并读取每一行文件
    line = line.strip('\n')  #去掉空行
    line = line.split(' ')
    data.append(line)               #将每一行文件加入到list中
data

[['To',
  'all',
  'the',
  'little',
  'girls',
  'watching...never',
  'doubt',
  'that',
  'you',
  'are',
  'valuable',
  'and',
  'powerful',
  '&',
  'deserving',
  'of',
  'every',
  'chance',
  '&',
  'opportunity',
  'in',
  'the',
  'world.'],
 [''],
 ['I',
  'was',
  'greeted',
  'by',
  'this',
  'heartwarming',
  'display',
  'on',
  'the',
  'corner',
  'of',
  'my',
  'street',
  'today.',
  'Thank',
  'you',
  'to',
  'all',
  'of',
  'you',
  'who',
  'did',
  'this.',
  'Happy',
  'Thanksgiving.',
  '-H'],
 [''],
 ['Hoping',
  'everyone',
  'has',
  'a',
  'safe',
  '&',
  'Happy',
  'Thanksgiving',
  'today,',
  '&',
  'quality',
  'time',
  'with',
  'family',
  '&',
  'friends.',
  '-H'],
 [''],
 ['Scripture',
  'tells',
  'us:',
  'Let',
  'us',
  'not',
  'grow',
  'weary',
  'in',
  'doing',
  'good,',
  'for',
  'in',
  'due',
  'season,',
  'we',
  'shall',
  'reap,',
  'if',
  'we',
  'do',
  'not',
  'lose',
  'heart.'],
 [''],
 ['Let',
  'us',
  'have',
  '

In [40]:
#对新文本进行主题识别，判断是什么主题，list第一个参数表示的id，第二个表示属于这个id的概率
lda_list=[]
for texts in data:
    doc_bow = dictionary.doc2bow(texts)
    doc_lda = lda[doc_bow]
lda_list.append(doc_lda)
lda_list

[[(5, 0.56937456), (7, 0.28062543)]]