![](http://pic1.tsingdataedu.com/ML%E6%A1%88%E4%BE%8Bbanner.jpg)

# 作业：使用LDA模型破解希拉里“邮件门”的秘密
作者：加号、助教-Noah

## 任务描述：
基于课程中的LDA模型，判断希拉里Twitter中的每句话分别属于哪个Topic<br>

1. 构建LDA模型<br>
2. 任务一：读取数据<br>
3. 任务二：数据预处理<br>
4. 任务三：词袋化<br>
5. 任务四：判断主题<br>

## 1. 创建LDA模型

In [1]:
# 请勿修改此单元格
import numpy as np
import pandas as pd
import re


# Load the email dump and keep only the id and body-text columns. The original
# note says the raw data contains many NaN values, so every row with a missing
# value in either column is dropped.
df = pd.read_csv("./input/HillaryEmails.csv")[['Id', 'ExtractedBodyText']].dropna()

### 文本预处理

def clean_email_text(text):
    """Reduce a raw email body to space-separated alphabetic words.

    The text is stripped, in order, of line breaks, URLs, hyphens (replaced
    by spaces so "pre-processing" becomes "pre processing"), slash-separated
    dates, clock times and email addresses. Every remaining character that
    is neither a letter nor a space is then dropped, and single-character
    words left over from that filtering are discarded.

    Args:
        text: Raw message body as a string.

    Returns:
        The cleaned text with words joined by single spaces; an empty string
        when nothing survives the filtering.
    """
    # Line breaks carry no meaning for the topic model.
    text = text.replace('\n', " ")
    # URLs are removed first, while their "-" and "/" characters are still
    # intact. The previous pattern was a JavaScript regex literal ("/.../i")
    # pasted into a Python string: it required a literal leading "/" and a
    # trailing "/i", so ordinary URLs were left in and ended up as fused
    # junk tokens such as "httpexamplecom".
    text = re.sub(r"(?:https?://|www\.)\S+", "", text, flags=re.IGNORECASE)
    # Split hyphenated compounds into separate words.
    text = re.sub(r"-", " ", text)
    # Dates such as 1/2/2015 are noise for the topic model.
    text = re.sub(r"\d+/\d+/\d+", "", text)
    # Clock times such as 10:30.
    text = re.sub(r"[0-2]?[0-9]:[0-6][0-9]", "", text)
    # Email addresses.
    text = re.sub(r"[\w]+@[\.\w]+", "", text)
    # Keep only letters and spaces; digits and any other symbols are dropped.
    pure_text = ''.join(ch for ch in text if ch.isalpha() or ch == ' ')
    # Discard the single-character fragments left after the filtering above.
    return ' '.join(word for word in pure_text.split() if len(word) > 1)

# Clean every email body with clean_email_text.
docs = df['ExtractedBodyText'].apply(clean_email_text)

# Pull the cleaned bodies out of the Series as an array of strings.
doclist = docs.values


### 构建LDA模型

from gensim import corpora, models, similarities
import gensim
# Stop-word list: lowercase English function words and contraction fragments
# (e.g. 'doesn', 'll', 've') that are filtered out of every document during
# the manual tokenization step that follows.
stoplist = ['very', 'ourselves', 'am', 'doesn', 'through', 'me', 'against', 'up', 'just', 'her', 'ours', 
            'couldn', 'because', 'is', 'isn', 'it', 'only', 'in', 'such', 'too', 'mustn', 'under', 'their', 
            'if', 'to', 'my', 'himself', 'after', 'why', 'while', 'can', 'each', 'itself', 'his', 'all', 'once', 
            'herself', 'more', 'our', 'they', 'hasn', 'on', 'ma', 'them', 'its', 'where', 'did', 'll', 'you', 
            'didn', 'nor', 'as', 'now', 'before', 'those', 'yours', 'from', 'who', 'was', 'm', 'been', 'will', 
            'into', 'same', 'how', 'some', 'of', 'out', 'with', 's', 'being', 't', 'mightn', 'she', 'again', 'be', 
            'by', 'shan', 'have', 'yourselves', 'needn', 'and', 'are', 'o', 'these', 'further', 'most', 'yourself', 
            'having', 'aren', 'here', 'he', 'were', 'but', 'this', 'myself', 'own', 'we', 'so', 'i', 'does', 'both', 
            'when', 'between', 'd', 'had', 'the', 'y', 'has', 'down', 'off', 'than', 'haven', 'whom', 'wouldn', 
            'should', 've', 'over', 'themselves', 'few', 'then', 'hadn', 'what', 'until', 'won', 'no', 'about', 
            'any', 'that', 'for', 'shouldn', 'don', 'do', 'there', 'doing', 'an', 'or', 'ain', 'hers', 'wasn', 
            'weren', 'above', 'a', 'at', 'your', 'theirs', 'below', 'other', 'not', 're', 'him', 'during', 'which']

# Manual tokenization: lowercase each document, split it on whitespace and
# drop stop words. Membership is tested against a frozenset built once,
# because `in` on the stoplist list scans the whole list for every token.
_stopword_set = frozenset(stoplist)
texts = [
    [word for word in doc.lower().split() if word not in _stopword_set]
    for doc in doclist
]


### 建立语料库

# Build the token dictionary from the tokenized emails.
dictionary = corpora.Dictionary(texts)
# Convert each tokenized document to its bag-of-words form.
corpus = list(map(dictionary.doc2bow, texts))
# Fit a 20-topic LDA model on the bag-of-words corpus.
lda = gensim.models.ldamodel.LdaModel(
    corpus=corpus,
    id2word=dictionary,
    num_topics=20,
)
# Show all 20 topics with their 5 highest-weighted words.
lda.print_topics(num_topics=20, num_words=5)



[(0,
  '0.007*"percent" + 0.007*"would" + 0.006*"today" + 0.006*"discuss" + 0.005*"heard"'),
 (1,
  '0.008*"state" + 0.007*"secretary" + 0.007*"us" + 0.006*"federal" + 0.004*"department"'),
 (2,
  '0.008*"said" + 0.008*"would" + 0.008*"party" + 0.006*"bill" + 0.004*"could"'),
 (3,
  '0.007*"negotiating" + 0.007*"original" + 0.007*"verveer" + 0.006*"wing" + 0.006*"camps"'),
 (4,
  '0.028*"pm" + 0.008*"huma" + 0.007*"office" + 0.007*"abedin" + 0.006*"secretary"'),
 (5,
  '0.010*"pm" + 0.009*"office" + 0.007*"please" + 0.006*"beck" + 0.005*"cdm"'),
 (6,
  '0.018*"state" + 0.018*"taliban" + 0.012*"secretary" + 0.011*"assistant" + 0.009*"lona"'),
 (7,
  '0.011*"part" + 0.010*"release" + 0.008*"think" + 0.007*"know" + 0.006*"im"'),
 (8,
  '0.022*"bloomberg" + 0.010*"print" + 0.010*"pis" + 0.006*"thanks" + 0.006*"burns"'),
 (9, '0.007*"would" + 0.006*"got" + 0.006*"thx" + 0.006*"done" + 0.006*"get"'),
 (10,
  '0.005*"one" + 0.005*"president" + 0.004*"think" + 0.004*"new" + 0.004*"people"'),
 

## 2. 希拉里Twitter主题分类
### 任务一：读取数据

In [10]:
# 读取数据；将数据的列名定义为：text；并将该列数据存放至名为twitt_docs的变量中






### 任务二：文本预处理

In [1]:
# 利用课上定义的clean_email_text函数，进行数据预处理；处理后的结果命名为：testdata




### 任务三：词袋化(利用email建立的dictionary)

In [13]:
# 进行人工分词，并生成词袋




### 任务四：判断twitter所属的主题

In [None]:
# 判断主题




### 版权归 © 稀牛学院 所有 保留所有权利

![](http://pic1.tsingdataedu.com/%E7%A8%80%E7%89%9B%20x%20%E7%BD%91%E6%98%93.png)