# VectorSpaceModel
作者：周阳

主要内容：

1,数据预处理

2,词袋模型构建（vocb）

3,TF-IDF权值计算

4,永久化储存向量

## 数据预处理

In [1]:
import numpy as np
import os
import nltk ##数据预处理库
import codecs ##只定编码打开
import chardet #检测文件编码格式
import time

### 数据加载

In [2]:
def traverse(f,result):
    
    fs = os.listdir(f)
    for f1 in fs:
        tmp_path = os.path.join(f,f1)
        if not os.path.isdir(tmp_path):
            result.append(tmp_path)
        else:
            print(tmp_path)
            traverse(tmp_path,result)


document_path = "/home/blueberry/data/datamining/20news-18828"
document_paths = []
traverse(document_path,document_paths)

/home/blueberry/data/datamining/20news-18828/misc.forsale
/home/blueberry/data/datamining/20news-18828/rec.sport.hockey
/home/blueberry/data/datamining/20news-18828/talk.politics.mideast
/home/blueberry/data/datamining/20news-18828/sci.space
/home/blueberry/data/datamining/20news-18828/talk.politics.misc
/home/blueberry/data/datamining/20news-18828/sci.med
/home/blueberry/data/datamining/20news-18828/sci.electronics
/home/blueberry/data/datamining/20news-18828/comp.sys.ibm.pc.hardware
/home/blueberry/data/datamining/20news-18828/alt.atheism
/home/blueberry/data/datamining/20news-18828/soc.religion.christian
/home/blueberry/data/datamining/20news-18828/talk.politics.guns
/home/blueberry/data/datamining/20news-18828/rec.autos
/home/blueberry/data/datamining/20news-18828/rec.motorcycles
/home/blueberry/data/datamining/20news-18828/comp.graphics
/home/blueberry/data/datamining/20news-18828/comp.windows.x
/home/blueberry/data/datamining/20news-18828/talk.religion.misc
/home/blueberry/data/d

In [3]:
print('D(文档数):',len(document_paths)/2)

D(文档数): 18828.0


#### 数据操作
1,去停用词

2，去特殊符号

3,词型还原（lemmatization）

4,词干提取（stemming）

In [4]:
from nltk.corpus import stopwords
stopwords.words('english')
sr = stopwords.words('english')
##停用词
symbol = [',','.',':','_','!','?','/','\'','\"','*','>','<','@','~','-','(',')','%','=','\\','^'
          ,'&','|','#','$','0','1','2','3','4','5','6','7','8','9','10','[',']','+'] 
##特殊符号
from nltk.stem import SnowballStemmer as stemmer
st = stemmer("english") 
now = time.time()
for f_path in document_paths:
    if(f_path[-4:]!='_pre'):
        f = open(f_path,'rb')
        data = f.read()
        encode = chardet.detect(data)['encoding']
        ##检测文件的encode类型
        with codecs.open(f_path, encoding=encode) as f:
            write_f = open(f_path+"_pre","w+")
            line = f.read()
            ##去特殊符号
            for s in symbol:
                  line = line.replace(s,'')
            line = line.split()
            ##去停用词,词干提取，小写化单词
            newline = []
            for word in line:
                word = st.stem(word)
                if word not in sr:
                    newline.append(word)
            ##写回        
            write_f.writelines('\n'.join(newline))
            write_f.close()
print('spend:',time.time()-now,'s')

spend: 94.43775844573975 s


## 构建Vocab

* 统计生成vocab
* 从大到小进行排序

In [5]:
vocab = {}
for f_path in document_paths:
    if(f_path[-4:]=='_pre'):
        f = open(f_path,'rb')
        line = f.read()
        line = line.split()
        for word in line:
            if vocab.get(word) == None:
                vocab[word] = 1
            else :
                vocab[word] += 1
v_tuple = sorted(vocab.items(),key = lambda x:x[1],reverse=True) ##从大到小排序
vocab = {}
for t in v_tuple:
    vocab[t[0]] = t[1]

In [6]:
vocab

{b'subject': 20629,
 b'one': 15042,
 b'use': 15007,
 b'write': 14825,
 b'would': 14749,
 b'articl': 11856,
 b'ani': 11506,
 b'like': 10636,
 b'get': 10462,
 b'dont': 9614,
 b'know': 9524,
 b'peopl': 9501,
 b'x': 8481,
 b'think': 8345,
 b'onli': 8111,
 b'time': 7944,
 b'say': 7799,
 b'also': 7213,
 b'go': 7204,
 b'make': 7177,
 b'doe': 6783,
 b'work': 6465,
 b'im': 6246,
 b'want': 5992,
 b'good': 5967,
 b'new': 5878,
 b'year': 5843,
 b'could': 5822,
 b'system': 5770,
 b'well': 5674,
 b'right': 5653,
 b'see': 5597,
 b'need': 5555,
 b'us': 5548,
 b'way': 5505,
 b'even': 5502,
 b'becaus': 5435,
 b'look': 5416,
 b'thing': 5355,
 b'problem': 5271,
 b'may': 5269,
 b'god': 5182,
 b'whi': 5139,
 b'veri': 5125,
 b'file': 4844,
 b'tri': 4794,
 b'much': 4785,
 b'mani': 4776,
 b'first': 4734,
 b'two': 4681,
 b'question': 4538,
 b'take': 4397,
 b'window': 4355,
 b'call': 4341,
 b'believ': 4256,
 b'anyon': 4197,
 b'come': 4195,
 b'point': 4167,
 b'post': 4151,
 b'program': 4067,
 b'run': 4021,
 b'see

## 构建TF-IDF 向量并储存

tf(w,d) = count(w, d) / size(d)

idf = log(n / docs(w, D))

tf-idf = tf × idf

#### 进度条

In [7]:
import sys

class ProgressBar:
    def __init__(self, count = 0, total = 0, width = 30):
        self.count = count
        self.total = total
        self.width = width
        print('-------------------------------Progressing----------------------------------')
    def move(self):
        self.count += 1
    def log(self):
        sys.stdout.write(' ' * (self.width + 9) + '\r')
        sys.stdout.flush()
        progress = self.width * self.count / self.total
        sys.stdout.write('{0:3}/{1:3} with {2} finished: '.format(self.count, self.total ,str(self.count/self.total*100)[:4]+'%'))
        sys.stdout.write('#' * int(progress) + '-' * int(self.width - progress) + '\r')
        if progress == self.width:
            sys.stdout.write('\n')
        sys.stdout.flush()

In [8]:
all_word_num = 0
for key in vocab:
    all_word_num+=vocab[key]
print('wordnum',all_word_num)
print('vocabsize',len(vocab))

wordnum 2756184
vocabsize 133015


In [9]:
import math
now = time.time()
keys = list(vocab.keys())
bar = ProgressBar(total = 18828)
d_vector = np.zeros(len(vocab))##出现的文档
for f_path in document_paths:
    if(f_path[-4:]=='_pre'):
        f = open(f_path,'rb')
        line = f.read()
        line = set(line.split())
     
        for word in line:
            d_vector[keys.index(word)] += 1
        
        bar.move()##进度条
        if bar.count%100 == 0:
            bar.log()
        f.close()
####计算出现向量
print()
print(d_vector)
print('spend:',time.time()-now,'s')

-------------------------------Progressing----------------------------------
18800/18828 with 99.8% finished: #############################
[  1.88280000e+04   7.29000000e+03   6.44500000e+03 ...,   1.00000000e+00
   1.00000000e+00   1.00000000e+00]
spend: 214.575519323349 s


In [10]:
##存的文件位置，超参数！
now = time.time()
###计算逆文档词频
bar = ProgressBar(total = len(vocab))
for key in vocab:
    idf_vetor = np.log2(int(18828)/(d_vector + 1))####这里使用了平滑技术
    bar.move()##进度条
    if bar.count%100 == 0:
        bar.log()
print()
print('spend:',time.time()-now,'s')

-------------------------------Progressing----------------------------------
133000/133015 with 99.9% finished: #############################
spend: 359.9720175266266 s


In [15]:
vec_save_path = '/media/blueberry/新加卷/dataset/datamining/tf_idf_vector.bin'
file_paths_file = '/media/blueberry/新加卷/dataset/datamining/file_paths'

now = time.time()
bar = ProgressBar(total = 18828)

paths_f = open(file_paths_file,'w+')
vec_save_file = open(vec_save_path,'wb')##使用二进制形式打开，使用二进制形式保存vec，注意取得时候顺序存放，顺序取出
pre_paths = []



###计算文档词频 与 tf-idf 权重
for f_path in document_paths:
    if(f_path[-4:]=='_pre'):
        pre_paths.append(f_path)
        f = open(f_path,'rb')
        tf_vector = np.zeros(len(vocab))##词频
        line = f.read()
        line = line.split()
        keys = list(vocab.keys())
        for word in line:
            tf_vector[keys.index(word)]+=1
        tf_vector /= len(line)
        tf_idf_vector = tf_vector * idf_vetor
        np.save(vec_save_file,tf_idf_vector) ###顺序存入bin文件
        bar.move()##进度条
        if bar.count%100 == 0:
            bar.log()
        f.close()
paths_f.writelines('\n'.join(pre_paths))
paths_f.close()
vec_save_file.close()
print()
print('2, spend:',time.time()-now,'s')

-------------------------------Progressing----------------------------------
18800/18828 with 99.8% finished: #############################
2, spend: 601.7968020439148 s


In [16]:
f = open(vec_save_path,'rb')
a = np.load(f)

array([ -1.74143047e-06,   0.00000000e+00,   0.00000000e+00, ...,
         0.00000000e+00,   0.00000000e+00,   0.00000000e+00])