# VectorSpaceModel
作者：周阳

主要内容：

1,数据预处理

2,词袋模型构建（vocb）

3,TF-IDF权值计算

4,永久化储存向量

## 数据预处理

In [1]:
import numpy as np
import os
import nltk ##数据预处理库
import codecs ##只定编码打开
import chardet #检测文件编码格式
import time

### 数据加载

In [2]:
def traverse(f,result):
    
    fs = os.listdir(f)
    for f1 in fs:
        tmp_path = os.path.join(f,f1)
        if not os.path.isdir(tmp_path):
            result.append(tmp_path)
        else:
            print(tmp_path)
            traverse(tmp_path,result)


document_path = "/home/blueberry/data/datamining/20news-18828"
document_paths = []
traverse(document_path,document_paths)

/home/blueberry/data/datamining/20news-18828/misc.forsale
/home/blueberry/data/datamining/20news-18828/rec.sport.hockey
/home/blueberry/data/datamining/20news-18828/talk.politics.mideast
/home/blueberry/data/datamining/20news-18828/sci.space
/home/blueberry/data/datamining/20news-18828/talk.politics.misc
/home/blueberry/data/datamining/20news-18828/sci.med
/home/blueberry/data/datamining/20news-18828/sci.electronics
/home/blueberry/data/datamining/20news-18828/comp.sys.ibm.pc.hardware
/home/blueberry/data/datamining/20news-18828/alt.atheism
/home/blueberry/data/datamining/20news-18828/soc.religion.christian
/home/blueberry/data/datamining/20news-18828/talk.politics.guns
/home/blueberry/data/datamining/20news-18828/rec.autos
/home/blueberry/data/datamining/20news-18828/rec.motorcycles
/home/blueberry/data/datamining/20news-18828/comp.graphics
/home/blueberry/data/datamining/20news-18828/comp.windows.x
/home/blueberry/data/datamining/20news-18828/talk.religion.misc
/home/blueberry/data/d

In [70]:
print('D(文档数):',len(document_paths)/2)

D(文档数): 18828.0


#### 数据操作
1,去停用词

2，去特殊符号

3,词型还原（lemmatization）

4,词干提取（stemming）

In [79]:
from nltk.corpus import stopwords
stopwords.words('english')
sr = stopwords.words('english')
##停用词
symbol = [',','.',':','_','!','?','/','\'','\"','*','>','<','@','~','-','(',')','%','=','\\','^'
          ,'&','|','#','$','0','1','2','3','4','5','6','7','8','9','10','[',']','+'] 
##特殊符号
from nltk.stem import SnowballStemmer as stemmer
st = stemmer("english") 
now = time.time()
for f_path in document_paths:
    if(f_path[-4:]!='_pre'):
        f = open(f_path,'rb')
        data = f.read()
        encode = chardet.detect(data)['encoding']
        ##检测文件的encode类型
        with codecs.open(f_path, encoding=encode) as f:
            write_f = open(f_path+"_pre","w+")
            line = f.read()
            ##去特殊符号
            for s in symbol:
                  line = line.replace(s,'')
            line = line.split()
            ##去停用词,词干提取，小写化单词
            newline = []
            for word in line:
                word = st.stem(word)
                if word not in sr:
                    newline.append(word)
            ##写回        
            write_f.writelines('\n'.join(newline))
            write_f.close()
print('spend:',time.time()-now,'s')

spend: 93.05670356750488 s


## 构建Vocab

In [80]:
vocab = {}
for f_path in document_paths:
    if(f_path[-4:]=='_pre'):
        f = open(f_path,'rb')
        line = f.read()
        line = line.split()
        for word in line:
            if vocab.get(word) == None:
                vocab[word] = 1
            else :
                vocab[word] += 1

In [81]:
vocab

{b'radleygibbsoituncedu': 10,
 b'jongsma': 5,
 b'svenbeowulfjplnasagov': 1,
 b'familiy': 1,
 b'sasuxacsouiucedu': 1,
 b'considerest': 1,
 b'keyn': 10,
 b'eso': 2,
 b'carlino': 1,
 b'giboosh': 1,
 b'misha': 5,
 b'socioeconom': 12,
 b'eotvo': 1,
 b'noaa': 9,
 b'telephon': 339,
 b'takeoff': 11,
 b'cxocwapollohpcom': 1,
 b'derrel': 4,
 b'mxlfmxkhsxmdbgaxtpgi': 1,
 b'freund': 4,
 b'rayolecdaccom': 6,
 b'mj`szz;obhdfkstkotpjihshdjgk`': 1,
 b'diack': 14,
 b'sazhookcorpmotcom': 6,
 b'loosest': 2,
 b'satelit': 5,
 b'mcucggcubaxxcxuacccc': 1,
 b'topexposeidon': 7,
 b'hyperthyroid': 3,
 b'gxp': 1,
 b'wordprocessor': 20,
 b'upbring': 13,
 b'mhfahdblbjatvihxyzx': 1,
 b'iv;pp': 1,
 b'mqkumccumichedu': 1,
 b'nationalreligi': 1,
 b'ages;': 1,
 b'joyous': 4,
 b'oppress': 111,
 b'disorien': 1,
 b'brianlplarizonaedu': 48,
 b'prest': 1,
 b'bangor': 1,
 b'sysquest': 3,
 b'disclaimertypethingi': 2,
 b'kaldis;': 2,
 b'mdanjougelulavalca': 1,
 b'nhfozsrgm;lryxrorrz': 1,
 b'motifolit': 1,
 b'mbsutrcresutccom':

## 构建TF-IDF 向量并储存

tf(w,d) = count(w, d) / size(d)

idf = log(n / docs(w, D))

tf-idf = tf × idf

#### 进度条

In [180]:
class ProgressBar:
    def __init__(self, count = 0, total = 0, width = 50):
        self.count = count
        self.total = total
        self.width = width
        print('----------------------------------------------Progressing-------------------------------------------')
    def move(self):
        self.count += 1
    def log(self):
        sys.stdout.write(' ' * (self.width + 9) + '\r')
        sys.stdout.flush()
        progress = self.width * self.count / self.total
        sys.stdout.write('{0:3}/{1:3} with {2} finished: '.format(self.count, self.total ,str(self.count/self.total*100)[:4]+'%'))
        sys.stdout.write('#' * int(progress) + '-' * int(self.width - progress) + '\r')
        if progress == self.width:
            sys.stdout.write('\n')
        sys.stdout.flush()

In [181]:
all_word_num = 0
for key in vocab:
    all_word_num+=vocab[key]
print('wordnum',all_word_num)
print('vocabsize',len(vocab))

wordnum 2756174
vocabsize 133017


In [184]:
import math
now = time.time()
keys = list(vocab.keys())
bar = ProgressBar(total = 18828)
d_vector = np.zeros(len(vocab))##出现的文档
for f_path in document_paths:
    if(f_path[-4:]=='_pre'):
        f = open(f_path,'rb')
        line = f.read()
        line = set(line.split())
     
        for word in line:
            d_vector[keys.index(word)] += 1
        bar.move()##进度条
        bar.log()
        f.close()

print(d_vector)
print('spend:',time.time()-now,'s')

----------------------------------------------Progressing-------------------------------------------
18828/18828.0 with 100.% finished: #####################################################################################-------------#################################################
[4. 3. 1. ... 1. 2. 1.]
spend: 2531.474944829941 s


In [188]:
vec_save_file = '/home/blueberry/data/datamining/vec/tf_idf_vector.npy'
file_paths_file = '/home/blueberry/data/datamining/vec/file_paths'
##存的文件位置，超参数！
now = time.time()
for key in vocab:
    idf_vetor = np.log2(int(18828)/d_vector + 1)
print('1, spend:',time.time()-now,'s')
now = time.time()
bar = ProgressBar(total = 18828)

paths_f = open(file_paths_file,'w+')
vector_list = []

for f_path in document_paths:
    if(f_path[-4:]=='_pre'):
        f = open(f_path,'rb')
        saved_document
        tf_vector = np.zeros(len(vocab))##词频
        line = f.read()
        line = line.split()
        keys = list(vocab.keys())
        for word in line:
            tf_vector[keys.index(word)]+=1
        tf_vector /= len(line)
        tf_idf_vector = tf_vector * idf_vetor
        vector_list.append(tf_idf_vector)
        paths_f.writeline(f_path)
        bar.move()##进度条
        bar.log()
        f.close()
np.save(vec_save_file,vector_list)##把所有向量储存在npy文件中
print('2, spend:',time.time()-now,'s')

1, spend: 340.6000633239746 s
----------------------------------------------Progressing-------------------------------------------
18828/18828.0 with 100.% finished: ##################################################
2, spend: 3814.4731221199036 s
