基于train+test数据集的20万个用户的搜索历史训练word2vec模型，生成word2vec向量并使用2-fold验证word2vec性能。

    data_preprocess函数: 预处理类。
    word2vec_alogrithm: 主功能类，完成word2vec的训练、生成和验证。

In [1]:
import csv
import time
import numpy as np
import pandas as pd
from gensim.models import word2vec
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.linear_model import SGDClassifier, LogisticRegression
from utils import get_data_from_csv

import warnings
warnings.filterwarnings('ignore')

In [100]:
class Word2vec_algorithm():
    def __init__(self, size=300):
        random_rate = 2019
        self.size = size
        self.LR = LogisticRegression(C=1.0, max_iter=100, class_weight='balanced', random_state=random_rate, n_jobs=-1)
        
    def train(self, filename, save_path='../data/20w_size300_win100.model'):
        """ 训练word2vec模型 """
        sentences = word2vec.LineSentence(filename) # 加载语料，要求语料一行一文本
        
        print('正在训练word2vec词向量，预料为: {}，size为: {}'.format(filename, self.size))
        model = word2vec.Word2Vec(sentences, size=self.size, window=100, workers=48)
        
        print('训练word2vec词向量完毕，已保存在: {}'.format(save_path))
        model.save(save_path)
        
    def load_model_and_transform_sentence_to_avgvec(self, X, model_path='../data/20w_size300_win100.model'):
        """ 
        载入模型，生成word2vec向量 
        :param X: 读入的文档，list
        :return: np.array
        """
        print('载入模型中...')
        model = word2vec.Word2Vec.load(model_path)
        print('载入模型成功!')
        
        res = np.zeros((len(X), self.size))
        print('正在生成word2vec向量...')
        for i, line in enumerate(X):
            terms = line.split()
            count = 0
            for j, term in enumerate(terms):
                try:
                    count += 1
                    res[i] += np.array(model[term])
                except:
                    1 == 1
            if count !=0:
                res[i] = res[i] / float(count) # 求均值
        return res
    
    def fit_and_predict(self, X, Y, T):
        """ 根据X和Y训练模型，对测试集数据T做出预测 """
        print('正在使用Logistic训练模型...')
        self.LR.fit(X, Y)
        result = self.LR.predict(T)
        return result
    
    def validation(self, X, Y, kind=''):
        """ 使用2-fold进行验证 """
        print('validating ', kind)
        fold_n = 2
        skf = StratifiedKFold(n_splits=fold_n)
        score = np.zeros(fold_n)
        for j, (train_idx, test_idx) in enumerate(skf.split(X, Y)):
            print(str(j+1)+'-fold')
            X_train = X[train_idx]
            y_train = Y[train_idx]
            X_test = X[test_idx]
            y_test = Y[test_idx]
            y_pred = self.fit_and_predict(X_train, y_train, X_test)
            cur_score = sum(y_pred==y_test) * 1.0 / len(y_pred)
            score[j] = cur_score
        print(score, score.mean())
        return score.mean()
    

In [6]:
# 去分词后的数据集的前几行进行测试
#sub_train_seg_words_file_path = '../data/sub_train_seg_words.csv'
'''
t = 500 # 去10行数据
with open(sub_train_seg_words_file_path, 'w') as wf:
    with open(train_seg_words_file_path, 'r') as f:
        for line in f:
            line.strip()
            t -= 1
            wf.writelines(line)
            if t == 0: 
                break
        f.close()
    wf.close()
'''

"\nt = 500 # 去10行数据\nwith open(sub_train_seg_words_file_path, 'w') as wf:\n    with open(train_seg_words_file_path, 'r') as f:\n        for line in f:\n            line.strip()\n            t -= 1\n            wf.writelines(line)\n            if t == 0: \n                break\n        f.close()\n    wf.close()\n"

步骤：
- 1.先训练word2vec的model
- 2.然后再生成word2vec向量 
- 3.最后使用2-fold验证效果

目的：
- 生成word2vec向量，提供给下一个步骤：特征融合。

In [7]:
train_seg_words_path = '../data/train_seg_words.csv'
test_seg_words_path = '../data/test_seg_words.csv'
full_seg_words_path = '../data/full_seg_words.csv'

In [101]:
# 主功能类
word2vec_algorithm_obj = Word2vec_algorithm()

In [11]:
# 1.先训练word2vec的model
st_time = time.time()
word2vec_algorithm_obj.train(full_seg_words_path)
print('训练词向量共耗时: {}'.format(time.time()-st_time))

正在训练word2vec词向量，预料为: ../data/full_seg_words.csv，size为: 300
训练word2vec词向量完毕，已保存在: ../data/20w_size300_win100.model
训练词向量共耗时: 2709.0512042045593


In [14]:
# 2.利用model分别得到训练集和测试集的word2vec向量，使用求和平均
train_seg_words_data = get_data_from_csv(train_seg_words_path)
test_seg_words_data = get_data_from_csv(test_seg_words_path)

error:  [] 32620
error:  [] 87647
共读取了: 99998行
error:  [] 17478
error:  [] 19941
共读取了: 99998行


In [None]:
train_avgvec = word2vec_algorithm_obj.load_model_and_transform_sentence_to_avgvec(train_seg_words_data)
test_avgvec = word2vec_algorithm_obj.load_model_and_transform_sentence_to_avgvec(test_seg_words_data)

In [27]:
np.save('../data/wv300_win100.train.npy', train_avgvec)
np.save('../data/wv300_win100.test.npy', test_avgvec)

In [38]:
# 3.测试wv向量，使用word2vec向量进行比赛，目的找到最好的word2vec参数
# 载入label文件，获取标签数据
gender_label_path = '../data/train_gender.csv'
age_label_path = '../data/train_age.csv'
education_label_path = '../data/train_education.csv'

gender_label = np.loadtxt(open(gender_label_path, 'r')).astype(int)
age_label = np.loadtxt(open(age_label_path, 'r')).astype(int)
education_label = np.loadtxt(open(education_label_path, 'r')).astype(int)

In [55]:
from utils import remove_zero

# 缺失值处理
wv_gender_x, gender_label = remove_zero(train_avgvec, gender_label)
wv_age_x, age_label = remove_zero(train_avgvec, age_label)
wv_education_x, education_label = remove_zero(train_avgvec, education_label)

In [102]:
# 使用2-fold进行验证，获取验证集的平均准确率，然后统计三个结果的加和平均准确率
res1 = word2vec_algorithm_obj.validation(wv_gender_x, gender_label, kind='gender')
res2 = word2vec_algorithm_obj.validation(wv_age_x, age_label, kind='age')
res3 = word2vec_algorithm_obj.validation(wv_education_x, education_label, kind='education')

print('平均准确度是: ',(res1+res2+res3)/3.0)

validating  gender
1-fold
正在使用Logistic训练模型...
2-fold
正在使用Logistic训练模型...
[0.82243526 0.82639712] 0.8244161886752852
validating  age
1-fold
正在使用Logistic训练模型...
2-fold
正在使用Logistic训练模型...
[0.55013423 0.55446854] 0.5523013844072397
validating  education
1-fold
正在使用Logistic训练模型...
2-fold
正在使用Logistic训练模型...
[0.54809197 0.5531868 ] 0.5506393859658881
平均准确度是:  0.6424523196828044
