In [1]:
import pandas as pd

# Load the two raw corpora. Neither CSV has a header row (header=None), so
# pandas numbers the columns from 0 and uses a default integer index.
porn=pd.read_csv('../data/porn_1to7.csv',header=None,index_col=None)
nomal=pd.read_csv('../data/nomal_1to7.csv',header=None,index_col=None)


In [2]:
# Total number of rows across both classes.
len(nomal)+len(porn)

14425

In [3]:
import numpy as np

# Stack column 0 of both frames into one 1-D array: the `nomal` rows come
# first, followed by the `porn` rows. The label vector must use the same order.
combined = np.concatenate((nomal[0], porn[0]))
combined.shape

(14425,)

In [4]:
# Labels, in the same order as `combined`: nomal (normal) -> 1, porn -> 0.
y = np.concatenate((np.ones(len(nomal), dtype=int), np.zeros(len(porn), dtype=int)))
y.shape

(14425,)

In [5]:
import jieba

# Segment every sentence into words and drop newline characters.
def tokenizer(text):
    """Segment each document with jieba after removing its newlines.

    Args:
        text: iterable of document strings.

    Returns:
        A list with one entry per document; each entry is the list of
        tokens produced by ``jieba.lcut`` for that document.
    """
    segmented = []
    for document in text:
        # Newlines are stripped first so they never show up as tokens.
        flattened = document.replace('\n', '')
        segmented.append(jieba.lcut(flattened))
    return segmented

# NOTE: this rebinds `combined` from raw strings to token lists, so the cell is
# not idempotent -- running it a second time would hand lists (which have no
# .replace) to tokenizer.
combined = tokenizer(combined)

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\84920\AppData\Local\Temp\jieba.cache
Loading model cost 0.709 seconds.
Prefix dict has been built succesfully.


In [6]:
# Sanity check: tokenizer returns one token list per input document.
len(combined)

14425

In [19]:
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence
import multiprocessing

# --- Hyper-parameters shared by the Word2Vec and LSTM stages ---
cpu_count = multiprocessing.cpu_count() # Word2Vec worker count; 4 when originally run (per the author's note)
vocab_dim = 30  # dimensionality of each word vector / embedding row
n_iterations = 10  # ideally more..
n_exposures = 1 # intended minimum word frequency for a word to enter the vocabulary
window_size = 7  # Word2Vec context window
n_epoch = 4  # number of epochs used when fitting the LSTM
input_length = 100  # sequence length declared to the Embedding layer; should equal maxlen
maxlen = 100  # length every index sequence is padded/truncated to

def create_dictionaries(model=None,
                        combined=None):
    ''' Build vocabulary lookups from a trained Word2Vec model and encode a corpus.

    The function does three jobs:
        1- Creates a word -> integer index mapping (indices start at 1)
        2- Creates a word -> vector mapping
        3- Converts every tokenised sentence into a padded index sequence

    Args:
        model: trained gensim Word2Vec model (read via ``model.wv``).
        combined: iterable of sentences, each an iterable of word tokens.

    Returns:
        ``(w2indx, w2vec, combined)`` where ``combined`` is the output of
        ``sequence.pad_sequences`` with the module-level ``maxlen``.
        If either argument is missing, a message is printed and None is returned.
    '''
    if combined is None or model is None:
        print ('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(),
                        allow_update=True)
    # Index 0 is reserved for padding and out-of-vocabulary words, hence k+1.
    w2indx = {v: k+1 for k, v in gensim_dict.items()}  # (id -> word) inverted to (word -> id)
    # Look vectors up through model.wv, consistent with the vocab access above;
    # indexing the model object directly is the deprecated form.
    w2vec = {word: model.wv[word] for word in w2indx}

    # Words become integers; anything the model has never seen maps to 0.
    # dict.get replaces a bare `except:` that would also have hidden real errors.
    data = [[w2indx.get(word, 0) for word in sentence] for sentence in combined]
    combined = sequence.pad_sequences(data, maxlen=maxlen)
    return w2indx, w2vec,combined

In [8]:
# Train Word2Vec on the corpus, then return each word's index, each word's
# vector, and the index sequence for every sentence.
def word2vec_train(combined, epochs=20, model_path='../model/Word2vec_model.pkl'):
    """Train and save a Word2Vec model, then encode the corpus with it.

    Args:
        combined: list of tokenised sentences (list of lists of words).
        epochs: number of passes handed to ``model.train``. Defaults to 20,
            the value that used to be hard-coded here.
        model_path: where the trained model is saved. Defaults to the path
            that used to be hard-coded here.

    Returns:
        ``(index_dict, word_vectors, combined)`` exactly as returned by
        ``create_dictionaries``.

    NOTE(review): ``iter=n_iterations`` is passed to the constructor while
    ``train`` receives an explicit ``epochs``; presumably the explicit value
    wins -- confirm which of the two is intended.
    """
    model = Word2Vec(size=vocab_dim,
                     min_count=n_exposures,  # use the config constant instead of a duplicate literal
                     window=window_size,
                     workers=cpu_count,
                     iter=n_iterations,
                     )
    model.build_vocab(combined) # input: list
    model.train(combined, epochs=epochs, total_examples=model.corpus_count)
    model.save(model_path)
    index_dict, word_vectors, encoded = create_dictionaries(model=model, combined=combined)
    return index_dict, word_vectors, encoded

In [9]:
print ("Training a Word2vec model...")
# NOTE: `combined` is rebound here from token lists to the padded index matrix
# returned by word2vec_train, so re-running this cell trains on the wrong input.
index_dict, word_vectors,combined=word2vec_train(combined)

Training a Word2vec model...




In [10]:
from  sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers.embeddings import Embedding
from keras.layers.recurrent import LSTM
from keras.layers.core import Dense, Dropout,Activation
from keras.models import model_from_yaml
np.random.seed(1337)  # For Reproducibility
import sys
# NOTE(review): the reason for raising the recursion limit is not evident from
# this notebook -- confirm it is still required.
sys.setrecursionlimit(1000000)
import yaml
import keras

# Mini-batch size used by both model.fit and model.evaluate in train_lstm.
batch_size = 32

In [11]:
def get_data(index_dict,word_vectors,combined,y):
    """Build the embedding matrix and split the encoded corpus into train/test.

    Args:
        index_dict: word -> integer index mapping (indices start at 1).
        word_vectors: word -> vector mapping.
        combined: padded index sequences, one row per sentence.
        y: labels aligned with ``combined``.

    Returns:
        ``(n_symbols, embedding_weights, x_train, y_train, x_test, y_test)``.
    """
    # One extra row because index 0 is not assigned to any word.
    n_symbols = 1 + len(index_dict)

    # Row 0 stays all-zero; every other row receives its word's vector.
    embedding_weights = np.zeros((n_symbols, vocab_dim))
    for token in index_dict:
        row = index_dict[token]
        embedding_weights[row, :] = word_vectors[token]

    # Hold out 20% of the samples for evaluation.
    x_train, x_test, labels_train, labels_test = train_test_split(combined, y, test_size=0.2)
    y_train = np.asarray(labels_train)
    y_test = np.asarray(labels_test)
    return n_symbols,embedding_weights,x_train,y_train,x_test,y_test


In [17]:
## Define the network architecture, train it, evaluate it and persist it
def train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test):
    """Train a small Embedding -> LSTM -> sigmoid classifier and save it.

    Args:
        n_symbols: number of rows in the embedding matrix.
        embedding_weights: initial weights for the Embedding layer.
        x_train, y_train: training sequences and labels.
        x_test, y_test: held-out sequences and labels used for evaluation.

    Side effects:
        Writes the architecture to '../model/lstm.yml' and the weights to
        '../model/lstm.h5', and prints the evaluation score.
    """
    print ('Defining a Simple Keras Model...')
    model = Sequential()
    # mask_zero=True makes the recurrent layer skip index 0.
    embedding = Embedding(input_dim=n_symbols,
                          output_dim=vocab_dim,
                          input_length=input_length,
                          weights=[embedding_weights],
                          mask_zero=True)
    model.add(embedding)
    recurrent = LSTM(32)
    model.add(recurrent)
    classifier = Dense(1, activation="sigmoid")
    model.add(classifier)

    print ('Compiling the Model...')
    model.compile(optimizer='rmsprop',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    print ("Train...") # batch_size=32
    model.fit(x_train, y_train,
              epochs=n_epoch,
              batch_size=batch_size,
              verbose=1)

    print ("Evaluate...")
    score = model.evaluate(x_test, y_test, batch_size=batch_size)

    # The architecture YAML is itself dumped as a YAML string scalar;
    # lstm_predict reverses this with a YAML load before model_from_yaml.
    architecture = model.to_yaml()
    serialized = yaml.dump(architecture, default_flow_style=True)
    with open('../model/lstm.yml', 'w') as outfile:
        outfile.write(serialized)
    model.save_weights('../model/lstm.h5')
    print ('Test score:', score)


In [20]:
# Build the embedding matrix and the train/test split, then train the LSTM.
print ('Setting up Arrays for Keras Embedding Layer...')
n_symbols,embedding_weights,x_train,y_train,x_test,y_test=get_data(index_dict, word_vectors,combined,y)
print ("x_train.shape and y_train.shape:")
print (x_train.shape,y_train.shape)
train_lstm(n_symbols,embedding_weights,x_train,y_train,x_test,y_test)

Setting up Arrays for Keras Embedding Layer...
x_train.shape and y_train.shape:
(11540, 100) (11540,)
Defining a Simple Keras Model...
Compiling the Model...
Train...
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4
Evaluate...
Test score: [0.1230595647978824, 0.9653379321098328]


In [14]:
"""
预测
"""
import jieba
import numpy as np
from gensim.models.word2vec import Word2Vec
from gensim.corpora.dictionary import Dictionary
from keras.preprocessing import sequence

import yaml
from keras.models import model_from_yaml
np.random.seed(1337)  # For Reproducibility
import sys
# NOTE(review): the reason for raising the recursion limit is not evident from
# this notebook -- confirm it is still required.
sys.setrecursionlimit(1000000)

# define parameters
# Padding length for inference; should match the maxlen used when training.
maxlen = 100

def create_dictionaries(model=None,
                        combined=None):
    ''' Build vocabulary lookups from a trained Word2Vec model and encode a corpus.

    The function does three jobs:
        1- Creates a word -> integer index mapping (indices start at 1)
        2- Creates a word -> vector mapping
        3- Converts every tokenised sentence into a padded index sequence

    Args:
        model: trained gensim Word2Vec model (read via ``model.wv``).
        combined: iterable of sentences, each an iterable of word tokens.

    Returns:
        ``(w2indx, w2vec, combined)`` where ``combined`` is the output of
        ``sequence.pad_sequences`` with the module-level ``maxlen``.
        If either argument is missing, a message is printed and None is returned.
    '''
    if combined is None or model is None:
        print ('No data provided...')
        return None

    gensim_dict = Dictionary()
    gensim_dict.doc2bow(model.wv.vocab.keys(),
                        allow_update=True)
    # Index 0 is reserved for padding and out-of-vocabulary words, hence k+1.
    w2indx = {v: k+1 for k, v in gensim_dict.items()}  # (id -> word) inverted to (word -> id)
    # Look vectors up through model.wv, consistent with the vocab access above;
    # indexing the model object directly is the deprecated form.
    w2vec = {word: model.wv[word] for word in w2indx}

    # Words become integers; anything the model has never seen maps to 0.
    # dict.get replaces a bare `except:` that would also have hidden real errors.
    data = [[w2indx.get(word, 0) for word in sentence] for sentence in combined]
    combined = sequence.pad_sequences(data, maxlen=maxlen)
    return w2indx, w2vec,combined


def input_transform(string):
    '''Turn one raw input string into a padded index sequence for the LSTM.

    Args:
        string: the text to classify.

    Returns:
        The encoded corpus returned by ``create_dictionaries`` for a batch
        containing this single sentence.
    '''
    # Strip newlines before segmenting, exactly as the training-time tokenizer
    # does, so inference sees the same kind of tokens as training.
    words = jieba.lcut(string.replace('\n', ''))
    model=Word2Vec.load('../model/Word2vec_model.pkl')
    # A batch of one sentence; a plain nested list is enough for
    # create_dictionaries, so no numpy round-trip is needed.
    _,_,combined=create_dictionaries(model, [words])
    return combined


def lstm_predict(string):
    '''Load the saved LSTM and print its classification of ``string``.

    Reads the architecture from '../model/lstm.yml' and the weights from
    '../model/lstm.h5', encodes ``string`` with ``input_transform``, and
    prints the raw prediction followed by the input and its label
    (class 1 -> ' 自然', class 0 -> ' 色情').

    Args:
        string: the text to classify.
    '''
    with open('../model/lstm.yml', 'r') as f:
        # The file holds the architecture as a single YAML string scalar (it
        # was written with yaml.dump on a str), so safe_load is sufficient.
        # Plain yaml.load without a Loader is deprecated and can construct
        # arbitrary objects from untrusted input.
        yaml_string = yaml.safe_load(f)
    model = model_from_yaml(yaml_string)

    model.load_weights('../model/lstm.h5')
    # Compile with the same loss/optimizer the model was trained with
    # (single sigmoid output -> binary cross-entropy).
    model.compile(loss='binary_crossentropy',
                  optimizer='rmsprop',metrics=['accuracy'])
    # input_transform already yields a (1, maxlen) batch; the earlier
    # `data.reshape(1,-1)` discarded its result and has been removed.
    data=input_transform(string)
    result=model.predict_classes(data)
    print (result) # raw prediction as returned by the model

    # Pull out the scalar class id instead of comparing a whole array to an int.
    label = int(np.ravel(result)[0])
    if label==1:
        print (string,' 自然')
    elif label==0:
        print (string,' 色情')
    else:
        print ("??")

In [15]:
# Smoke test: classify a short phrase with the saved model.
string = "干杯"
lstm_predict(string)



[[1]]
干杯  自然


In [18]:
# Smoke test: classify a second phrase with the saved model.
string = "哔哩哔哩动画"
lstm_predict(string)



[[1]]
哔哩哔哩动画  自然
