**Copy the project folder to your Google Drive, then run the cell below to mount the drive.**

In [1]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# Make the Drive root the working directory; the data and model paths used
# later in this notebook are written relative to it.
%cd /content/drive/My\ Drive
# List the Drive root as a quick check that the project folder is present.
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/My Drive
'API Document.gdoc'  'Deep Learning'   new_assessment	    Template
 Autoencoder.ipynb    GAN	       NLP_classification   Training
'Colab Notebooks'     models	       Res40000_157	    Untitled
 data		      myToolKit        Res40000_5	    无标题文档.gdoc


In [0]:
import pandas as pd
import re
import jieba
import numpy as np
from gensim.models import Word2Vec

**Data path**

In [0]:
# All paths are relative to the current working directory (the Drive root
# after the `%cd` in the mount cell).
# CSV with the complaint records; it is read with GB18030 encoding.
data_path = 'NLP_classification/data/all_valid_data.csv'
# Text file parsed as one "<name>,<integer>" pair per line (GB18030 encoded).
labels_path = 'NLP_classification/data/all_labels.txt'
# UTF-8 stop-word list, one entry per line.
stop_words = 'NLP_classification/data/baidu+chuanda.txt'

**Load the label dictionary and the stopword list**

In [0]:
# Build the lookup {first field -> integer label} from the label file.
# Each non-blank line is expected to look like "<name>,<integer>[,...]".
labels = {}
# `with` guarantees the handle is closed even if a line fails to parse
# (the previous version left the file open for the life of the kernel).
with open(labels_path, 'r', encoding='gb18030') as labels_file:
    for line in labels_file:
        # Skip blank lines (e.g. a trailing empty line) instead of crashing.
        if not line.strip():
            continue
        fields = line.split(',')
        # int() ignores surrounding whitespace, so the trailing newline is fine.
        labels[fields[0]] = int(fields[1])

In [0]:
# Load the stop-word list, one entry per line.
# - `with` closes the file handle (it was previously leaked).
# - rstrip('\n') removes only the line terminator; the former `line[0:-1]`
#   also cut the last real character off a final line without a newline.
with open(stop_words, 'r', encoding='utf-8') as stopword_file:
    stopwords = [line.rstrip('\n') for line in stopword_file]


**Split the data into training and test sets**

In [0]:
from sklearn.model_selection import train_test_split

# Read the GB18030-encoded CSV and rename the two Chinese column headers
# to English names in a single chained expression.
dataset = pd.read_csv(data_path, encoding='gb18030').rename(
    columns={'诉求内容': 'Description', '处置单位': 'Department'}
)

# Hold out 20% of the rows as the test set (fixed seed 42).
x_train, x_test, y_train, y_test = train_test_split(
    dataset['Description'],
    dataset['Department'],
    test_size=0.2,
    random_state=42,
)

# Memory is limited, so work on 40,000 training samples first: request a
# "test" split of exactly 40000 rows from the training set and discard the
# remainder (fixed seed 24).
_, x_40000, _, y_40000 = train_test_split(
    x_train,
    y_train,
    test_size=40000,
    random_state=24,
)


**Load Word2Vec model**

In [0]:
# Load the previously trained Word2Vec model from Drive; `map1` below looks up
# word vectors in it. (The file name suggests a CBOW model - not verified here.)
w2v_model = Word2Vec.load('NLP_classification/models/CBOW.model')

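**Take the first 100 usable words of each description (stop words and out-of-vocabulary words are skipped) and stack their 100-dimensional word vectors into a 100×100 feature matrix, zero-padded when the description is shorter**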

In [0]:
def map1(data, max_words=100):
    """Turn one description string into a fixed-size matrix of word vectors.

    The text is stripped of the boilerplate openers '市民来电咨询' and
    '市民来电反映', of punctuation and of ASCII letters/digits, then segmented
    with jieba. Tokens that are stop words or that are missing from the
    Word2Vec vocabulary are skipped. The vectors of the first `max_words`
    remaining tokens are stacked row by row; unused rows stay zero.

    Args:
        data: The description text (a ``str``).
        max_words: Maximum number of word vectors to keep (rows of the
            result). Defaults to 100, matching the original 100x100 output
            for a 100-dimensional embedding.

    Returns:
        A float64 ``numpy.ndarray`` of shape
        ``(max_words, w2v_model.wv.vector_size)``.

    Raises:
        TypeError: If ``data`` is not a string (raised by ``re.sub``).
    """
    # Look vectors up through `.wv`: indexing the model object directly is
    # deprecated in gensim 3.x and removed in gensim 4.
    keyed_vectors = w2v_model.wv
    # Pre-allocated and zero-filled, so padding is implicit and no array is
    # re-copied on every word (np.append in a loop is quadratic).
    features = np.zeros((max_words, keyed_vectors.vector_size))

    data = re.sub('市民来电咨询', '', data)
    data = re.sub('市民来电反映', '', data)
    # Same pattern as before, now a raw string so `\s`, `\.`, ... are no longer
    # invalid string escapes.
    data = re.sub(
        r"[\s+\.\!\/_,$%^*(+\"\']+|[a-zA-Z0-9+——！，。？、~@#￥%……&*（）《》：:]+",
        "",
        data,
    )

    filled = 0
    for word in jieba.cut(data):
        if filled == max_words:
            break
        if word in stopwords:
            continue
        try:
            features[filled] = keyed_vectors[word]
        except KeyError:
            # Out-of-vocabulary word: skip it. Only KeyError is caught so that
            # real failures (e.g. an API mismatch) are not silently turned
            # into all-zero features.
            continue
        filled += 1

    return features


**This step takes about 2 minutes**

In [9]:
# Replace every description string with its word-vector matrix from `map1`.
# NOTE: this overwrites x_40000 in place, so the cell is not idempotent -
# re-running it would pass arrays (not strings) to `map1`. Re-run the split
# cell first if this needs to be executed again.
x_40000 = x_40000.map(map1)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.695 seconds.
Prefix dict has been built succesfully.
  if sys.path[0] == '':


In [10]:
# Preview the first rows; each entry is now a nested array of floats.
x_40000.head()

346879    [[-3.03849458694458, -5.231710910797119, 1.353...
451239    [[1.030799388885498, -3.083348035812378, 1.234...
360550    [[-1.8268845081329346, 1.4179006814956665, -3....
296853    [[-2.067160129547119, -1.3425666093826294, 0.9...
530001    [[-0.3960898518562317, 0.5878874659538269, -5....
Name: Description, dtype: object

**Convert each department name to its numerical label (`pd.get_dummies` can be used instead if a one-hot encoding is preferred)**

In [0]:
# Look each department name up in the `labels` dict; a name missing from the
# dict raises KeyError. NOTE: this overwrites y_40000 in place, so re-running
# the cell would try to look up integers and fail.
y_40000 = y_40000.map(lambda x:labels[x])
  

In [12]:
# Preview the first rows of the integer-encoded labels.
y_40000.head()

346879    113
451239     71
360550     53
296853     21
530001      8
Name: Department, dtype: int64