In [None]:
import wandb
from tqdm import tqdm
from collections import Counter
import torch
from torch.utils.data import DataLoader
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from pandas import DataFrame
from pprint import pprint
import codecs
import pandas as pd
wandb.login()

In [None]:
# Top-level sweep configuration; 'random' selects the random search method.
sweep_config = dict(method='random')

In [None]:
import math

# Hyperparameter search space for the sweep.
parameters_dict = dict(
    # Discrete choices: one of the listed values is picked per run.
    num_filter=dict(values=[3, 4, 5]),
    optimizer=dict(values=['adam', 'sgd']),
    dropout=dict(values=[0.3, 0.4, 0.5]),
    # Continuous value drawn uniformly from [0, 0.1].
    learning_rate=dict(distribution='uniform', min=0, max=0.1),
    # Bounds are passed as natural logarithms (math.log), so the sampled
    # batch size is meant to lie between 32 and 256, quantised with q=1.
    batch_size=dict(
        distribution='q_log_uniform',
        q=1,
        min=math.log(32),
        max=math.log(256),
    ),
)

# Attach the search space to the sweep configuration (same dict object, not a copy).
sweep_config['parameters'] = parameters_dict

In [None]:
# Fixed (non-searched) parameter: every run trains for a single epoch.
# parameters_dict is the object stored under sweep_config['parameters'],
# so the sweep configuration sees this entry as well.
parameters_dict['epochs'] = {'value': 1}

In [None]:

pprint(sweep_config)

In [None]:
# Register the sweep configuration under the "pytorch-sweeps-demo" project and keep the returned id.
sweep_id=wandb.sweep(sweep_config,project="pytorch-sweeps-demo")
# NOTE(review): wandb.init() is called without arguments and does not use sweep_id —
# confirm that starting a run here (outside a sweep agent) is intended.
wandb.init()

In [None]:

# Parse the raw dataset file: one sample per line, fields separated by "_!_".
# Field 0 is stored as the id, field 2 as the label, and fields 3+ are
# concatenated into the text (field 1 is not used).
path_data_file='./datasets/toutiao_cat_data.txt'
text=[]
label=[]
# NOTE: `id` shadows the builtin of the same name; the name is kept because
# other cells may reference it.
id=[]
# Open with an explicit encoding so decoding does not depend on the platform's
# locale default (assumes the file is UTF-8 — TODO confirm). The file is
# streamed line by line instead of being loaded whole with readlines().
with open(path_data_file,'r',encoding='utf-8') as f:
    for line in f:
        line=line.strip()
        # Skip blank lines.
        if not line:
            continue
        contents=line.split("_!_")
        label.append(contents[2])
        id.append(contents[0])
        text.append("".join(contents[3:]))
dict_origin={
    "id":id,
    "text":text,
    "label":label
}
# One row per sample, columns in a fixed order.
df_origin=pd.DataFrame(dict_origin,columns=["id","text","label"])

In [None]:
df_origin[:6]

In [None]:
# Number of samples per distinct label value.
value_counts=df_origin['label'].value_counts()

In [None]:
value_counts.to_frame()

In [None]:
# Character-frequency preparation: concatenate every sample's text into one long string.
words = "".join(df_origin['text'].tolist())

In [None]:
words[:100]

In [None]:
# Text cleaning
import re
# Character class of everything to strip: ASCII punctuation, CJK punctuation,
# the em dash, the ellipsis and any whitespace.
# Every hyphen and bracket is escaped so that each character is matched
# literally. The previous pattern contained an unescaped "-" between "}" and
# "—", which formed a range U+007D..U+2014 and silently deleted accented Latin,
# Greek, Cyrillic and other characters in that range.
re_obj=re.compile(r"[!\"#$%&'()*+,\-./;:<=>?@\[\\\]^_~`{|}—…！，。？、’“‘”；：￥（）【】《》\s]+")
def clear(text):
    """Return `text` with all punctuation and whitespace matched by `re_obj` removed.

    Args:
        text: the string to clean.

    Returns:
        A new string; characters not listed in the pattern are kept unchanged.
    """
    return re_obj.sub('',text)
# Replace the concatenated corpus with its cleaned version (result of clear()).
words=clear(words)
# Preview the first 100 characters after cleaning.
words[:100]

In [None]:
# Show the last character of the cleaned corpus.
words[-1]

In [None]:
import gc
# Force a garbage-collection pass; the cell displays the value returned by gc.collect().
gc.collect()

In [None]:
# Vocabulary: the two special tokens first, then every character of `words`
# that occurs more than once, in most_common() order.
vocab = ['<pad>', '<unk>']
vocab.extend(ch for ch, freq in Counter(words).most_common() if freq > 1)
# Two-way lookup tables between vocabulary position and token.
index2word = dict(enumerate(vocab))
word2index = {token: pos for pos, token in index2word.items()}

In [None]:
index2word[3]

In [None]:
max_length=512
def sentence_2_ids(sentence:str,max_len=None,vocab=None):
    """Convert a sentence into a fixed-length list of vocabulary ids.

    Each character is looked up in the vocabulary; unknown characters map to
    the '<unk>' id. Sentences longer than the target length keep only their
    LAST `max_len` characters; shorter ones are right-padded with the '<pad>' id.

    Args:
        sentence: the text to encode, processed character by character.
        max_len: target length; defaults to the module-level `max_length`.
        vocab: token -> id mapping containing '<unk>' and '<pad>'; defaults to
            the module-level `word2index`.

    Returns:
        A list of exactly `max_len` integer ids.

    Raises:
        ValueError: if `max_len` is negative.
    """
    # NOTE(review): the notebook builds its vocabulary from text passed through
    # clear(), but the sentence is not cleaned here, so punctuation is encoded
    # as '<unk>' — confirm this is intended.
    if max_len is None:
        max_len=max_length
    if max_len<0:
        raise ValueError("max_len must be non-negative, got %r"%(max_len,))
    if vocab is None:
        vocab=word2index
    unk_id=vocab['<unk>']
    pad_id=vocab['<pad>']
    # Keep the tail of over-long sentences (an explicit start index also
    # handles max_len == 0, where sentence[-0:] would return everything).
    start=max(len(sentence)-max_len,0)
    indexes=[vocab.get(word,unk_id) for word in sentence[start:]]
    # Right-pad up to the target length (no-op when already full).
    indexes.extend([pad_id]*(max_len-len(indexes)))
    return indexes

In [None]:
import torch 
class DatasetTextCNN(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded texts with their labels.

    Args:
        encodings: indexable collection with one encoded text per sample.
        labels: optional indexable collection of labels, aligned with
            `encodings`. When omitted, the dataset is unlabeled.

    Raises:
        ValueError: if `labels` is given but its length differs from `encodings`.
    """
    def __init__(self,encodings,labels=None):
        # Fail early on misaligned inputs instead of on a later index lookup.
        if labels is not None and len(labels)!=len(encodings):
            raise ValueError(
                "encodings and labels must have the same length, got %d and %d"
                %(len(encodings),len(labels))
            )
        self.encodings=encodings
        self.labels=labels
    def __getitem__(self,idx):
        """Return (encoding, label) for sample `idx`, or only the encoding when unlabeled."""
        # labels defaults to None; indexing None would raise TypeError.
        if self.labels is None:
            return self.encodings[idx]
        return self.encodings[idx],self.labels[idx]
    def __len__(self):
        """Return the number of samples."""
        return len(self.encodings)

In [None]:
# Fit the label encoder on the label strings of the whole dataset
# (fit() returns the encoder itself), then display it.
le = LabelEncoder().fit(label)
le

In [None]:
type(le)

In [None]:
# Convert a DataFrame into the TextCNN dataset type: (encoded text, label id).
def df_2_dataset_textcnn(df: DataFrame):
    """Build a DatasetTextCNN from the "text" and "label" columns of `df`.

    Side effect: a "label_id" column holding the encoded labels (from the
    module-level encoder `le`) is written into `df` itself.

    Args:
        df: frame with a "text" column and a "label" column.

    Returns:
        A DatasetTextCNN of id sequences produced by sentence_2_ids, paired
        with the label ids.
    """
    sentences = df["text"].tolist()
    # Stored on the frame first, then read back, exactly like the labels used below.
    df["label_id"] = le.transform(df["label"].tolist())
    label_ids = list(df["label_id"])
    encoded = [sentence_2_ids(sentence) for sentence in sentences]
    return DatasetTextCNN(encoded, label_ids)

In [None]:
# Fixed seed so that the three splits below are identical on every run
# (without random_state, train_test_split reshuffles on each execution).
SPLIT_SEED=42

# Hold out 10% of all samples as the test set, stratified by label.
df_train,df_test=train_test_split(df_origin,test_size=0.1,stratify=df_origin[['label']],random_state=SPLIT_SEED)
# Hold out 10% of the remainder as the evaluation set, stratified by label.
df_train,df_eval=train_test_split(df_train,test_size=0.1,stratify=df_train[['label']],random_state=SPLIT_SEED)

# Keep only 20% of the remaining training samples; the other 80% is discarded.
df_train,_=train_test_split(df_train,test_size=0.8,stratify=df_train[['label']],random_state=SPLIT_SEED)

# Report the (rows, columns) shape of each split.
pprint(df_train.shape)
pprint(df_test.shape)
pprint(df_eval.shape)

In [None]:
# Encode each split into a dataset, in train / eval / test order.
dataset_train, dataset_eval, dataset_test = map(
    df_2_dataset_textcnn, (df_train, df_eval, df_test)
)

In [None]:
len(dataset_train)

In [None]:
def collate_fn(batch):
    """Collate a list of (ids, label) samples into two LongTensors.

    Args:
        batch: iterable of 2-tuples (sequence of ids, label).

    Returns:
        A pair (texts, labels): the stacked id sequences and the labels,
        each as a torch.LongTensor.
    """
    # Transpose the list of pairs into a tuple of id rows and a tuple of labels.
    id_rows, targets = zip(*batch)
    return torch.LongTensor(id_rows), torch.LongTensor(targets)

In [None]:
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

# True when a CUDA device is available.
use_cuda = torch.cuda.is_available()
# Same value as max_length used by sentence_2_ids.
MAX_LENGTH = 512
# presumably the convolution kernel sizes of the TextCNN — TODO confirm against the model definition
FILTERS = [2, 3, 4, 5]
# Number of distinct labels in the dataset.
NUM_LABEL = len(value_counts)

In [None]:
use_cuda