In [None]:
import matplotlib.pyplot as plt
import matplotlib
import pandas as pd, numpy as np
# Use the 'simhei' font for sans-serif text -- presumably so that the Chinese
# column names used in this notebook render in plots; confirm the font is installed.
matplotlib.rcParams[u'font.sans-serif'] = ['simhei']
# Draw minus signs with the ASCII hyphen rather than the Unicode minus glyph.
matplotlib.rcParams['axes.unicode_minus'] = False

In [None]:
def get_data3(ratio = 0.6):
    """Load ``./data3.csv`` and return a class-stratified train/test split.

    The ``编号`` (id) column is dropped, the ``好瓜`` label is mapped to
    1.0 (``'是'``) / 0.0 (anything else), and every remaining non-numeric
    column is replaced by float one-hot columns placed in front of the frame,
    so the label stays the last column.

    Parameters
    ----------
    ratio : float
        Fraction of each class (positive and negative separately) that goes
        into the training set; the remainder goes into the test set.

    Returns
    -------
    (data_train, data_test) : tuple of pandas.DataFrame
        Both frames have a fresh 0..n-1 index; positives come first.
        Rows are shuffled with the global NumPy random state (no fixed seed).
    """
    data = pd.read_csv('./data3.csv').drop(columns=['编号'])
    # Vectorised label encoding (Series.bool() is deprecated in recent pandas).
    data['好瓜'] = (data['好瓜'] == '是').astype('float64')

    # The label is already numeric here, so only categorical features remain.
    discrete_fea = data.select_dtypes(exclude='number').columns
    for fea in discrete_fea:
        onehot_fea = pd.get_dummies(data[[fea]], dtype='float')
        data = pd.concat([onehot_fea, data.drop(columns=[fea])], axis=1)

    # Shuffle each class independently, then cut both at the same ratio.
    data_posi = data[data['好瓜'] == 1].sample(frac = 1).reset_index(drop=True)
    data_nega = data[data['好瓜'] == 0].sample(frac = 1).reset_index(drop=True)
    n_posi = int(len(data_posi) * ratio)
    n_nega = int(len(data_nega) * ratio)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data_train = pd.concat([data_posi[:n_posi], data_nega[:n_nega]]).reset_index(drop=True)
    data_test = pd.concat([data_posi[n_posi:], data_nega[n_nega:]]).reset_index(drop=True)

    return (data_train, data_test)

In [None]:
import random, math

def idxing(x):
    """Map an iris species name to its integer code.

    'Iris-setosa' -> 2, 'Iris-versicolor' -> 0, anything else -> 1.
    """
    if x == 'Iris-setosa':
        return 2
    return 0 if x == 'Iris-versicolor' else 1

def get_data_iris(ratio = 0.7):
    """Load ``./data_iris.csv`` as a two-class problem and split it by class.

    The header-less CSV is read with the columns ``sepal_l, sepal_w, petal_l,
    petal_w, type``. Species names are converted with ``idxing`` and the rows
    coded 2 (Iris-setosa) are discarded, leaving labels 0 and 1 in the last
    column.

    Parameters
    ----------
    ratio : float
        Fraction of each class that goes into the training set.

    Returns
    -------
    (data_train, data_test) : tuple of pandas.DataFrame
        Both frames have a fresh 0..n-1 index; label-1 rows come first.
        Rows are shuffled with the global NumPy random state (no fixed seed).
    """
    data = pd.read_csv('./data_iris.csv', names=['sepal_l', 'sepal_w', 'petal_l', 'petal_w', 'type'])

    data['type'] = data['type'].apply(idxing)
    data = data[data.type != 2]

    data = data.sample(frac = 1).reset_index(drop=True)
    data_posi = data[data.type == 1].sample(frac = 1).reset_index(drop=True)
    data_nega = data[data.type == 0].sample(frac = 1).reset_index(drop=True)
    n_posi = int(len(data_posi) * ratio)
    n_nega = int(len(data_nega) * ratio)

    # DataFrame.append was removed in pandas 2.0; pd.concat is the replacement.
    data_train = pd.concat([data_posi[:n_posi], data_nega[:n_nega]]).reset_index(drop=True)
    data_test = pd.concat([data_posi[n_posi:], data_nega[n_nega:]]).reset_index(drop=True)

    return (data_train, data_test)

In [None]:
def sigmoid(x):
    """Logistic function 1 / (1 + exp(-x)), applied element-wise via NumPy."""
    denominator = 1.0 + np.exp(-x)
    return 1.0 / denominator

In [None]:
class NN(object):
    """Feed-forward network with one sigmoid hidden layer, trained by back-propagation.

    Layout: ``d`` inputs -> ``q`` hidden units (``q == d``) -> ``l == 1`` output.
    Each layer computes ``sigmoid(W.T @ input - threshold)``.

    Parameters
    ----------
    data_pair : (DataFrame, DataFrame)
        ``(data_train, data_test)``; in both frames the last column is the
        0/1 label and every other column is a feature.
    by_whole : bool
        False: apply an update after every training sample.
        True: accumulate the gradients over a whole pass and apply one
        averaged update per pass.
    lamda : float
        Weight of the regularization term in ``update``; the error-gradient
        step is scaled by ``1 - lamda``.
    """

    def __init__(self, data_pair, by_whole = False, lamda = 2e-8):
        (self.data_train, self.data_test) = data_pair
        self.by_whole = by_whole
        self.lamda = lamda ## weight of the regularization term
        self.d = self.data_train.shape[1] - 1
        self.q = self.d
        self.l = 1
        self.data = [ np.zeros(self.d), np.zeros(self.q), np.zeros(self.l) ] ## [x, b, y]
        self.weight = [ np.random.random((self.d, self.q)), np.random.random((self.q, self.l)) ] ## [v, w]
        self.threshold = [ np.random.random(self.q), np.random.random(self.l) ] ## [gamma, theta]
        self.clear()

    def clear(self):
        """Reset all accumulated parameter deltas to zero."""
        self.delta_w = np.zeros((self.q, self.l))
        self.delta_theta = np.float64(0.0)
        self.delta_v = np.zeros((self.d, self.q))
        self.delta_gamma = np.zeros(self.q)

    def update(self, n):
        """Apply the accumulated deltas (averaged over ``n`` samples) and clear them.

        Every parameter moves by ``(1 - lamda) * delta / n`` and is decayed by
        ``lamda`` times its own value, i.e. the gradient step of a
        squared-magnitude penalty. The decay is element-wise: subtracting the
        sum of all parameters from each one would couple unrelated weights.
        """
        self.weight[1] += (1 - self.lamda) * self.delta_w / float(n) - self.lamda * self.weight[1]
        self.threshold[1] += (1 - self.lamda) * self.delta_theta / float(n) - self.lamda * self.threshold[1]
        self.weight[0] += (1 - self.lamda) * self.delta_v / float(n) - self.lamda * self.weight[0]
        self.threshold[0] += (1 - self.lamda) * self.delta_gamma / float(n) - self.lamda * self.threshold[0]
        self.clear()

    @staticmethod
    def _is_correct(_y, y):
        """True when thresholding the output ``_y`` at 0.5 reproduces label ``y``."""
        return (_y < 0.5 and y == 0.0) or (_y >= 0.5 and y == 1.0)

    def forward(self, sample):
        """Run one forward pass.

        ``sample`` is one row (a pandas Series): features followed by the
        label. The activations are cached in ``self.data`` for ``bp``.
        Returns ``(y_hat, y)``: the network output and the true label.
        """
        sample = sample.values
        self.data[0] = sample[:-1]
        self.data[1] = sigmoid(np.dot(self.weight[0].T, self.data[0]) - self.threshold[0])
        self.data[2] = sigmoid(np.dot(self.weight[1].T, self.data[1]) - self.threshold[1])
        (_y, y) = self.data[2][0], sample[-1]
        return (_y, y)

    def bp(self, _y, y, alpha = 4.0, beta = 1.0, to_update = True):
        """Accumulate the squared-error gradients for the last forwarded sample.

        ``_y`` / ``y`` are the output and label returned by ``forward``.
        ``alpha`` is the learning rate of the input->hidden parameters and
        ``beta`` that of the hidden->output parameters (alpha = 12.0,
        beta = 3.0 is an alternative setting left by the author).
        With ``to_update`` set, the deltas are applied immediately.
        """
        b = self.data[1]
        g = _y * (1.0 - _y) * (y - _y) ## output-layer gradient, scalar since l == 1
        ## Hidden-layer gradient: back-propagate g through the output weights.
        e = b * (1.0 - b) * np.dot(self.weight[1], np.atleast_1d(g)) ## (q, )

        self.delta_w += beta * g * b.reshape(self.delta_w.shape[0], 1) ## (q, l)
        self.delta_theta += -beta * g
        self.delta_v += alpha * np.outer(self.data[0], e) ## (d, q)
        self.delta_gamma += -alpha * e ## (q, )
        if to_update: self.update(1)

    def fit(self, iteration = 100000):
        """Train for ``iteration + 1`` passes over the training set.

        Every 1000th pass prints the training accuracy (measured while
        training) and, when the test set is non-empty, the test accuracy.
        """
        for T in range(iteration + 1):
            verbose = (T % 1000 == 0)
            if verbose: print ('iter {}:'.format(T))

            ## data_train
            acc, n = 0.0, self.data_train.shape[0]
            for i in range(n):
                (_y, y) = self.forward(self.data_train.iloc[i])
                acc += self._is_correct(_y, y)
                ## by_whole selects the update mode; it must not be passed
                ## positionally, where it would be taken as alpha.
                self.bp(_y, y, to_update = not self.by_whole)
            if verbose and n > 0: print ('train_accuracy: {}'.format(acc / float(n)))
            if self.by_whole and n > 0: self.update(n)

            ## data_test
            acc, n = 0.0, self.data_test.shape[0]
            if n == 0: continue
            for i in range(n):
                (_y, y) = self.forward(self.data_test.iloc[i])
                acc += self._is_correct(_y, y)
            if verbose: print ('test_accuracy: {}'.format(acc / float(n)))

In [None]:
## Experiment: data3

In [None]:
# Build a network on a fresh get_data3() split (default ratio), requesting
# per-sample updates (by_whole=False) with regularization weight lamda=1e-3.
nn = NN(data_pair=get_data3(), by_whole=False, lamda=1e-3)
# Train with the default number of iterations; fit() prints accuracy every 1000 passes.
nn.fit()

In [None]:
## Experiment: iris data

In [None]:
# Build a network on a fresh get_data_iris() split (default ratio), requesting
# per-sample updates (by_whole=False) with regularization weight lamda=1e-5.
nn2 = NN(data_pair=get_data_iris(), by_whole=False, lamda=1e-5)
# Train with the default number of iterations; fit() prints accuracy every 1000 passes.
nn2.fit()

In [None]:
## XOR Problem

In [None]:
def Dis(x, c):
    """Squared Euclidean distance between point ``x`` and centre ``c``."""
    offset = x - c
    return np.sum(np.square(offset))
def Rou(x, c, beta):
    """Gaussian radial basis response exp(-beta * Dis(x, c))."""
    squared_distance = Dis(x, c)
    return np.exp(-beta * squared_distance)

## RBF network on XOR: q Gaussian units with fixed random centres `c`;
## the output weights `w` and the widths `beta` are trained by per-sample
## gradient descent on the squared error.
q = 10
data = np.array([[0,0,0],[0,1,1],[1,0,1],[1,1,0]], dtype='float64')  ## columns: x1, x2, label

beta = np.zeros(q, dtype='float64')
w = np.random.random(q)

c = np.random.random((q, 2))
iteration = 100000
alpha = 1.0
log_every = 1000  ## print predictions only every `log_every` iterations

## The centres never move, so the squared distance from every sample to every
## centre is loop-invariant: compute the (n_samples, q) table once.
dist = np.array([[Dis(sample[:-1], c[j]) for j in range(q)] for sample in data])

for T in range(iteration + 1):
    verbose = (T % log_every == 0)
    if verbose: print ('Iter {}:'.format(T))
    for i in range(data.shape[0]):
        ## forward
        label = data[i][-1]
        dis = dist[i]
        rou = np.exp(-beta * dis)  ## same values as Rou(feature, c[j], beta[j]) for each j
        phi = np.dot(w, rou)

        ## bp: both gradients use the pre-update w and beta
        delta_w = -(phi - label) * rou
        delta_beta = (phi - label) * w * rou * dis

        w += alpha * delta_w
        beta += alpha * delta_beta

        if verbose: print ('{:.3f} {:.1f}'.format(phi, label))
    if verbose: print ('')

## Crawling CAPTCHAs

In [2]:
# from selenium import webdriver
# from selenium.webdriver.common.by import By
# from selenium.webdriver.support.ui import WebDriverWait
# from selenium.webdriver.support import expected_conditions as EC

# chrome_options = webdriver.ChromeOptions()
# # chrome_options.add_argument('--headless')
# driver = webdriver.Chrome(chrome_options=chrome_options)

import os
import time

import requests

url = 'http://elite.nju.edu.cn/jiaowu/ValidateCode.jsp'
headers = {
    'User-Agent':'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'
}

output_dir = './verification-decoder/result'
num_images = 100000
max_consecutive_failures = 10  # give up instead of retrying forever when the host is unreachable
retry_delay = 1.0              # seconds to wait after a failed attempt

os.makedirs(output_dir, exist_ok=True)

# Download `num_images` captcha images to output_dir/img_<i>.
# A failed attempt is retried for the same index after a short pause.
i = 0
consecutive_failures = 0
while i < num_images:
    try:
        html = requests.get(url, timeout = 1, headers = headers)
        # Do not store an HTTP error page as if it were an image.
        html.raise_for_status()
        # 'wb' rather than 'ab': re-running must overwrite, not append to, an existing image.
        with open('{}/img_{}'.format(output_dir, i), 'wb') as f:
            f.write(html.content)
    except (requests.RequestException, OSError) as err:
        print("Error {}".format(err))
        consecutive_failures += 1
        if consecutive_failures >= max_consecutive_failures:
            print("Giving up after {} consecutive failures; {} images saved".format(consecutive_failures, i))
            break
        time.sleep(retry_delay)
        continue
    consecutive_failures = 0
    i = i + 1

Error HTTPConnectionPool(host='elite.nju.edu.cn', port=80): Max retries exceeded with url: /jiaowu/ValidateCode.jsp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7f77367b28d0>, 'Connection to elite.nju.edu.cn timed out. (connect timeout=1)'))
Error HTTPConnectionPool(host='elite.nju.edu.cn', port=80): Max retries exceeded with url: /jiaowu/ValidateCode.jsp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7f77367b2ef0>, 'Connection to elite.nju.edu.cn timed out. (connect timeout=1)'))
Error HTTPConnectionPool(host='elite.nju.edu.cn', port=80): Max retries exceeded with url: /jiaowu/ValidateCode.jsp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConnection object at 0x7f77367b2c50>, 'Connection to elite.nju.edu.cn timed out. (connect timeout=1)'))
Error HTTPConnectionPool(host='elite.nju.edu.cn', port=80): Max retries exceeded with url: /jiaowu/ValidateCode.jsp (Caused by ConnectTimeoutError(<urllib3.connection.HTTPConn

KeyboardInterrupt: 