In [259]:
from __future__ import print_function

def LatexMatrix(matrix):
    ltx = r'\left[\begin{array}'
    m, n = matrix.shape
    ltx += '{' + ("r" * n) + '}'
    for i in range(m):
        ltx += r" & ".join([('%.4f' % j.item()) for j in matrix[i]]) + r" \\ "
    ltx += r'\end{array}\right]'
    return ltx

def hMx(theta, X):
    return X*theta

def JMx(theta,X,y):
    m = len(y)
    J = np.sqrt(1.0/m*((X*theta-y).T*(X*theta-y)))
    return J.item()

def dJMx(theta,X,y):
    return (X.T*(X*theta-y))/(len(y)*np.sqrt(len(y)*(X*theta-y).T*(X*theta-y)))

def norm(X,y):
    return (X.T*X)**-1*X.T*y

def GDMx(fJ, fdJ, theta, X, y, alpha=0.1, eps=10**-3):
    errorCurr = fJ(theta, X, y)
    errors = [[errorCurr, theta]]
    step=0
    while True:
        step+=1
        theta = theta - alpha * fdJ(theta, X, y) # implementacja wzoru
        errorCurr, errorPrev = fJ(theta, X, y), errorCurr
        if abs(errorPrev - errorCurr) <= eps:
            break
        if step>10000:
            break
        errors.append([errorCurr, theta]) 
    return theta, errors

def ASGD(fJ, fdJ, theta, X, y, alpha=0.001, maxEpochs=1.0, batchSize=100, adaGrad=False, logError=False):
    m, n = X.shape
    ghistory=np.matrix(np.zeros(n)).reshape(n,1)
    start, end = 0, batchSize
    maxSteps = (m * float(maxEpochs)) / batchSize
    errorCurr = fJ(theta, X, y)
    errors = [[errorCurr, theta]]
    for i in range(int(maxSteps)):
        XBatch, yBatch =  X[start:end,:], y[start:end,:]
        g=fdJ(theta, XBatch, yBatch)
        if adaGrad:
            ghistory+=np.multiply(g,g)
            g2=np.multiply(1.0/np.sqrt(ghistory + 10**-6),g)
            theta = theta - alpha * g2
        else:
            theta = theta - alpha*g
        if start + batchSize < m:
            start += batchSize
        else:
            start = 0
            IND= np.random.permutation(m)
            X=X[IND]
            y=y[IND]
        end = min(start + batchSize, m)
        errorCurr, errorPrev = fJ(theta, X, y), errorCurr
        errors.append([errorCurr, theta]) 
    return theta, errors

In [260]:
import csv
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from IPython.display import display, Math, Latex
#jak beda chinskie znaczki to problem w open albo readerze
reader = csv.reader(open("train.tsv", 'r', encoding='utf-8'), delimiter='\t')
all_features=[]
txtfeatures=[]
for cena,rooms,metr,pietro,adres,opis in reader:
    all_features.append([float(cena),int(rooms),float(metr),int(pietro)])
    txtfeatures.append([adres,opis])

reader = csv.reader(open("devin.tsv", 'r', encoding='utf-8'), delimiter='\t')
test_features=[]
test_txtfeatures=[]
for rooms,metr,pietro,adres,opis in reader:
    test_features.append([int(rooms),float(metr),int(pietro)])
    test_txtfeatures.append([adres,opis])
    
reader = csv.reader(open("devexpected.tsv", 'r', encoding='utf-8'), delimiter='\t')
test_expected=[]
for cena in reader:
    test_expected.append(float(cena[0]))

"""reader = csv.reader(open("gratka2.txt", 'r', encoding='utf-8'), delimiter=',')
labels=next(reader)
for _,cena,rooms,metr,pietro,adres,opis in reader:
    all_features.append([float(cena)/1000.0,int(rooms),float(metr),int(pietro)])
    txtfeatures.append([adres,opis])"""

all_features=(np.array(all_features))
txtfeatures=(np.array(txtfeatures))
test_txtfeatures=np.array(test_txtfeatures)
test_features=(np.array(test_features))
test_expected=(np.array(test_expected))
#print (test_features[:5])

In [261]:
#ODSZUMIANIE
ceny=all_features[:,0]
lpokoi=all_features[:,1]
metraz=all_features[:,2]
pietro=all_features[:,3]

tceny=test_expected
tlpokoi=test_features[:,0]
tmetraz=test_features[:,1]
tpietro=test_features[:,2]

#ŚREDNIE
mcena=ceny.mean(axis=0)
mrooms=lpokoi.mean(axis=0)
mmeters=metraz.mean(axis=0)
mfloor=pietro.mean(axis=0)

tmcena=tceny.mean(axis=0)
tmrooms=tlpokoi.mean(axis=0)
tmmeters=tmetraz.mean(axis=0)
tmfloor=tpietro.mean(axis=0)



#STD
stdcena=ceny.std(axis=0)
stdrooms=lpokoi.std(axis=0)
stdmeters=metraz.std(axis=0)
stdfloor=pietro.std(axis=0)

tstdcena=tceny.std(axis=0)
tstdrooms=tlpokoi.std(axis=0)
tstdmeters=tmetraz.std(axis=0)
tstdfloor=tpietro.std(axis=0)

tnum=3

test_new=test_txtfeatures
test_new=np.insert(test_txtfeatures,0,tceny.astype(float),axis=1)
test_new=np.insert(test_new,1,tlpokoi,axis=1)
test_new=np.insert(test_new,2,tmetraz,axis=1)
test_new=np.insert(test_new,3,tpietro,axis=1)

print (test_new[:3])
test_new=test_new[test_new[:,0].astype(float)<tmcena+tnum*tstdcena]
test_new=test_new[test_new[:,1].astype(float)<tmrooms+tnum*tstdrooms]
test_new=test_new[test_new[:,2].astype(float)<tmmeters+tnum*tstdmeters]
test_new=test_new[test_new[:,3].astype(float)<tmfloor+tnum*tstdfloor]


test_new=test_new[test_new[:,0].astype(float)>tmcena-tnum*tstdcena]
test_new=test_new[test_new[:,1].astype(float)>tmrooms-tnum*tstdrooms]
test_new=test_new[test_new[:,2].astype(float)>tmmeters-tnum*tstdmeters]
test_new=test_new[test_new[:,3].astype(float)>tmfloor-tnum*tstdfloor]

features_new=txtfeatures
features_new=np.insert(features_new,0,ceny.astype(float),axis=1)
features_new=np.insert(features_new,1,lpokoi,axis=1)
features_new=np.insert(features_new,2,metraz,axis=1)
features_new=np.insert(features_new,3,pietro,axis=1)

features_new=features_new[features_new[:,0].astype(float)<mcena+tnum*stdcena]
features_new=features_new[features_new[:,1].astype(float)<mrooms+tnum*stdrooms]
features_new=features_new[features_new[:,2].astype(float)<mmeters+tnum*stdmeters]
features_new=features_new[features_new[:,3].astype(float)<mfloor+tnum*stdfloor]


features_new=features_new[features_new[:,0].astype(float)>mcena-tnum*stdcena]
features_new=features_new[features_new[:,1].astype(float)>mrooms-tnum*stdrooms]
features_new=features_new[features_new[:,2].astype(float)>mmeters-tnum*stdmeters]
features_new=features_new[features_new[:,3].astype(float)>mfloor-tnum*stdfloor]

print ('\nNowa długość zestawu trenującego',len(features_new))
print ('\nNowa długość zestawu trenującego',len(test_new))
# ODSZUMIANIE TESTOWYCH#
#test_features=test_new[:,[1,2,3]].astype(float)
#test_expected=test_new[:,[0]].astype(float)
#test_txtfeatures=test_new[:,[4,5]]

[['230.0' '1.0' '43.0' '3.0' 'Poznań ul. Polna'
  'KAMIENICA PO REWITALIZACJI Mieszkanie znajduje się w kamienicy z 1910r. na 3 piętrze Budynek 3 piętrowy. Stan techniczny lokalu - do wykończenia drewniana podłoga drzwi antywłamaniowe . Ogrzewanie gazowe. LOKALIZACJA: Mieszkanie znajduje...']
 ['495.0' '4.0' '129.7' '1.0' 'Poznań Jeżyce Strzeszyn ul. Owidiusza'
  '*** WYSOKI STANDARD *** DUŻA POWIERZCHNIA *** DOSTĘPNE OD RĘKI *** Przestronne czteropokojowe mieszkanie o powierzchni 130m2 w spokojnej okolicy Strzeszyna. Mieszkanie położone na parterze w dwupiętrowym budynku z cegły wybudowanym...']
 ['256.0' '2.0' '39.3' '1.0' 'Poznań Wilda ul. 28 Czerwca 1956 R.'
  'Jasne mieszkanie na pierwszym piętrze w kamienicy w Poznaniu w dzielnicy Wilda. Dwa pokoje z aneksem kuchennym okna wychodzące na zachód; widok na podwórko. Standard wykończenia: Ściany - gładź gipsowa; Podłogi - wylewka samopoziomująca; Okna...']]

Nowa długość zestawu trenującego 4311

Nowa długość zestawu trenującego 477


In [262]:
print (test_expected[:3])

[ 230.  495.  256.]


In [263]:
test_matrix=(np.matrix(test_features.astype(float)))
test_expected=(np.matrix(test_expected))
s1,s2=test_matrix.shape
yTx = test_expected.reshape(s1,1)
XTx = np.matrix(np.concatenate((np.ones((s1, 1)), test_matrix.reshape(s1,s2)), axis=1)).reshape(s1,s2+1)

strening=np.matrix(features_new[:,:4].astype(float))
s1,s2=strening.shape
print (s1,s2)
XMx = np.matrix(np.concatenate((np.ones((s1, 1)), strening[:,1:s2].reshape(s1,s2-1)), axis=1)).reshape(s1,s2)
yMx = np.matrix(strening[:,0]).reshape(s1,1)

print(XMx.shape, yMx.shape)
print (True in (XMx[:,3]==0))
print (yMx[:2])
print (yTx[:3])

4311 4
(4311, 4) (4311, 1)
False
[[ 269.]
 [ 320.]]
[[ 230.]
 [ 495.]
 [ 256.]]


In [264]:
# NORMALIZACJA
"""Vec_1_mean=np.mean(XMx[0:,1], axis=0)
Vec_2_mean=np.mean(XMx[0:,2], axis=0)
Vec_3_mean=np.mean(XMx[0:,3], axis=0)
Vec_1_std=np.std(XMx[0:,1], axis=0,dtype=float)
Vec_2_std=np.std(XMx[0:,2], axis=0,dtype=float)
Vec_3_std=np.std(XMx[0:,3], axis=0,dtype=float)
XMx[0:,1]=(XMx[0:,1]-Vec_1_mean)/Vec_1_std
XMx[0:,2]=(XMx[0:,2]-Vec_2_mean)/Vec_2_std
XMx[0:,3]=(XMx[0:,3]-Vec_3_mean)/Vec_3_std
"""

"""
M1=np.max(XMx[0:,1])
M2=np.max(XMx[0:,2])
M3=np.max(XMx[0:,3])

XMx[0:,1]=(XMx[0:,1])/(np.max(XMx[0:,1]))
XMx[0:,2]=(XMx[0:,2])/(np.max(XMx[0:,2]))
XMx[0:,3]=(XMx[0:,3])/(np.max(XMx[0:,3]))

XTx[0:,0]=(XTx[0:,0])/(np.max(XTx[0:,0]))
XTx[0:,1]=(XTx[0:,1])/(np.max(XTx[0:,1]))
XTx[0:,2]=(XTx[0:,2])/(np.max(XTx[0:,2]))

Eq=([1,M1,M2,M3])
def change_theta(theta,mult):
    for i in range(len(mult)):
        theta[i]=theta[i]*mult[i]
    return theta
print (XTx.shape, yTx.shape)
"""

'\nM1=np.max(XMx[0:,1])\nM2=np.max(XMx[0:,2])\nM3=np.max(XMx[0:,3])\n\nXMx[0:,1]=(XMx[0:,1])/(np.max(XMx[0:,1]))\nXMx[0:,2]=(XMx[0:,2])/(np.max(XMx[0:,2]))\nXMx[0:,3]=(XMx[0:,3])/(np.max(XMx[0:,3]))\n\nXTx[0:,0]=(XTx[0:,0])/(np.max(XTx[0:,0]))\nXTx[0:,1]=(XTx[0:,1])/(np.max(XTx[0:,1]))\nXTx[0:,2]=(XTx[0:,2])/(np.max(XTx[0:,2]))\n\nEq=([1,M1,M2,M3])\ndef change_theta(theta,mult):\n    for i in range(len(mult)):\n        theta[i]=theta[i]*mult[i]\n    return theta\nprint (XTx.shape, yTx.shape)\n'

In [265]:
thetaStartMx = np.matrix([0,0,0,0]).reshape(4,1)
thetaBestMx1, errors1 = GDMx(JMx, dJMx, thetaStartMx, 
                       XMx, yMx, alpha=0.1, eps=10**-6)

display(Math(r'\large\textrm{Wynik bez Adagrad:}\quad \theta = ' + 
            LatexMatrix(thetaBestMx1) + 
             (r' \quad J(\theta) = %.4f' % errors1[-1][0])  
             + r' \quad \textrm{po %d iteracjach}' % len(errors1)))
print (JMx(thetaBestMx1,XTx,yTx))
#print (hMx(thetaBestMx1,XTx))

<IPython.core.display.Math object>

1330.291987034057


In [266]:
thetaStartMx = np.matrix([0,0,0,0]).reshape(4,1)
thetaBestMx1, errors1=ASGD(JMx, dJMx, thetaStartMx, XMx, yMx, alpha=0.01, maxEpochs=50, batchSize=25, adaGrad=False, logError=False)
display(Math(r'\large\textrm{Wynik bez Adagrad z epokami:}\quad \theta = ' + 
            LatexMatrix(thetaBestMx1) + 
             (r' \quad J(\theta) = %.4f' % errors1[-1][0])  
             + r' \quad \textrm{po %d iteracjach}' % len(errors1)))
print (JMx(thetaBestMx1,XTx,yTx))

<IPython.core.display.Math object>

1330.3535750968813


In [267]:
thetaStartMx = np.matrix([0,0,0,0]).reshape(4,1)
thetaBestMx1, errors1=ASGD(JMx, dJMx, thetaStartMx, XMx, yMx, alpha=1, maxEpochs=50, batchSize=25, adaGrad=True, logError=False)
display(Math(r'\large\textrm{Wynik z Adagradem:}\quad \theta = ' + 
            LatexMatrix(thetaBestMx1) + 
             (r' \quad J(\theta) = %.4f' % errors1[-1][0])  
             + r' \quad \textrm{po %d iteracjach}' % len(errors1)))
print (JMx(thetaBestMx1,XTx,yTx))

<IPython.core.display.Math object>

1330.0221037759206


In [268]:
thetaNorm = norm(XMx, yMx)
display(Math(r'\Large \theta = ' + LatexMatrix(thetaNorm)))
display(Math(r'\Large J(\theta) = %.4f' % JMx(thetaNorm, XMx, yMx)))
print (JMx(thetaNorm,XTx,yTx))

<IPython.core.display.Math object>

<IPython.core.display.Math object>

1329.5244491257943


## Nowe parametry

In [269]:
#0-jedynki 1-lpokoi 2-metraz 3-pietro 4-metraz_lpokoi 5-lpokoi_pietro 6-sqrPOW

#metraz_pietro=np.divide(features_1[:,2],features_1[:,3])

TXMx= np.concatenate((XMx, np.divide(XMx[:,1],XMx[:,3])), axis=1)
TXMx= np.concatenate((TXMx, np.divide(np.sqrt(XMx[:,2]),XMx[:,1])), axis=1)
#TXMx= np.concatenate((TXMx, np.sqrt(XMx[:,2])), axis=1)
TXTx= np.concatenate((XTx, np.divide(XTx[:,0],XTx[:,2])), axis=1)
TXTx= np.concatenate((TXTx, np.divide(np.sqrt(XTx[:,1]),XTx[:,0])), axis=1)
#TXTx= np.concatenate((TXTx, np.sqrt(XTx[:,1])), axis=1)
#print (TXMx[:10])

In [270]:
thetaStartMx = np.zeros(TXMx.shape[1]).reshape(TXMx.shape[1],1)
thetaBestMx1, errors1=ASGD(JMx, dJMx, thetaStartMx, TXMx, yMx, alpha=1, maxEpochs=15, batchSize=50, adaGrad=True, logError=False)
display(Math(r'\large\textrm{Wynik z Adagrad:}\quad \theta = ' + 
            LatexMatrix(thetaBestMx1) + 
             (r' \quad J(\theta) = %.4f' % errors1[-1][0])  
             + r' \quad \textrm{po %d iteracjach}' % len(errors1)))
print (JMx(thetaBestMx1,TXTx,yTx))

<IPython.core.display.Math object>

1330.891467376201


In [271]:
thetaNorm = norm(TXMx[:,[0,1,2,3,4,5]], yMx)
display(Math(r'\Large \theta = ' + LatexMatrix(thetaNorm)))
display(Math(r'\Large J(\theta) = %.4f' % JMx(thetaNorm, TXMx[:,[0,1,2,3,4,5]], yMx)))

print (JMx(thetaNorm,TXTx,yTx))
result=hMx(thetaNorm,TXTx)
with open ("outpucik.tsv","w") as f:
    for i in range(len(result)):
        print (result[i,0], file=f)

<IPython.core.display.Math object>

<IPython.core.display.Math object>

1328.6483035690994


# TESTY


In [272]:
import re
def poldel(t):
    t=re.sub(r'Ż|Ź|ź|ż',r'z',t)
    t=re.sub(r'Ą|ą',r'a',t)
    t=re.sub(r'Ć|ć',r'c',t)
    t=re.sub(r'Ę|ę',r'e',t)
    t=re.sub(r'Ł|ł',r'l',t)
    t=re.sub(r'Ń|ń',r'n',t)
    t=re.sub(r'Ó|ó',r'o',t)
    t=re.sub(r'Ś|ś',r's',t)
    t_nopl=t.split()
    return t
kat1 = {

    'podolany': '2', 
    'umultowo': '3',
   'radojewo': '6', 
    'morasko': '6',
    'strzeszyn': '6',  
    'anotoninek': '2',
    'kiekrz': '2',
    'krzyzowniki': '2',
    'smochowice': '2',
    'szczepankowo': '2',
    'kwiatowe': '2',
    'fabianowo': '2',
    'kotowo': '2',
    'swierczewo': '2',
    'gluszyna': '2',
    'krzesiny': '2',
    'pokrzywno': '2',
    'garaszewo': '2',
    'splawie': '2',
    'krzesinki': '2',
    'zieliniec': '2',
    'kobylepole': '2',
}

kat2 = {
     'marysienki': '8',
        'piatkowo': '7',
    'sobieskiego': '8',
     'ostrow tumski': '10',  
        'naramowice': '1',
        'rataje': '6',  
    'grunwald': '6',
     'winiary': '6',
    'chartowo': '2',
    'warszawskie': '2',
    'pomet': '2',
    'maltanskie': '2',
    'zawady': '2',
    'srodka': '2',
    'glowna': '2',
    'winogrady': '7',
     'stare miasto': '9',
    'lazarz': '6',
    'staroleka': '0',
    'minikowo': '1',
     'debiec': '2',
     'wola': '2',
    'anotoninek': '2',
     'chartowo': '3',
     'gorczyn': '3',
    'ogrody': '3',
       'wilda': '4',
     'lawica': '4',
    'solacz': '2',
     'zegrze': '4',
     'swierczewo': '6',
    'nowe miasto': '6',

}

#NOWE DANE DLA ZESTAWU TRENUJĄCEGO
dzielkat1=[]
dzielkat2=[]
kawalerka1=[]
#INDEKS 5 7
for row in features_new:
    n_row5=poldel(row[4].lower())
    n_row7=poldel(row[5].lower())
    for keys in kat1:
        match=0
        if (re.search(keys,n_row7) or re.search(keys,n_row5)):
            dzielkat1.append(1)
            match=1
            break;
    if (match==0):
            dzielkat1.append(0)
    if (re.search("kawaler",n_row7)):
        kawalerka1.append(-1)
    else:
        kawalerka1.append(2)
            
    match=0
   
    for keys in kat2:
        if (re.search(keys,n_row7) or re.search(keys,n_row5)):
            dzielkat2.append(1)
            match=1
            break;
    if (match==0):
            dzielkat2.append(0)
    match=0
      
dzielkat1=(np.array(dzielkat1))
dzielkat2=(np.array(dzielkat2))

#NOWE DANE DLA ZESTAWU TESTOWEGO
dzielkat3=[]
dzielkat4=[]
kawalerka2=[]
#INDEKS 5 7
for row in test_txtfeatures:
    n_row5=poldel(row[0].lower())
    n_row7=poldel(row[1].lower())
    for keys in kat1:
        match=0
        if (re.search(keys,n_row7) or re.search(keys,n_row5)):
            dzielkat3.append(1)
            match=1
            break;
    if (match==0):
            dzielkat3.append(0)
    if (re.search("kawaler",n_row7)):
        kawalerka2.append(-1)
    else:
        kawalerka2.append(2)
    match=0
   
    for keys in kat2:
        if (re.search(keys,n_row7) or re.search(keys,n_row5)):
            dzielkat4.append(1)
            match=1
            break;
    if (match==0):
            dzielkat4.append(0)
    match=0
      
dzielkat3=(np.array(dzielkat3))
dzielkat4=(np.array(dzielkat4))
print (len(dzielkat4))
print (len(dzielkat1))
print (sum(kawalerka1))

500
4311
7785


In [273]:
print (TXMx.shape)
print (TXTx.shape)
nTXMx=np.insert(TXMx,6,dzielkat1,axis=1)
nTXMx=np.insert(nTXMx,7,dzielkat2,axis=1)
nTXMx=np.insert(nTXMx,8,kawalerka1,axis=1)
#TXMx= np.concatenate((TXMx, dzielkat1), axis=1)
#TXMx= np.concatenate((TXMx, dzielkat2), axis=1)

nTXTx=np.insert(TXTx,6,dzielkat3,axis=1)
nTXTx=np.insert(nTXTx,7,dzielkat4,axis=1)                 
nTXTx=np.insert(nTXTx,8,kawalerka2,axis=1)                 

#TXTx= np.concatenate((TXTx, dzielkat3), axis=1)
#TXTx= np.concatenate((TXTx, dzielkat4), axis=1)

print (nTXMx[:3])
print (nTXTx[:3])

(4311, 6)
(500, 6)
[[  1.           3.          55.           1.           3.           2.47206616
    0.           1.           2.        ]
 [  1.           3.          79.          10.           0.3          2.96273147
    0.           1.           2.        ]
 [  1.           1.          31.21         1.           1.           5.58659109
    0.           1.          -1.        ]]
[[  1.00000000e+00   1.00000000e+00   4.30000000e+01   3.00000000e+00
    2.32558140e-02   1.00000000e+00   0.00000000e+00   0.00000000e+00
    2.00000000e+00]
 [  1.00000000e+00   4.00000000e+00   1.29700000e+02   1.00000000e+00
    7.71010023e-03   2.00000000e+00   1.00000000e+00   0.00000000e+00
    2.00000000e+00]
 [  1.00000000e+00   2.00000000e+00   3.93000000e+01   1.00000000e+00
    2.54452926e-02   1.41421356e+00   0.00000000e+00   1.00000000e+00
    2.00000000e+00]]


In [275]:
thetaNorm = norm(nTXMx[:,[0,1,2,3,4,5,6,7,8]], yMx)
display(Math(r'\Large \theta = ' + LatexMatrix(thetaNorm)))
display(Math(r'\Large J(\theta) = %.4f' % JMx(thetaNorm, nTXMx[:,[0,1,2,3,4,5,6,7,8]], yMx)))

print (JMx(thetaNorm,nTXTx[:,[0,1,2,3,4,5,6,7,8]],yTx))

<IPython.core.display.Math object>

<IPython.core.display.Math object>

1328.6826437025984
