In [10]:
from __future__ import print_function

def LatexMatrix(matrix):
    """Render a 2-D numeric array as LaTeX source for a bracketed matrix.

    Each entry is formatted with four decimal places, columns are
    right-aligned, and every row (including the last) is terminated with a
    LaTeX row separator.

    Parameters
    ----------
    matrix : np.matrix or 2-D array-like of real numbers.

    Returns
    -------
    str
        LaTeX source of an ``array`` environment wrapped in square-bracket
        delimiters.

    Raises
    ------
    ValueError
        If ``matrix`` is not two-dimensional.
    """
    # Indexing or iterating an np.matrix row yields another 2-D matrix, never a
    # scalar, so element-wise formatting of a matrix with more than one column
    # fails. A plain ndarray view iterates down to scalars for both input types.
    cells = np.asarray(matrix)
    _, n = cells.shape
    body = "".join(
        r" & ".join('%.4f' % value.item() for value in row) + r" \\ "
        for row in cells
    )
    return r'\left[\begin{array}' + '{' + ("r" * n) + '}' + body + r'\end{array}\right]'

def hMx(theta, X):
    """Linear hypothesis: return ``X * theta``.

    The result is whatever the ``*`` operator of the operands produces; for
    ``np.matrix`` operands that is the matrix product, i.e. one prediction per
    row of ``X``.
    """
    prediction = X * theta
    return prediction

def JMx(theta,X,y):
    """Root-mean-square error of the linear model ``X * theta`` against ``y``.

    Computes ``sqrt(1/m * r.T * r)`` with ``r = X*theta - y`` and ``m = len(y)``,
    and returns it as a plain Python scalar.
    """
    residual = X*theta - y
    sampleCount = len(y)
    squaredSum = residual.T*residual
    return np.sqrt(1.0/sampleCount*squaredSum).item()

def dJMx(theta,X,y):
    """Gradient with respect to ``theta`` of the RMSE cost ``sqrt(1/m * r.T*r)``.

    With ``r = X*theta - y``, ``S = r.T*r`` and ``m = len(y)`` the cost is
    ``J = sqrt(S/m)`` and its gradient is ``X.T*r / (m * J)``.

    Parameters
    ----------
    theta : np.matrix, shape (n, 1)
    X : np.matrix, shape (m, n)
    y : np.matrix, shape (m, 1)

    Returns
    -------
    np.matrix, shape (n, 1)
        The gradient. When the residual is exactly zero (where the square root
        is not differentiable) a zero vector is returned instead of NaN.
    """
    m = len(y)
    residual = X*theta - y
    numerator = X.T*residual
    squaredSum = (residual.T*residual).item()
    if squaredSum == 0:
        # A zero residual makes the numerator all zeros as well; return it
        # rather than evaluating 0/0.
        return numerator
    # Denominator is m * J = m * sqrt(S/m). Dividing by m * sqrt(m*S) instead
    # would shrink the gradient by an extra factor of m.
    return numerator/(m*np.sqrt(squaredSum/m))

def norm(X,y):
    """Least-squares parameters from the normal equations ``(X.T X) theta = X.T y``.

    The linear system is solved directly rather than by forming the explicit
    inverse of ``X.T X``, which is the numerically preferred way to obtain the
    same solution.

    Parameters
    ----------
    X : np.matrix or 2-D array-like, shape (m, n)
    y : np.matrix or 2-D array-like, shape (m, 1)

    Returns
    -------
    np.matrix, shape (n, 1)

    Raises
    ------
    numpy.linalg.LinAlgError
        If ``X.T X`` is singular.
    """
    # Coerce to np.matrix so that ``*`` is a matrix product even when plain
    # ndarrays are passed in.
    X = np.asmatrix(X)
    y = np.asmatrix(y)
    return np.asmatrix(np.linalg.solve(X.T*X, X.T*y))

def GDMx(fJ, fdJ, theta, X, y, alpha=0.1, eps=10**-3, maxSteps=10000):
    """Batch gradient descent with a cost-change stopping rule.

    Parameters
    ----------
    fJ : callable ``fJ(theta, X, y)`` returning the scalar cost.
    fdJ : callable ``fdJ(theta, X, y)`` returning the gradient.
    theta : starting parameters.
    X, y : data passed through unchanged to ``fJ`` and ``fdJ``.
    alpha : step size.
    eps : stop once the absolute change of the cost between two consecutive
        steps is at most ``eps``.
    maxSteps : safety cap; the loop also stops once the step counter exceeds
        this value.

    Returns
    -------
    (theta, errors)
        ``theta`` is the final parameter value. ``errors`` is a list of
        ``[cost, theta]`` pairs: the starting point followed by one entry per
        update, so ``errors[-1]`` always describes the returned ``theta``.
    """
    errorCurr = fJ(theta, X, y)
    errors = [[errorCurr, theta]]
    step = 0
    while True:
        step += 1
        theta = theta - alpha * fdJ(theta, X, y)  # gradient-descent update rule
        errorPrev, errorCurr = errorCurr, fJ(theta, X, y)
        # Log before testing the stop conditions so the final iterate is part
        # of the history as well.
        errors.append([errorCurr, theta])
        if abs(errorPrev - errorCurr) <= eps or step > maxSteps:
            break
    return theta, errors

def ASGD(fJ, fdJ, theta, X, y, alpha=0.001, maxEpochs=1.0, batchSize=100, adaGrad=False, logError=False):
    """Mini-batch stochastic gradient descent with an optional AdaGrad step.

    Batches are consecutive row slices of ``X`` / ``y``. Whenever the slice
    window reaches the end of the data, the window restarts at row 0 and the
    rows of ``X`` and ``y`` are shuffled with a shared random permutation.
    The number of updates is ``int(m * maxEpochs / batchSize)``.

    With ``adaGrad=True`` each gradient component is divided by the square root
    of the running sum of its squares (plus ``10**-6``) before the update.

    ``logError`` is accepted but not used by the body: the cost ``fJ`` on the
    full data set is evaluated and recorded after every update regardless.

    Returns ``(theta, errors)`` where ``errors`` holds ``[cost, theta]`` pairs,
    the first one describing the starting point.
    """
    nRows, nCols = X.shape
    gradSqSum = np.matrix(np.zeros(nCols)).reshape(nCols, 1)
    lo, hi = 0, batchSize
    nSteps = int((nRows * float(maxEpochs)) / batchSize)
    errors = [[fJ(theta, X, y), theta]]
    for _ in range(nSteps):
        grad = fdJ(theta, X[lo:hi, :], y[lo:hi, :])
        if not adaGrad:
            theta = theta - alpha * grad
        else:
            gradSqSum += np.multiply(grad, grad)
            scaled = np.multiply(1.0 / np.sqrt(gradSqSum + 10**-6), grad)
            theta = theta - alpha * scaled
        nextLo = lo + batchSize
        if nextLo < nRows:
            lo = nextLo
        else:
            # End of the pass: rewind and reshuffle the rows for the next one.
            lo = 0
            order = np.random.permutation(nRows)
            X = X[order]
            y = y[order]
        hi = min(lo + batchSize, nRows)
        errors.append([fJ(theta, X, y), theta])
    return theta, errors

In [11]:
import csv
import matplotlib as mpl
from mpl_toolkits.mplot3d import Axes3D
import numpy as np
import matplotlib.pyplot as plt

%matplotlib inline

from IPython.display import display, Math, Latex
# If the text columns show up as garbled characters, the encoding passed to
# open() / the csv reader is the place to look.
def _read_tsv(path):
    """Return every row of a UTF-8, tab-separated file as a list of strings."""
    # The context manager closes the handle; newline='' is what the csv module
    # asks for when it is handed a file object.
    with open(path, 'r', encoding='utf-8', newline='') as handle:
        return list(csv.reader(handle, delimiter='\t'))

# Training set: price, rooms, area, floor, address, description.
all_features=[]
txtfeatures=[]
for cena,rooms,metr,pietro,adres,opis in _read_tsv("train.tsv"):
    all_features.append([float(cena),int(rooms),float(metr),int(pietro)])
    txtfeatures.append([adres,opis])

# Test inputs: the same columns without the price.
test_features=[]
test_txtfeatures=[]
for rooms,metr,pietro,adres,opis in _read_tsv("devin.tsv"):
    test_features.append([int(rooms),float(metr),int(pietro)])
    test_txtfeatures.append([adres,opis])

# Expected test prices: the first column of each row.
test_expected=[]
for cena in _read_tsv("devexpected.tsv"):
    test_expected.append(float(cena[0]))

# A disabled loader for an additional "gratka2.txt" data set used to live here
# as an inert string literal; it was never executed.

all_features=np.array(all_features)
txtfeatures=np.array(txtfeatures)
test_txtfeatures=np.array(test_txtfeatures)
test_features=np.array(test_features)


In [12]:
# DENOISING: keep only rows whose price, room count, area and floor all lie
# strictly within tnum standard deviations of the respective column mean.
# Means and standard deviations are taken once, on the unfiltered data.
ceny, lpokoi, metraz, pietro = (all_features[:, k] for k in range(4))
rawColumns = (ceny, lpokoi, metraz, pietro)

# MEANS
mcena, mrooms, mmeters, mfloor = (column.mean(axis=0) for column in rawColumns)

# STANDARD DEVIATIONS
stdcena, stdrooms, stdmeters, stdfloor = (column.std(axis=0) for column in rawColumns)

tnum=3

# Prepend the four numeric columns to the text columns. The combined array
# keeps the dtype of txtfeatures, which is why the numeric columns are
# converted back with astype(float) before every comparison below.
features_new = txtfeatures
for position, column in enumerate((ceny.astype(float), lpokoi, metraz, pietro)):
    features_new = np.insert(features_new, position, column, axis=1)

columnStats = ((mcena, stdcena), (mrooms, stdrooms), (mmeters, stdmeters), (mfloor, stdfloor))

# Upper bounds first, column by column ...
for position, (mean, std) in enumerate(columnStats):
    features_new = features_new[features_new[:, position].astype(float) < mean+tnum*std]

# ... then the lower bounds.
for position, (mean, std) in enumerate(columnStats):
    features_new = features_new[features_new[:, position].astype(float) > mean-tnum*std]

In [13]:
# Test design matrix: a column of ones followed by the three numeric test features.
test_matrix = np.matrix(test_features.astype(float))
test_expected = np.matrix(test_expected)
s1, s2 = test_matrix.shape
testOnes = np.ones((s1, 1))
XTx = np.matrix(np.concatenate((testOnes, test_matrix), axis=1))

# Training data: column 0 of the filtered table is the target (price); the
# remaining numeric columns, preceded by a column of ones, form the design matrix.
strening = np.matrix(features_new[:, :4].astype(float))
s1, s2 = strening.shape
trainOnes = np.ones((s1, 1))
XMx = np.matrix(np.concatenate((trainOnes, strening[:, 1:]), axis=1))
yMx = np.matrix(strening[:, 0])

In [14]:
# Closed-form fit on the base design matrix, shown together with its training cost.
thetaNorm = norm(XMx, yMx)
costNorm = JMx(thetaNorm, XMx, yMx)
display(Math(r'\Large \theta = ' + LatexMatrix(thetaNorm)))
display(Math(r'\Large J(\theta) = %.4f' % costNorm))

<IPython.core.display.Math object>

<IPython.core.display.Math object>

## Nowe parametry

In [15]:
# Design-matrix columns: 0 - ones, 1 - rooms, 2 - area, 3 - floor.
# Engineered columns appended here: 4 - rooms / floor, 5 - sqrt(area) / rooms.
def _with_ratio_features(design):
    """Append the two engineered ratio columns to a [ones, rooms, area, floor] matrix."""
    rooms, area, floor = design[:, 1], design[:, 2], design[:, 3]
    # NOTE(review): a floor or room count of 0 makes these ratios inf/nan -
    # confirm the data never contains it.
    return np.concatenate(
        (design, np.divide(rooms, floor), np.divide(np.sqrt(area), rooms)),
        axis=1,
    )

# XTx is built with the same leading column of ones as XMx, so both matrices
# share one column layout and must go through the same transformation;
# indexing the test matrix one column to the left would produce features
# that do not correspond to the training ones.
TXMx = _with_ratio_features(XMx)
TXTx = _with_ratio_features(XTx)

In [16]:
# ASGD reshuffles the rows with np.random after every pass; seed the generator
# so that a fresh run of the notebook reproduces the same numbers.
np.random.seed(0)
thetaStartMx = np.zeros(TXMx.shape[1]).reshape(TXMx.shape[1],1)
thetaBestMx1, errors1=ASGD(JMx, dJMx, thetaStartMx, TXMx, yMx, alpha=1, maxEpochs=15, batchSize=50, adaGrad=True, logError=False)
# errors1[0] describes the starting point, so the number of performed
# iterations is one less than the length of the history.
display(Math(r'\large\textrm{Wynik z Adagrad:}\quad \theta = ' + 
            LatexMatrix(thetaBestMx1) + 
             (r' \quad J(\theta) = %.4f' % errors1[-1][0])  
             + r' \quad \textrm{po %d iteracjach}' % (len(errors1) - 1)))

<IPython.core.display.Math object>

In [17]:
# Closed-form fit on columns 0-5 of the extended design matrix, with its training cost.
extendedCols = [0,1,2,3,4,5]
XExtended = TXMx[:, extendedCols]
thetaNorm = norm(XExtended, yMx)
display(Math(r'\Large \theta = ' + LatexMatrix(thetaNorm)))
display(Math(r'\Large J(\theta) = %.4f' % JMx(thetaNorm, XExtended, yMx)))



<IPython.core.display.Math object>

<IPython.core.display.Math object>

# TESTY


In [18]:
import re
def poldel(t):
    """Return ``t`` with Polish accented letters replaced by plain ASCII letters.

    Both the upper- and the lower-case accented forms map to the *lower-case*
    base letter (for example both forms of the accented "l" become ``l``).
    All other characters are left untouched.
    """
    groups = {
        'z': 'ŻŹźż',
        'a': 'Ąą',
        'c': 'Ćć',
        'e': 'Ęę',
        'l': 'Łł',
        'n': 'Ńń',
        'o': 'Óó',
        's': 'Śś',
    }
    table = {ord(accented): plain for plain, letters in groups.items() for accented in letters}
    return t.translate(table)
# First keyword group: place names (already lower-cased and stripped of Polish
# diacritics) mapped to a string label. The visible matching code only uses
# the keys. NOTE(review): the meaning of the label values is not evident from
# this notebook.
kat1 = {
    'podolany': '2',
    'umultowo': '3',
    'radojewo': '6',
    'morasko': '6',
    'strzeszyn': '6',
    # Spelled 'antoninek'; the previous key 'anotoninek' could never match a
    # correctly spelled place name.
    'antoninek': '2',
    'kiekrz': '2',
    'krzyzowniki': '2',
    'smochowice': '2',
    'szczepankowo': '2',
    'kwiatowe': '2',
    'fabianowo': '2',
    'kotowo': '2',
    'swierczewo': '2',
    'gluszyna': '2',
    'krzesiny': '2',
    'pokrzywno': '2',
    'garaszewo': '2',
    'splawie': '2',
    'krzesinki': '2',
    'zieliniec': '2',
    'kobylepole': '2',
}

# Second keyword group: place names (already lower-cased and stripped of Polish
# diacritics) mapped to a string label. The visible matching code only uses
# the keys. NOTE(review): the meaning of the label values is not evident from
# this notebook.
kat2 = {
    'marysienki': '8',
    'piatkowo': '7',
    'sobieskiego': '8',
    'ostrow tumski': '10',
    'naramowice': '1',
    'rataje': '6',
    'grunwald': '6',
    'winiary': '6',
    # 'chartowo' used to be listed twice ('2', then '3'); a dict literal keeps
    # only the last value, so the effective entry '3' is the one retained.
    'chartowo': '3',
    'warszawskie': '2',
    'pomet': '2',
    'maltanskie': '2',
    'zawady': '2',
    'srodka': '2',
    'glowna': '2',
    'winogrady': '7',
    'stare miasto': '9',
    'lazarz': '6',
    'staroleka': '0',
    'minikowo': '1',
    'debiec': '2',
    'wola': '2',
    # Spelled 'antoninek'; the previous key 'anotoninek' could never match a
    # correctly spelled place name.
    'antoninek': '2',
    'gorczyn': '3',
    'ogrody': '3',
    'wilda': '4',
    'lawica': '4',
    'solacz': '2',
    'zegrze': '4',
    'swierczewo': '6',
    'nowe miasto': '6',
}

def _keyword_flag(keywords, address, description):
    """Return 1 if any keyword (used as a regex) occurs in the description or the address, else 0."""
    return int(any(
        re.search(key, description) or re.search(key, address)
        for key in keywords
    ))

def _text_features(rows, addressCol, descriptionCol):
    """Derive the three text-based features for every row.

    Both text fields are lower-cased and stripped of Polish diacritics before
    matching. Returns ``(flags1, flags2, studio)``: two integer arrays telling
    whether any ``kat1`` / ``kat2`` keyword was found, and a list holding -1
    when the description contains "kawaler" and 2 otherwise.
    """
    flags1, flags2, studio = [], [], []
    for row in rows:
        address = poldel(row[addressCol].lower())
        description = poldel(row[descriptionCol].lower())
        flags1.append(_keyword_flag(kat1, address, description))
        flags2.append(_keyword_flag(kat2, address, description))
        studio.append(-1 if re.search("kawaler", description) else 2)
    return np.array(flags1), np.array(flags2), studio

# NEW FEATURES FOR THE TRAINING SET (address in column 4, description in column 5)
dzielkat1, dzielkat2, kawalerka1 = _text_features(features_new, 4, 5)

# NEW FEATURES FOR THE TEST SET (address in column 0, description in column 1)
dzielkat3, dzielkat4, kawalerka2 = _text_features(test_txtfeatures, 0, 1)

In [19]:

# Append the three text-derived columns at positions 6, 7 and 8 of the
# training and the test design matrices.
nTXMx = TXMx
for position, column in enumerate((dzielkat1, dzielkat2, kawalerka1), start=6):
    nTXMx = np.insert(nTXMx, position, column, axis=1)

nTXTx = TXTx
for position, column in enumerate((dzielkat3, dzielkat4, kawalerka2), start=6):
    nTXTx = np.insert(nTXTx, position, column, axis=1)


In [20]:
# Closed-form fit on columns 0-8 of the full training design matrix, with its training cost.
allCols = [0,1,2,3,4,5,6,7,8]
XAll = nTXMx[:, allCols]
thetaNorm = norm(XAll, yMx)
display(Math(r'\Large \theta = ' + LatexMatrix(thetaNorm)))
display(Math(r'\Large J(\theta) = %.4f' % JMx(thetaNorm, XAll, yMx)))

# Predict on the test design matrix and write one prediction per line.
result = hMx(thetaNorm, nTXTx)
with open ("expected.tsv","w") as f:
    for prediction in np.asarray(result)[:, 0]:
        print (prediction, file=f)

<IPython.core.display.Math object>

<IPython.core.display.Math object>