In [37]:
from random import random,randint
import math
def wineprice(rating,age):
    
    peak_age=rating-50
    
    # Calculate price based on rating
    price=rating/2
    if age>peak_age:
        
        # Past its peak, goes bad in 5 years
        price=price*(5-(age-peak_age))
    else:
        
        # Increases to 5x original value as it
        # approaches its peak
        price=price*(5*((age+1)/peak_age))
    if price<0: price=0
    return price

In [38]:
def wineset1( ):
    rows=[]
    for i in range(300):
        # Create a random age and rating
        rating=random( )*50+50
        age=random( )*50
        # Get reference price
        price=wineprice(rating,age)
        # Add some noise
        price*=(random( )*0.4+0.8)
        # Add to the dataset
        rows.append({'input':(rating,age),'result':price})
    return rows

In [39]:
wineprice(95.0,3.0)

21.111111111111114

In [40]:
wineprice(95.0,8.0)

47.5

In [41]:
wineprice(99.0,1.0)

10.102040816326529

In [42]:
data=wineset1( )

In [43]:
data[0]

{'input': (93.20622143767764, 0.6986392498119998),
 'result': 10.896157894845828}

In [44]:
def euclidean(v1,v2):
    d=0.0
    for i in range(len(v1)):
        d+=(v1[i]-v2[i])**2
    return math.sqrt(d)

In [45]:
euclidean(data[0]['input'],data[1]['input'])

48.87881646180093

In [46]:
def getdistances(data,vec1):
    distancelist=[]
    for i in range(len(data)):
        vec2=data[i]['input']
        distancelist.append((euclidean(vec1,vec2),i))
    distancelist.sort( )
    return distancelist

In [47]:
def knnestimate(data,vec1,k=3):
    # Get sorted distances
    dlist=getdistances(data,vec1)
    avg=0.0
    # Take the average of the top k results
    for i in range(k):
        idx=dlist[i][1]
        avg+=data[idx]['result']
    avg=avg/k
    return avg

In [48]:
knnestimate(data,(95.0,3.0))

19.331261836651183

In [49]:
def inverseweight(dist,num=1.0,const=0.1):
    return num/(dist+const)

In [50]:
def subtractweight(dist,const=1.0):
    if dist>const:
        return 0
    else:
        return const-dist

In [51]:
def gaussian(dist,sigma=1.0):
    return math.e**(-dist**2/(2*sigma**2))

In [52]:
gaussian(3.0)

0.01110899653824231

In [53]:
def weightedknn(data,vec1,k=5,weightf=gaussian):
    
    # Get distances
    dlist=getdistances(data,vec1)
    avg=0.0
    totalweight=0.0
    
    # Get weighted average
    for i in range(k):
        dist=dlist[i][0]
        idx=dlist[i][1]
        weight=weightf(dist)
        avg+=weight*data[idx]['result']
        totalweight+=weight
    avg=avg/totalweight
    return avg

In [54]:
weightedknn(data,(99.0,5.0))

24.572437208431477

In [55]:
def dividedata(data,test=0.05):
    trainset=[]
    testset=[]
    for row in data:
        if random()<test:
            testset.append(row)
        else:
            trainset.append(row)
    return trainset,testset

In [56]:
def testalgorithm(algf,trainset,testset):
    error=0.0
    for row in testset:
        guess=algf(trainset,row['input'])
        error+=(row['result']-guess)**2
    return error/len(testset)

In [57]:
def crossvalidate(algf,data,trials=100,test=0.05):
    error=0.0
    for i in range(trials):
        trainset,testset=dividedata(data,test)
        error+=testalgorithm(algf,trainset,testset)
    return error/trials

In [58]:
crossvalidate(knnestimate,data)

523.7454039769893

In [59]:
def knn3(d,v): return knnestimate(d,v,k=3)
crossvalidate(knn3,data)

604.702049554983

In [60]:
def knn1(d,v): return knnestimate(d,v,k=1)
crossvalidate(knn1,data)

914.097551697671

In [61]:
crossvalidate(weightedknn,data)

682.9692145529377

In [62]:
def knninverse(d,v):
    return weightedknn(d,v,weightf=inverseweight)
crossvalidate(knninverse,data)

581.068739644397

In [63]:
def wineset2( ):
    rows=[]
    for i in range(300):
        # Create a random age and rating
        rating=random( )*50+50
        age=random( )*50
        aisle=float(randint(1,20))
        bottlesize=[375.0,750.0,1500.0,3000.0][randint(0,3)]
        # Get reference price
        price=wineprice(rating,age)
        # Add some noise
        price*=(bottlesize/750)
        price*=(random( )*0.4+0.8)
        # Add to the dataset
        rows.append({'input':(rating,age,aisle,bottlesize),'result':price})
    return rows

In [64]:
data=wineset2( )

In [65]:
crossvalidate(weightedknn,data)

10331.532952451158

In [66]:
crossvalidate(weightedknn,data)

10414.827446599724

In [67]:
def rescale(data,scale):
    scaleddata=[]
    for row in data:
        scaled=[scale[i]*row['input'][i] for i in range(len(scale))]
        scaleddata.append({'input':scaled,'result':row['result']})
    return scaleddata

In [69]:
sdata=rescale(data,[10,10,0,0.5])
crossvalidate(knn3,sdata)

5812.117322807933

In [70]:
crossvalidate(weightedknn,sdata)

ZeroDivisionError: float division by zero

In [71]:
def createcostfunction(algf,data):
    def costf(scale):
        sdata=rescale(data,scale)
        return crossvalidate(algf,sdata,trials=10)
    return costf

In [72]:
weightdomain=[(0,20)]*4

In [73]:
import random as ran
def annealingoptimize(domain,costf,T=10000.0,cool=0.95,step=1):
    
    # Initialize the values randomly
    vec=[float(ran.randint(domain[i][0],domain[i][1])) for i in range(len(domain))]
    
    while T>0.1:

        # Choose one of the indices
        i=ran.randint(0,len(domain)-1)

        # Choose a direction to change it
        dir=ran.randint(-step,step)

        # Create a new list with one of the values changed
        vecb=vec[:]
        vecb[i]+=dir
        if vecb[i]<domain[i][0]: vecb[i]=domain[i][0]
        elif vecb[i]>domain[i][1]: vecb[i]=domain[i][1]

        # Calculate the current cost and the new cost 
        ea=costf(vec)
        eb=costf(vecb)
        p=pow(math.e,(-eb-ea)/T)

        # Is it better, or does it make the probability # cutoff?
        if (eb<ea or ran.random()<p):
            vec=vecb

        # Decrease the temperature
        T=T*cool
    
    vec = [int(n) for n in vec]
    return vec

In [74]:
costf=createcostfunction(knnestimate,data)
annealingoptimize(weightdomain,costf,step=2)

[19, 7, 0, 8]

In [75]:
def wineset3( ):
    rows=wineset1( )
    for row in rows:
        if random( )<0.5:
            # Wine was bought at a discount store
            row['result']*=0.6
    return rows

In [76]:
data=wineset3( )

In [77]:
wineprice(99.0,20.0)

106.07142857142857

In [79]:
weightedknn(data,[99.0,20.0])

90.41713433451625

In [80]:
crossvalidate(weightedknn,data)

777.5550458181597

In [81]:
def probguess(data,vec1,low,high,k=5,weightf=gaussian):
    dlist=getdistances(data,vec1)
    nweight=0.0
    tweight=0.0
    
    for i in range(k):
        dist=dlist[i][0]
        idx=dlist[i][1]
        weight=weightf(dist)
        v=data[idx]['result']
        
        # Is this point in the range?
        if v>=low and v<=high:
            nweight+=weight
        tweight+=weight
    if tweight==0: return 0
    
    # The probability is the weights in the range
    # divided by all the weights
    return nweight/tweight

In [82]:
probguess(data,[99,20],40,80)

0.03681970822169888

In [83]:
probguess(data,[99,20],80,120)

0.9631802917783011

In [84]:
probguess(data,[99,20],120,1000)

0.0

In [85]:
probguess(data,[99,20],30,120)

1.0

In [86]:
def cumulativegraph(data,vec1,high,k=5,weightf=gaussian):
    t1=arange(0.0,high,0.1)
    cprob=array([probguess(data,vec1,0,v,k,weightf) for v in t1])
    plot(t1,cprob)
    show( )

In [89]:
cumulativegraph(data,(1,1),6)

In [91]:
def probabilitygraph(data,vec1,high,k=5,weightf=gaussian,ss=5.0):
    
    # Make a range for the prices
    t1=arange(0.0,high,0.1)

    # Get the probabilities for the entire range
    probs=[probguess(data,vec1,v,v+0.1,k,weightf) for v in t1]

    # Smooth them by adding the gaussian of the nearby probabilites
    smoothed=[]
    for i in range(len(probs)):
        sv=0.0
        for j in range(0,len(probs)):
            dist=abs(i-j)*0.1
            weight=gaussian(dist,sigma=ss)
            sv+=weight*probs[j]
        smoothed.append(sv)
    smoothed=array(smoothed)
    plot(t1,smoothed)
    show( )

In [92]:
probabilitygraph(data,(1,1),6)