In [7]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import time
import os

In [8]:
df_cities = pd.read_csv('../input/cities.csv')
# Preview 10% of the dataframe. Sampling without replacement (the default)
# keeps every previewed row distinct, and the fixed random_state makes the
# displayed rows identical on every Restart & Run All.
df_cities.sample(frac=0.1, random_state=0)

Unnamed: 0,CityId,X,Y
274,274,1600.065726,153.160078
43122,43122,4748.169680,1791.062446
84675,84675,2852.917993,1708.800912
183328,183328,2770.071102,1577.182519
111391,111391,4435.063592,1977.188900
68379,68379,4335.457497,1215.880608
191273,191273,1595.510094,429.257945
85111,85111,394.234461,2661.257154
176635,176635,5021.183833,3086.613485
11966,11966,400.738398,1393.952284


In [9]:
# source = https://www.kaggle.com/seshadrikolluri/understanding-the-problem-and-some-sample-paths
# To improve performance, instead of checking whether each number is a prime,
# we first generate a list in which each element tells whether the number indicated
# by its position is a prime or not.

# using sieve of eratosthenes
# Start the wall-clock timer for this cell. The matching `end` timestamp is taken
# after the sieve has been built, so the measured span also covers the function definition.
start = time.time()
def sieve_of_eratosthenes(n):
    """Return a list of primality flags for the integers 0..n.

    Parameters
    ----------
    n : int
        Largest number to classify (inclusive).

    Returns
    -------
    list of bool
        A list of length ``n + 1`` in which element ``k`` is True exactly
        when ``k`` is prime. For ``n < 2`` there are no primes, so every
        flag is False (and the list is empty when ``n`` is negative).
    """
    if n < 2:
        # Neither 0 nor 1 is prime; returning early also avoids writing to
        # positions 0/1 of a list that is too short to hold them.
        return [False for _ in range(n + 1)]
    primes = [True for _ in range(n + 1)]  # Start assuming all numbers are primes
    primes[0] = False  # 0 is not a prime
    primes[1] = False  # 1 is not a prime
    for i in range(2, int(np.sqrt(n)) + 1):
        if primes[i]:
            # Multiples of i below i*i have a smaller prime factor and were
            # already crossed out, so marking can start at i*i.
            k = i
            while i * k <= n:
                primes[i * k] = False
                k += 1
    return primes
# Build the primality lookup for every id up to the largest CityId,
# then record the timestamp that closes the timing window.
prime_cities = sieve_of_eratosthenes(df_cities["CityId"].max())
end = time.time()

The data structure used in this algorithm is a list.<br>

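This function does not have a linear run time, as it contains a while loop nested in a for loop.<br>
A loose worst-case bound is O(n^2) (n elements visited n times); however, the for loop only runs up to sqrt(n) and the while loop only runs when i is prime, so the actual run time is O(n log log n).<br>
The list comprehension runs n + 1 times with 2 operations each (assignment, sequence access).<br>
2 primitive operations follow (the assignments to positions 0 and 1).<br>
The for loop runs about sqrt(n) times.<br>
The if statement runs once per for-loop iteration, with 1 primitive operation inside it (the assignment to k).<br>
The while loop runs at most about n/i times for each prime i, with 2 operations per iteration.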



In [10]:
# Algorithm run time: elapsed wall-clock seconds between the `start` and `end`
# timestamps. Both are plain globals that other cells reassign, so this prints
# the difference for whichever pair was set most recently.
print(end - start)

0.10069680213928223


In [None]:
#https://www.kaggle.com/seshadrikolluri/understanding-the-problem-and-some-sample-paths
# Start the wall-clock timer for this cell. The matching `end` timestamp is taken
# after the dumbest-path distance has been printed.
start = time.time()
def total_distance(dfcity, path, primes=None):
    """Return the penalised length of a route through the cities.

    Each step contributes the Euclidean distance between consecutive cities.
    Every 10th step is 10% longer unless the city the step starts from is
    flagged as prime.

    Parameters
    ----------
    dfcity : pandas.DataFrame
        Must expose ``X`` and ``Y`` columns that can be looked up by the
        values stored in ``path``.
    path : sequence of int
        City identifiers in visiting order.
    primes : sequence of bool, optional
        ``primes[c]`` is truthy when city ``c`` is prime. Defaults to the
        notebook-level ``prime_cities`` list, which keeps existing
        two-argument calls working.

    Returns
    -------
    number
        Total route length; 0 when the path has fewer than two cities.
    """
    if len(path) == 0:
        # Nothing to travel; avoids indexing path[0] on an empty route.
        return 0
    if primes is None:
        primes = prime_cities
    # Fetch the coordinate columns once instead of on every loop iteration.
    xs = dfcity.X
    ys = dfcity.Y
    prev_city = path[0]
    distance = 0
    step_num = 1
    for city_num in path[1:]:
        leg = np.sqrt(pow((xs[city_num] - xs[prev_city]), 2) + pow((ys[city_num] - ys[prev_city]), 2))
        # The penalty depends on the city the step departs from, not the destination.
        penalised = (step_num % 10 == 0) and not primes[prev_city]
        distance = distance + leg * (1 + 0.1 * int(penalised))
        prev_city = city_num
        step_num = step_num + 1
    return distance

# Visit every city in CityId order and finish back at city 0.
# Series.append was removed in pandas 2.0, so the closing stop is added
# with plain list concatenation instead.
dumbest_path = df_cities.CityId.tolist() + [0]
print('Total distance with the dumbest path is '+ "{:,}".format(total_distance(df_cities,dumbest_path)))
end = time.time()

The data structure used in this algorithm is a list.<br>

This function is linear in the length of the path; the most expensive operation in each iteration is the call to numpy's square root function.<br>
There are 4 primitive operations before the for loop (assignments and a sequence access).<br>
The for loop runs n times, once for each city after the first.<br>
Inside the loop, the numpy package is accessed for the square root, adding to the run time.<br>
From there, there are 2 more operations (updating the previous city and the step counter).<br>
After the for loop, there is an assignment and a sequence access.

In [None]:
# Algorithm run time: elapsed wall-clock seconds between the `start` and `end`
# timestamps. Both are plain globals that other cells reassign, so this prints
# the difference for whichever pair was set most recently.
print(end - start)