## Data Collecting and Processing

### Venues new Moscow subway stations

Data is collected as following:
1. I found the list of subway stations and their locations in Wikipedia
2. I used Foursquare API to get the most common venues of given borough of Moscow

In [1]:
from bs4 import BeautifulSoup
import requests
import pandas as pd

In [2]:
source = requests.get('https://en.wikipedia.org/wiki/List_of_Moscow_Metro_stations').text
soup=BeautifulSoup(source,'lxml')

In [3]:
soup_ = soup.find_all('tbody')
td_list = soup_[1].find_all('td')

In [4]:
tr_list=soup_[1].find_all('tr')
print (tr_list[1].get_text(' ').split('\n')[3].strip())
print (tr_list[1].get_text(' ').split('\n')[15].split('/')[0].split(' ')[1].strip())
print (tr_list[1].get_text(' ').split('\n')[15].split('/')[0].split(' ')[4].strip())

Bulvar Rokossovskogo
55°48′53″N
37°44′03″E


In [5]:
import re

In [6]:
metro=[]
for tr in tr_list[1:]:
    name = tr.get_text(' ').split('\n')[3].strip()
    #print (name)
    lat_ = tr.get_text(' ').split('\n')[15].split('/')[0].split(' ')[1].strip()
    #print (lat_)
    lon_ = tr.get_text(' ').split('\n')[15].split('/')[0].split(' ')[4].strip()
    #print (lon_)
    
    deg, minutes, seconds, direction =  re.split('[°\′″]', lat_)
    lat2=(float(deg) + float(minutes)/60 + float(seconds)/(60*60)) * (-1 if direction in ['W', 'S'] else 1)
    
    deg, minutes, seconds, direction =  re.split('[°\′″]', lon_)
    lon2=(float(deg) + float(minutes)/60 + float(seconds)/(60*60)) * (-1 if direction in ['W', 'S'] else 1)
    
   
    metro.append([name, lat_, lon_, lat2, lon2])
    

In [7]:
df= pd.DataFrame(metro,columns=['metro', 'lat','lon','lat2','lon2'])

In [8]:
df.head()

Unnamed: 0,metro,lat,lon,lat2,lon2
0,Bulvar Rokossovskogo,55°48′53″N,37°44′03″E,55.814722,37.734167
1,Cherkizovskaya,55°48′14″N,37°44′41″E,55.803889,37.744722
2,Preobrazhenskaya Ploshchad,55°47′47″N,37°42′54″E,55.796389,37.715
3,Sokolniki,55°47′20″N,37°40′49″E,55.788889,37.680278
4,Krasnoselskaya,55°46′48″N,37°40′02″E,55.78,37.667222


In [9]:
df.shape

(261, 5)

In [10]:
latitude=df['lat2'][0]
longitude = df['lon2'][0] 

In [11]:
import folium

In [12]:
map_metro = folium.Map(location=[latitude, longitude], zoom_start=10)

# add markers to map
for lat, lng, metro in zip(df['lat2'], df['lon2'], df['metro']):
    label = '{}'.format(metro)
    label = folium.Popup(label, parse_html=True)
    folium.CircleMarker(
        [lat, lng],
        radius=5,
        popup=label,
        color='blue',
        fill=True,
        fill_color='#3186cc',
        fill_opacity=0.7,
        parse_html=False).add_to(map_metro)  
    
map_metro

## Data Analysis

In [13]:
CLIENT_ID = 'Z25GUICIFLP0Q3Y4SCOWGQI13UMLKHAAGO5E3QCAJPWE41QE' # your Foursquare ID
CLIENT_SECRET = 'O4ELSZ33RPCYJS3DGNDYAWTVA3QM5RP3ZGHA2JDALWZCKSH2' # your Foursquare Secret
VERSION = '20180605' # Foursquare API version
LIMIT =30

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)


Your credentails:
CLIENT_ID: Z25GUICIFLP0Q3Y4SCOWGQI13UMLKHAAGO5E3QCAJPWE41QE
CLIENT_SECRET:O4ELSZ33RPCYJS3DGNDYAWTVA3QM5RP3ZGHA2JDALWZCKSH2


In [14]:
import requests # library to handle requests
from pandas.io.json import json_normalize

In [15]:
def getNearbyVenues(names, latitudes, longitudes, radius=750):
    
    venues_list=[]
    for name, lat, lng in zip(names, latitudes, longitudes):
        print(name)
            
        # create the API request URL
        url = 'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)
            
        # make the GET request
        results = requests.get(url).json()["response"]['groups'][0]['items']
        
        # return only relevant information for each nearby venue
        venues_list.append([(
            name, 
            lat, 
            lng, 
            v['venue']['name'], 
            v['venue']['location']['lat'], 
            v['venue']['location']['lng'],  
            v['venue']['categories'][0]['name'],
            v['reasons']['items'][0]['summary']) for v in results])

    nearby_venues = pd.DataFrame([item for venue_list in venues_list for item in venue_list])
    nearby_venues.columns = ['Neighborhood', 
                  'Neighborhood Latitude', 
                  'Neighborhood Longitude', 
                  'Venue', 
                  'Venue Latitude', 
                  'Venue Longitude', 
                  'Venue Category',
                  'Summary']
    
    return(nearby_venues)

In [16]:
moscow_venues = getNearbyVenues(names=df['metro'],
                                   latitudes=df['lat2'],
                                   longitudes=df['lon2'])

Bulvar Rokossovskogo
Cherkizovskaya
Preobrazhenskaya Ploshchad
Sokolniki
Krasnoselskaya
Komsomolskaya
Krasnye Vorota
Chistyye Prudy
Lubyanka
Okhotny Ryad
Biblioteka Imeni Lenina
Kropotkinskaya
Park Kultury
Frunzenskaya
Sportivnaya
Vorobyovy Gory
Universitet
Prospekt Vernadskogo
Yugo-Zapadnaya
Troparyovo
Rumyantsevo
Salaryevo
Khovrino
Belomorskaya
Rechnoy Vokzal
Vodny Stadion
Voykovskaya
Sokol
Begovaya
Dinamo
Belorusskaya
Mayakovskaya
Tverskaya
Teatralnaya
Novokuznetskaya
Paveletskaya
Avtozavodskaya
Tekhnopark
Kolomenskaya
Kashirskaya
Kantemirovskaya
Tsaritsyno
Orekhovo
Domodedovskaya
Krasnogvardeyskaya
Begovaya
Pyatnitskoye Shosse
Mitino
Volokolamskaya
Myakinino
Strogino
Krylatskoye
Molodyozhnaya
Kuntsevskaya
Slavyansky Bulvar
Park Pobedy
Kiyevskaya
Smolenskaya
Arbatskaya
Ploshchad Revolyutsii
Kurskaya
Begovaya
Elektrozavodskaya
Semyonovskaya
Partizanskaya
Izmaylovskaya
Pervomayskaya
Shchyolkovskaya
Kuntsevskaya
Pionerskaya
Filyovsky Park
Bagrationovskaya
Fili
Kutuzovskaya
Studencheska

In [17]:
print(moscow_venues.shape)
moscow_venues.head()

(7144, 8)


Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Summary
0,Bulvar Rokossovskogo,55.814722,37.734167,Подружка,55.814523,37.736781,Cosmetics Shop,This spot is popular
1,Bulvar Rokossovskogo,55.814722,37.734167,Burger King,55.814026,37.733659,Fast Food Restaurant,This spot is popular
2,Bulvar Rokossovskogo,55.814722,37.734167,Fitlab studio,55.810845,37.727964,Gym / Fitness Center,This spot is popular
3,Bulvar Rokossovskogo,55.814722,37.734167,Галерея Богородское,55.813944,37.734298,Art Gallery,This spot is popular
4,Bulvar Rokossovskogo,55.814722,37.734167,Ветеринарная Аптека Ветлек,55.813464,37.735036,Pet Store,This spot is popular


In [18]:
print('There are {} uniques categories.'.format(len(moscow_venues['Venue Category'].unique())))

There are 359 uniques categories.


In [19]:
moscow_venues['Venue Category'].unique()

array(['Cosmetics Shop', 'Fast Food Restaurant', 'Gym / Fitness Center',
       'Art Gallery', 'Pet Store', 'Vietnamese Restaurant',
       'Auto Workshop', 'Candy Store', 'Mobile Phone Shop', 'Park',
       'Café', 'Butcher', 'Tex-Mex Restaurant', 'Sushi Restaurant',
       'Asian Restaurant', 'Shoe Store', 'Coffee Shop', 'Bookstore',
       'Bus Line', 'Gift Shop', 'Dance Studio', 'Supermarket',
       'Convenience Store', 'Bus Stop', 'Tennis Stadium', 'Museum',
       'Arcade', 'Soccer Stadium', 'Sports Bar', 'Sporting Goods Shop',
       'Martial Arts Dojo', 'Flower Shop', 'Soccer Field', 'Hockey Arena',
       'Performing Arts Venue', 'Hookah Bar', 'Arts & Crafts Store',
       'Hotel', 'Trail', 'Pizza Place', 'Hobby Shop', 'Stadium',
       'Photography Lab', 'Playground', 'Health Food Store', 'Gym',
       'Farmers Market', 'Boxing Gym', 'Salon / Barbershop', 'Pharmacy',
       'Sandwich Place', 'Eastern European Restaurant', 'Burger Joint',
       'Breakfast Spot', 'Steakhouse'

In [20]:
moscow_venues['Venue Category'].value_counts().head()

Coffee Shop             347
Gym / Fitness Center    213
Cosmetics Shop          198
Park                    190
Supermarket             170
Name: Venue Category, dtype: int64

In [21]:
moscow_venues[moscow_venues['Venue Category']=='Coffee Shop'].head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Summary
16,Bulvar Rokossovskogo,55.814722,37.734167,Культ Кофе,55.813573,37.734943,Coffee Shop,This spot is popular
60,Preobrazhenskaya Ploshchad,55.796389,37.715,Starbucks,55.795299,37.712556,Coffee Shop,This spot is popular
62,Preobrazhenskaya Ploshchad,55.796389,37.715,Coffee Port,55.795361,37.712711,Coffee Shop,This spot is popular
80,Preobrazhenskaya Ploshchad,55.796389,37.715,Правда Кофе,55.795795,37.712796,Coffee Shop,This spot is popular
116,Krasnoselskaya,55.78,37.667222,Шоколадница,55.77972,37.666591,Coffee Shop,This spot is popular


In [22]:
moscow_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category,Summary
0,Bulvar Rokossovskogo,55.814722,37.734167,Подружка,55.814523,37.736781,Cosmetics Shop,This spot is popular
1,Bulvar Rokossovskogo,55.814722,37.734167,Burger King,55.814026,37.733659,Fast Food Restaurant,This spot is popular
2,Bulvar Rokossovskogo,55.814722,37.734167,Fitlab studio,55.810845,37.727964,Gym / Fitness Center,This spot is popular
3,Bulvar Rokossovskogo,55.814722,37.734167,Галерея Богородское,55.813944,37.734298,Art Gallery,This spot is popular
4,Bulvar Rokossovskogo,55.814722,37.734167,Ветеринарная Аптека Ветлек,55.813464,37.735036,Pet Store,This spot is popular


In [23]:
moscow_venues=moscow_venues.drop(columns=['Summary'])

In [24]:
moscow_venues.head()

Unnamed: 0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
0,Bulvar Rokossovskogo,55.814722,37.734167,Подружка,55.814523,37.736781,Cosmetics Shop
1,Bulvar Rokossovskogo,55.814722,37.734167,Burger King,55.814026,37.733659,Fast Food Restaurant
2,Bulvar Rokossovskogo,55.814722,37.734167,Fitlab studio,55.810845,37.727964,Gym / Fitness Center
3,Bulvar Rokossovskogo,55.814722,37.734167,Галерея Богородское,55.813944,37.734298,Art Gallery
4,Bulvar Rokossovskogo,55.814722,37.734167,Ветеринарная Аптека Ветлек,55.813464,37.735036,Pet Store


In [25]:
import csv

In [26]:
#moscow_venues.to_csv(r'C:\Users\Anton\Dropbox\Sargy\#Education\#COURSERA\IBM\moscow_venues_1.csv')

In [27]:
moscow_data = pd.read_csv(r'C:\Users\Anton\Dropbox\Sargy\#Education\#COURSERA\IBM\moscow_venues_1.csv', index_col='Index_Col')

In [28]:
moscow_data.head()

Unnamed: 0_level_0,Neighborhood,Neighborhood Latitude,Neighborhood Longitude,Venue,Venue Latitude,Venue Longitude,Venue Category
Index_Col,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,Bulvar Rokossovskogo,55.814722,37.734167,Подружка,55.814523,37.736781,Cosmetics Shop
1,Bulvar Rokossovskogo,55.814722,37.734167,Burger King,55.814026,37.733659,Fast Food Restaurant
2,Bulvar Rokossovskogo,55.814722,37.734167,Галерея Богородское,55.813944,37.734298,Art Gallery
3,Bulvar Rokossovskogo,55.814722,37.734167,Fitlab studio,55.810845,37.727964,Gym / Fitness Center
4,Bulvar Rokossovskogo,55.814722,37.734167,Ветеринарная Аптека Ветлек,55.813464,37.735036,Pet Store


#### Further steps in week 5
* Get the most common venues per neighborhood
* Cluster nbeighborhoods using K-mean algorithm
* Visualize and describe cluster
* Findings and conclusion
* References