# ** Rec a Business **

This project aims to provide new and upcoming business enthusiasts better options to leverage their capital in. This project uses DelhiNeighborhood dataset incorporated into kaggle by Kumar Shaswat, under the license CC BY-NC-SA 4.0. 

Importing dependencies

In [126]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib as plt
import seaborn as sns
import matplotlib.cm as cm
import matplotlib.colors as colors
import json
import requests
import ipywidgets as widgets
from pandas.io.json import json_normalize
from geopy.distance import great_circle

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 5GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/neighborhood/Sorted Neighborhood dataset.csv
/kaggle/input/delhi-neighborhood-data/delhi_dataSet.csv
/kaggle/input/delhi-neighborhood-data/restaurant_dataSet.csv


## Setting up the background data

Reading the data

In [2]:
df_delhi = pd.read_csv('/kaggle/input/delhi-neighborhood-data/delhi_dataSet.csv',index_col = 'Unnamed: 0')

In [3]:
df_delhi.head()

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Adarsh Nagar,28.614192,77.071541
1,North West Delhi,Ashok Vihar,28.699453,77.184826
2,North West Delhi,Azadpur,28.707657,77.175547
3,North West Delhi,Bawana,28.79966,77.032885
4,North West Delhi,Begum Pur,,


Exploring the data to find missing values

In [4]:
df_delhi.shape

(185, 4)

In [5]:
df_delhi.isnull().sum()

Borough          0
Neighborhood     0
latitude        22
longitude       22
dtype: int64

There are 22 neighborhoods without latitude and longitude.

--------

Segregating the neighborhood without latitude and longitude data into a new dataframe and the rest into another dataframe

In [6]:
df_present = df_delhi.where(df_delhi['latitude'].isnull() == False)
df_present.dropna(subset = ['Borough','Neighborhood'],inplace=True)
df_present.reset_index(inplace=True)
df_present.drop(['index'], axis=1,inplace=True)
df_present.head()

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Adarsh Nagar,28.614192,77.071541
1,North West Delhi,Ashok Vihar,28.699453,77.184826
2,North West Delhi,Azadpur,28.707657,77.175547
3,North West Delhi,Bawana,28.79966,77.032885
4,North West Delhi,Dhaka,39.031714,-90.261223


In [7]:
df_missing = df_delhi.where(df_delhi['latitude'].isnull() == True)
df_missing.dropna(subset = ['Borough','Neighborhood'],inplace=True)
df_missing.reset_index(inplace=True)
df_missing.drop(['index'], axis=1,inplace=True)
df_missing.head()

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Begum Pur,,
1,North West Delhi,Rohini Sub City,,
2,North Delhi,Ghantewala,,
3,North Delhi,Gulabi Bagh,,
4,North Delhi,Sadar Bazaar,,


There is a unwanted symbol set ( [1] ) in the 15th row. 
Cleaning the data

In [8]:
df_missing['Neighborhood'].replace('Sundar Nagar[1]' ,'Sundar Nagar',inplace=True)
df_missing.head()

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Begum Pur,,
1,North West Delhi,Rohini Sub City,,
2,North Delhi,Ghantewala,,
3,North Delhi,Gulabi Bagh,,
4,North Delhi,Sadar Bazaar,,


Manually using google maps and Open Street Maps to find the latitude and longitude of the places.
This process is being done manually because geopy doesn't do very well with locations of India. There are instances of incorrect data, so cleaning them is also required

In [9]:
# defining series lat and lng for assigning the columns
lat = pd.Series([],dtype=float)
lng = pd.Series([],dtype=float)

#Assigning the data

#Bugum Pur
lat[0], lng[0] = 28.727248, 77.064975 

#Rohini Sub City
lat[1],lng[1] = 28.741073, 77.082574

#Ghantewala is a famous sweet shop, that got closed, so I am dropping this data
df_missing.drop([2],inplace=True)

#Gulabi Bagh
lat[3],lng[3] = 28.672190, 77.191620

#Sadar Bazaar 
lat[4],lng[4] = 28.659395, 77.212782

#Tees Hazari
lat[5],lng[5] = 28.665682, 77.216413

#New Usmanpur
lat[6],lng[6] = 28.677737, 77.256637

#Sadatpur
lat[7],lng[7] = 28.726746, 77.261097

#Rajender Nagar
lat[8],lng[8] = 28.641024, 77.185038

#Sadar Bazaar
lat[9],lng[9] = 28.657305, 77.212750

#Laxmibai Nagar
lat[10],lng[10] = 28.575276, 77.209630

#Silampur
lat[11],lng[11] = 28.664181, 77.270916

#Jamroodpur Village
lat[12],lng[12] = 28.557592, 77.237061

#Kotla Mubarakpur
lat[13],lng[13] = 28.575783, 77.227396

#Pulpehaladpur
lat[14],lng[14] = 28.499831, 77.290347

#Sundar Nagar
lat[15],lng[15] = 28.601985, 77.243725

#Dabri
lat[16],lng[16] = 28.611823, 77.087268

#Dwarka Sub City
lat[17],lng[17] = 28.582154, 77.049576

#Sagar Pur
lat[18],lng[18] = 28.605670, 77.099189

#Partap Nagar
df_missing.drop([19],inplace=True)

#Tihar Village
lat[20],lng[20] = 28.634353, 77.107331

#Uttam Nagar
lat[21],lng[21] = 28.619573, 77.054916

In [10]:
df_missing['latitude_mod'] = lat
df_missing['longitude_mod'] = lng

Arranging the data

In [11]:
df_missing.reset_index(inplace=True)
df_missing.drop(columns=['latitude', 'longitude','index'],inplace=True)
df_missing.rename(columns={'latitude_mod' : 'latitude' , 'longitude_mod' : 'longitude' }, inplace=True)
df_missing.head()

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Begum Pur,28.727248,77.064975
1,North West Delhi,Rohini Sub City,28.741073,77.082574
2,North Delhi,Gulabi Bagh,28.67219,77.19162
3,North Delhi,Sadar Bazaar,28.659395,77.212782
4,North Delhi,Tees Hazari,28.665682,77.216413


Combining both the dataframes into a single dataframe.

In [12]:
frames = [df_missing,df_present]
df = pd.concat(frames)
df.head()

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Begum Pur,28.727248,77.064975
1,North West Delhi,Rohini Sub City,28.741073,77.082574
2,North Delhi,Gulabi Bagh,28.67219,77.19162
3,North Delhi,Sadar Bazaar,28.659395,77.212782
4,North Delhi,Tees Hazari,28.665682,77.216413


The dataframe still has some wrong latitude and longitudes. Dropping them

In [13]:
df = df[df['latitude'] < 30]

In [14]:
df.reset_index(inplace=True)
df.drop(['index'],axis=1,inplace=True)

In [15]:
df

Unnamed: 0,Borough,Neighborhood,latitude,longitude
0,North West Delhi,Begum Pur,28.727248,77.064975
1,North West Delhi,Rohini Sub City,28.741073,77.082574
2,North Delhi,Gulabi Bagh,28.672190,77.191620
3,North Delhi,Sadar Bazaar,28.659395,77.212782
4,North Delhi,Tees Hazari,28.665682,77.216413
...,...,...,...,...
168,West Delhi,Rajouri Garden,28.642152,77.116060
169,West Delhi,Shivaji Place,28.651657,77.121703
170,West Delhi,Tilak Nagar,28.639650,77.094039
171,West Delhi,Vikas Nagar,28.644009,77.054470


Verfying the data integrity of the final dataframe

In [16]:
df.shape
#the dataframe has 173 rows with 4 columns

(173, 4)

In [17]:
df.columns
#the column names are in order

Index(['Borough', 'Neighborhood', 'latitude', 'longitude'], dtype='object')

In [18]:
df.isnull().sum()
#there are no missing values

Borough         0
Neighborhood    0
latitude        0
longitude       0
dtype: int64

** We are done with the data cleaning, wrangling and preliminary Exploratory Data Analysis. Moving on the Foursquare API usage **

-----------

## Using Foursquare API

Defining Foursquare API credentials

In [19]:
CLIENT_ID = 'HSKSPYIMS3JWAL4IJ3IWQT1MOLT04ITR1UTIPF3OYZ220JXF' # Foursquare ID
CLIENT_SECRET = 'YS1P0XFBMDIERG42KRRFGQHG4AYWYC4LBNIKTWLFXBLEKDKN' # Foursquare Secret
VERSION = '20200515' # Foursquare API version

print('Your credentails:')
print('CLIENT_ID: ' + CLIENT_ID)
print('CLIENT_SECRET:' + CLIENT_SECRET)

Your credentails:
CLIENT_ID: HSKSPYIMS3JWAL4IJ3IWQT1MOLT04ITR1UTIPF3OYZ220JXF
CLIENT_SECRET:YS1P0XFBMDIERG42KRRFGQHG4AYWYC4LBNIKTWLFXBLEKDKN


Defining a function the get the category of the venue

In [20]:
def get_category_type(row):
    try:
        categories_list = row['categories']
    except:
        categories_list = row['venue.categories']
        
    if len(categories_list) == 0:
        return None
    else:
        return categories_list[0]['name']

Iterating through the data to get the set of all venue categories and storing it in a new dataframe

In [21]:
df_venues = pd.DataFrame(columns = ['Venue Category'])

In [22]:
for j in range(0,183):
    
    try:
        lat,lng = df.iloc[j][2] , df.iloc[j][3]
        radius = 1000
        LIMIT = 50
        url =  'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            lat, 
            lng, 
            radius, 
            LIMIT)

        results = requests.get(url).json()
        venues = results['response']['groups'][0]['items']
        nearby_venues = pd.json_normalize(venues)
        filtered_columns = ['venue.name', 'venue.categories']
        nearby_venues =nearby_venues.loc[:, filtered_columns]
        nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
        nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
        
        l = nearby_venues.shape[0]    
        for i in range(0,l):
            df_venues = df_venues.append({'Venue Category' : nearby_venues.iloc[i][1]} , 
                                           ignore_index=True)
    
    except:
        
        continue



In [23]:
df_venues['frequency'] = 1
df_venues = df_venues.groupby('Venue Category').count()

In [24]:
df_venues.head()

Unnamed: 0_level_0,frequency
Venue Category,Unnamed: 1_level_1
ATM,38
Accessories Store,4
Afghan Restaurant,1
Airport,3
Airport Terminal,2


Manually dropping rows that are not business and cannot be independently set up

In [25]:
df_venues.drop(['ATM','Airport','Airport Terminal','Bus Station','Campground','College Cafeteria','Cricket Ground',
                'Farmers Market','Historic Site','History Museum','IT Services','Lake','Light Rail Station',
                'Molecular Gastronomy Restaurant','Museum','Park','Pool','Road',
                'Stadium','Temple','Tourist Information Center','Trail','Train Station','University'], axis = 0,inplace = True)

In [26]:
df_venues.drop(['Metro Station','Other Great Outdoors','Nightlife Spot','Farm','Track','Astrologer','Zoo'], axis= 0,inplace= True)

-------------

Sorting the dataset to get the top venues

In [27]:
df_venues.sort_values(['frequency'],axis=0,ascending = False,inplace=True)
df_venues.reset_index(inplace=True)
df_venues.head(10)

Unnamed: 0,Venue Category,frequency
0,Indian Restaurant,318
1,Café,189
2,Hotel,168
3,Coffee Shop,135
4,Fast Food Restaurant,133
5,Pizza Place,92
6,Restaurant,71
7,Market,66
8,Chinese Restaurant,66
9,Bar,60


In [28]:
#Merging Cafe and Coffee Shop into one and re-sorting the data

df_venues.replace({'Café' : 'Coffee Shop' , 189 : (189+135)} , inplace = True)
df_venues.drop([3],inplace=True)
df_venues.sort_values(['frequency'],axis=0,ascending = False,inplace=True)
df_venues.reset_index(inplace=True)
df_venues.drop(['index'],axis=1,inplace = True)
df_venues

Unnamed: 0,Venue Category,frequency
0,Coffee Shop,324
1,Indian Restaurant,318
2,Hotel,168
3,Fast Food Restaurant,133
4,Pizza Place,92
...,...,...
175,Bagel Shop,1
176,Video Game Store,1
177,Art Museum,1
178,Basketball Court,1


Verifying the data integrity

In [29]:
df_venues.shape
# The dataframe has 180 rows with 2 columns

(180, 2)

In [30]:
df_venues.isnull().sum().sum()
# The dataframe contains no missing values

0

## Setting up the UI/UX 

------------------------------------------------------------------

### Recommend a business according to the place of choice

Please select your Borough from the respective drop down list - 

In [59]:
bor = 'North West Delhi'
drop_down = widgets.Dropdown(options=df['Borough'].unique(),
                                description='Borough',
                                disabled=False)

def dropdown_handler(change):
    global bor
    bor = change.new

drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Borough', options=('North West Delhi', 'North Delhi', 'North East Delhi', 'Central Delhi…

Please select the Neighborhood from the drop down

In [113]:
opt = []
for i in range(0,173):
    if (df.iloc[i][0] == bor):
        opt.append(df.iloc[i][1])

ngbor = ''
drop_down = widgets.Dropdown(options=opt,
                                description='Neighborhood',
                                disabled=False)

def dropdown_handler(change):
    global ngbor
    ngbor = change.new

drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Neighborhood', options=('Rajender Nagar', 'Sadar Bazaar', 'Chandni Chowk', 'Daryaganj', …

getting the latitude and longitude for the selected neighborhood

In [114]:
for i in range(0,df.shape[0]):
    if (df.iloc[i][0] == bor and df.iloc[i][1] == ngbor):
        ven_lat = df.iloc[i][2]
        ven_lng = df.iloc[i][3]

using Foursquare API to get the existing business in the selected place 

In [115]:
try:
    cnt = 0
    radius = 1000
    LIMIT = 50
    url =  'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
        CLIENT_ID, 
        CLIENT_SECRET, 
        VERSION, 
        ven_lat, 
        ven_lng, 
        radius, 
        LIMIT)

    results = requests.get(url).json()
    venues = results['response']['groups'][0]['items']
    nearby_venues = pd.json_normalize(venues)
    filtered_columns = ['venue.name', 'venue.categories']
    nearby_venues =nearby_venues.loc[:, filtered_columns]
    nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
    nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]

except:
    
    cnt = 1

comparing with the sorted dataset to find the best recommendations

In [116]:
if(cnt == 0):
    
    ven_business = nearby_venues.categories.unique()
    rcs = []
    counter = 0

    l = df_venues.shape[0]
    vl = len(ven_business)
    for i in range(0,l):
        for j in range(0,vl):
            if(df_venues.iloc[i][0] == ven_business[j]):
                counter = 1 
                break

        if(counter == 0):
            rcs.append(df_venues.iloc[i][0])
        else:
            counter = 0

else:
    
    cnt = 0
    rcs = df_venues['Venue Category']

In [117]:
print("For the user selected neighborhood at - " + bor + ", " + ngbor + " Following are the 10 best reccomendations for new business ideas that could be set up - ")
print()

for i in range(0,10):
    print(rcs[i])

For the user selected neighborhood at - Central Delhi, Daryaganj Following are the 10 best reccomendations for new business ideas that could be set up - 

Coffee Shop
Fast Food Restaurant
Pizza Place
Restaurant
Market
Chinese Restaurant
Bar
Asian Restaurant
Bakery
Donut Shop


-------------------------------------------------------------------------------------     

### Recommend the optimal neighborhood for the business of choice

Choose the type of business  

In [118]:
a = 'Popular business ideas'
b = 'Facny business ideas'

choice = a
drop_down = widgets.Dropdown(options= [a,b],
                                description='Choose - ',
                                disabled=False)

def dropdown_handler(change):
    global choice
    choice = change.new

drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Choose - ', options=('Popular business ideas', 'Facny business ideas'), value='Popular b…

In [119]:
if(choice == a):
    chc = df_venues["Venue Category"][:25]
else:
    chc = df_venues['Venue Category'][:-25:-1]

Choose the business

In [120]:
b_choice = ''
drop_down = widgets.Dropdown(options= chc,
                                description='Choose - ',
                                disabled=False)

def dropdown_handler(change):
    global b_choice
    b_choice = change.new

drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Choose - ', options=('Health Food Store', 'Basketball Court', 'Art Museum', 'Video Game …

Select your current borough and neighborhood.

In [121]:
bor = 'North West Delhi'
drop_down = widgets.Dropdown(options=df['Borough'].unique(),
                                description='Borough',
                                disabled=False)

def dropdown_handler(change):
    global bor
    bor = change.new

drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Borough', options=('North West Delhi', 'North Delhi', 'North East Delhi', 'Central Delhi…

In [122]:
opt = []
for i in range(0,df.shape[0]):
    if (df.iloc[i][0] == bor):
        opt.append(df.iloc[i][1])

ngbor = ''
drop_down = widgets.Dropdown(options=opt,
                                description='Neighborhood',
                                disabled=False)

def dropdown_handler(change):
    global ngbor
    ngbor = change.new

drop_down.observe(dropdown_handler, names='value')
display(drop_down)

Dropdown(description='Neighborhood', options=('Rajender Nagar', 'Sadar Bazaar', 'Chandni Chowk', 'Daryaganj', …

In [123]:
df_dis = pd.DataFrame(columns = ['Neighborhood','Distance in km' , 'latitude' , 'longitude'])

for i in range(0,df.shape[0]):
    if (df.iloc[i][0] == bor and df.iloc[i][1] == ngbor):
        ven_lat = df.iloc[i][2]
        ven_lng = df.iloc[i][3]

In [124]:
dis = []
lat = []
lng = []
neighbor = pd.Series(opt,dtype = object)
df_dis['Neighborhood'] = neighbor

for i in range(0,df.shape[0]):
    if(df.iloc[i][0] == bor):
        it_neighborhood = (df.iloc[i][2] , df.iloc[i][3])
        lat.append(df.iloc[i][2])
        lng.append(df.iloc[i][3])
        selected_neighborhood = (ven_lat, ven_lng) 
        dist = great_circle(it_neighborhood,selected_neighborhood).km
        dist = format(dist, '.3f')
        dis.append(dist)

geo_dist = pd.Series(dis,dtype = float)
df_dis['Distance in km'] = geo_dist

df_dis['latitude'] = pd.Series(lat,dtype=float)
df_dis['longitude'] = pd.Series(lng,dtype=float)
        
df_dis.sort_values(['Distance in km'],axis = 0,ascending = True , inplace = True)
df_dis.reset_index(inplace = True)
df_dis.drop(['index'],axis=1,inplace=True)

In [125]:
for j in range(0,df_dis.shape[0]):
    
    try:
        counter = 0
        radius = 1000
        LIMIT = 50
        ven_lat = df_dis.iloc[j][2]
        ven_lng = df_dis.iloc[j][3]
        url =  'https://api.foursquare.com/v2/venues/explore?&client_id={}&client_secret={}&v={}&ll={},{}&radius={}&limit={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION, 
            ven_lat, 
            ven_lng, 
            radius, 
            LIMIT)

        results = requests.get(url).json()
        venues = results['response']['groups'][0]['items']
        nearby_venues = pd.json_normalize(venues)
        filtered_columns = ['venue.name', 'venue.categories']
        nearby_venues =nearby_venues.loc[:, filtered_columns]
        nearby_venues['venue.categories'] = nearby_venues.apply(get_category_type, axis=1)
        nearby_venues.columns = [col.split(".")[-1] for col in nearby_venues.columns]
        
        ven_buss = pd.Series( (nearby_venues['categories'].unique()) ,dtype=object )
        v_length = ven_buss.shape[0]
        
        for k in range(0,v_length):
            if(b_choice == ven_buss[k]):
                counter = 1
                break
        
        if(counter == 0):
            
            print("The optimal place to start your business, " + b_choice + " is in - " + df_dis.iloc[j][0] + ", " + bor)
            break
        
        
    except:
        
        print("The optimal place to start your business, " + b_choice + " is in - " + df_dis.iloc[j][0] + ", " + bor)
    

The optimal place to start your business, Pharmacy is in - Daryaganj, Central Delhi
