# <body>
<h1 style="text-align:center;"> Data Science </h1>
<h4 style="text-align:center;">  Guilherme Araújo  e  Gabriel Novais </h4>
<p  align="justify"> <b> Description </b> : The main goal of this data science project is to model how the rental prices offered by Airbnb hosts vary with the level of crime in the region. To do so, we first prepare the data and then visualize the information with maps and graphs. The last part is the statistical modelling, in which both machine learning and regression techniques will be compared.</p>
    <p><b>Sources and Links:</b></p>

<b>Airbnb</b>
<li><a href="http://insideairbnb.com/get-the-data.html">http://insideairbnb.com/get-the-data.html</a></li>

<b>Chicago</b>
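    <li><a href="https://data.cityofchicago.org/Public-Safety/Crimes-Map/dfnk-7re6">https://data.cityofchicago.org/Public-Safety/Crimes-Map/dfnk-7re6</a></li>
    <li><a href="https://data.cityofchicago.org/Public-Safety/Crimes-One-year-prior-to-present/x2n5-8w5q/data">https://data.cityofchicago.org/Public-Safety/Crimes-One-year-prior-to-present/x2n5-8w5q/data</a></li>
    <li><a href="https://data.cityofchicago.org/browse?category=Public+Safety">https://data.cityofchicago.org/browse?category=Public+Safety</a></li>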

<b>New York</b>
<li><a href="https://data.cityofnewyork.us/Public-Safety/Crime-Map-/5jvd-shfj">https://data.cityofnewyork.us/Public-Safety/Crime-Map-/5jvd-shfj</a></li>
<li><a href="https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243/data">https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243/data</a></li>
<li><a href="https://data.cityofnewyork.us/browse?category=Public+Safety">https://data.cityofnewyork.us/browse?category=Public+Safety</a></li>
    
</body>

### Packages

In [12]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import glob
import re

from math import radians, sin, cos, acos, log, pi, tan, asin,sqrt
from decimal import Decimal
from bokeh.plotting import figure, show, output_notebook
from bokeh.tile_providers import CARTODBPOSITRON
from ast import literal_eval
from scipy import stats

### Functions

In [3]:
def merc(Coords):
    """Project a (latitude, longitude) pair onto Mercator (x, y) coordinates.

    Parameters
    ----------
    Coords : sequence
        Indexable pair; item 0 is the latitude and item 1 the longitude,
        both in decimal degrees.

    Returns
    -------
    tuple
        ``(x, y)`` computed on a sphere of radius 6378137.0 (presumably
        metres, to match the "mercator" axes used for the map).
    """
    lat = Coords[0]
    lon = Coords[1]

    r_major = 6378137.000
    # Projected length of one degree. This used to be derived as x / lon,
    # which raised ZeroDivisionError for a longitude of exactly 0.
    scale = r_major * pi / 180.0
    x = r_major * radians(lon)
    y = 180.0/pi * log(tan(pi/4.0 +
        lat * (pi/180.0)/2.0)) * scale
    return (x, y)

In [4]:
def distance(a,b):
    """
    Calculate the great circle distance between two points
    on the earth (specified in decimal degrees). Output in KM

    Parameters
    ----------
    a, b : sequence
        Indexable pairs ``(latitude, longitude)`` in decimal degrees.

    Returns
    -------
    float
        Haversine distance in kilometres, using a sphere of radius 6371.
    """
    lat1, lon1 = a[0], a[1]
    lat2, lon2 = b[0], b[1]
    # convert decimal degrees to radians
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    # Kept in its own name so the parameter `a` is no longer overwritten.
    hav = sin(dlat / 2) ** 2 + cos(lat1) * cos(lat2) * sin(dlon / 2) ** 2
    # Clamp to asin's domain: rounding can push the root marginally above 1
    # for near-antipodal points. The argument order keeps NaN inputs
    # propagating as NaN (min(nan, 1.0) is nan).
    c = 2 * asin(min(sqrt(hav), 1.0))
    km = 6371 * c
    return km

In [5]:
def result(x,y):
    """Regress ``y`` on ``x`` with ``stats.linregress`` and return two flags.

    Returns
    -------
    list
        ``[coef, pvalue]`` where ``coef`` is 1 when the fitted slope is
        negative (0 otherwise) and ``pvalue`` is 1 when the slope's p-value
        is below 0.05 (0 otherwise).
    """
    fit = stats.linregress(x, y)
    slope, p_val = fit[0], fit[3]
    coef = 1 if slope < 0 else 0
    pvalue = 1 if p_val < 0.05 else 0
    return [coef, pvalue]

### Importing Data (Airbnb)

In [6]:
# Listing coordinates: keep only the id and the latitude/longitude columns.
listings = pd.read_csv(
    "/home/novais/Desktop/Mestrado/2_trim_2019/FundamentosDS/DataScienceEMAp_AraujoNovais/dados/listings/listings_19_1_17.csv"
)[['id','latitude','longitude']]

# Calendar rows; `listing_id` is renamed to `id` so both frames share the
# key used by the merge that follows.
calendar = (
    pd.read_csv(
        "/home/novais/Desktop/Mestrado/2_trim_2019/FundamentosDS/DataScienceEMAp_AraujoNovais/dados/calendar/calendar_19_1_17.zip"
    )[['listing_id','date','available','price']]
    .rename(index=str, columns={"listing_id": "id"})
)

In [7]:
# Attach each listing's coordinates to its calendar rows.
hosts = pd.merge(calendar, listings, on='id')
hosts['location'] = list(zip(hosts.latitude, hosts.longitude))
hosts = hosts.drop(['latitude', 'longitude'], axis=1)
# Rebuild the date as MM/DD/YYYY from characters 5-6, 8-9 and 0-3
# (assumes the calendar dates arrive as YYYY-MM-DD; verify against the file).
hosts.date = hosts.date.apply(lambda x: x[5:7]+'/'+x[8:10]+'/'+x[0:4])
hosts = hosts.dropna(subset=['price'])
# Strip everything except digits and the decimal point ("$1,250.00" -> 1250.0).
# The pattern is a raw string because "\d" in a normal literal is an invalid
# escape sequence. The whole value is cleaned instead of slicing x[1:-3],
# which broke on prices that do not end in a three-character ".00" suffix.
hosts.price = hosts.price.apply(lambda x: float(re.sub(r"[^\d.]", "", x)))
# "f" marks an unavailable night; anything else counts as available.
hosts.available = hosts.available.apply(lambda x: 0 if x=="f" else 1)
hosts['month']=hosts.date.apply(lambda x: int(x[0:2]))
hosts['day']=hosts.date.apply(lambda x: int(x[3:5]))
hosts['year']=hosts.date.apply(lambda x: int(x[6:10]))
hosts.head()

Unnamed: 0,id,date,available,price,location,month,day,year
0,2384,01/17/2019,0,50.0,"(41.788864900877655, -87.58670890962763)",1,17,2019
1,2384,01/18/2019,0,50.0,"(41.788864900877655, -87.58670890962763)",1,18,2019
2,2384,01/19/2019,0,50.0,"(41.788864900877655, -87.58670890962763)",1,19,2019
3,2384,01/20/2019,0,50.0,"(41.788864900877655, -87.58670890962763)",1,20,2019
4,2384,01/21/2019,0,50.0,"(41.788864900877655, -87.58670890962763)",1,21,2019


### Importing Data (Crimes)

In [7]:
# Read the Chicago crime export and reduce it to id, date and a
# (latitude, longitude) tuple per record.
directory = "/home/novais/Desktop/Mestrado/FundamentosDS/Trabalho/dados/Chicago/crimes/"
file = directory + 'Crimes_-_One_year_prior_to_present.csv'
crimes = (
    pd.read_csv(file)[['CASE#','DATE  OF OCCURRENCE','LATITUDE','LONGITUDE']]
    .assign(location=lambda frame: list(zip(frame.LATITUDE, frame.LONGITUDE)))
    .rename(index=str, columns={"CASE#": "id","DATE  OF OCCURRENCE": "date"})
    .drop(['LATITUDE', 'LONGITUDE'], axis=1)
)
# Keep only the first 10 characters of the timestamp (the MM/DD/YYYY part).
crimes['date'] = crimes['date'].apply(lambda stamp: stamp[0:10])
crimes = crimes.assign(
    month=crimes['date'].apply(lambda stamp: int(stamp[0:2])),
    day=crimes['date'].apply(lambda stamp: int(stamp[3:5])),
    year=crimes['date'].apply(lambda stamp: int(stamp[6:10])),
)
crimes.head()

Unnamed: 0,id,date,location,month,day,year
0,JB341693,07/09/2018,"(41.894327845999996, -87.62814321)",7,9,2018
1,JB342588,07/09/2018,"(41.773418836, -87.708856396)",7,9,2018
2,JB341677,07/09/2018,"(41.75159879, -87.647708896)",7,9,2018
3,JB341675,07/09/2018,"(41.69219942, -87.65534793399999)",7,9,2018
4,JB341740,07/09/2018,"(41.78559336, -87.621694669)",7,9,2018


### Number of "near crimes" 

In [8]:
# Restrict both tables to 2019.
crimes2019 = crimes.loc[crimes.year == 2019]
hosts2019 = hosts.loc[hosts.year == 2019]

# Keep rows whose (lat, lon) tuple compares greater than (0, 0).
# NOTE(review): this presumably exists to discard records with missing
# coordinates (NaN never compares greater) - confirm the intent.
crime_has_location = crimes2019.location > (0, 0)
host_has_location = hosts2019.location > (0, 0)
crimes2019 = crimes2019.loc[crime_has_location]
hosts2019 = hosts2019.loc[host_has_location]

In [9]:
# Collapse the crimes into one row per date holding the list of that day's
# crime locations, then attach that list to every host/date row.
crimes_by_date = crimes2019.groupby('date')['location'].apply(list)
dict_crimes = crimes_by_date.to_dict()
data = {
    'date': list(dict_crimes.keys()),
    'crimes_locations': list(dict_crimes.values()),
}
crimes2019compressed = pd.DataFrame(data)
database = (
    pd.merge(hosts2019, crimes2019compressed, on='date', how='left')
    # Dates without any recorded crime (or without a price) are discarded.
    .dropna(subset=['crimes_locations', 'price'])
    .reset_index()
)
database.head()

Unnamed: 0,id,date,available,price,location,month,day,year,crimes_locations
0,4505,05/17/2019,1,150.0,"(41.85495262496367, -87.69696178980682)",5,17,2019,"[(41.840632206, -87.658003656), (41.775429728,..."
1,4505,05/16/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,16,2019,"[(41.75599883, -87.615069189), (41.966234586, ..."
2,4505,05/15/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,15,2019,"[(41.691413679, -87.66882624600001), (41.97964..."
3,4505,05/14/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,14,2019,"[(41.921021491, -87.69730355), (41.79473075699..."
4,4505,05/13/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,13,2019,"[(41.818775036, -87.696606754), (41.762302081,..."


In [13]:
# WARNING: the original version of this cell took 2 hours and 15 minutes to
# run over the 536596 rows.
# For every host/date row, count that day's crimes that happened less than
# 1 km away from the listing. The two columns are walked in parallel instead
# of doing a label lookup on each Series for every row and every crime.
aux = []
for host_location, day_crimes in zip(database.location, database.crimes_locations):
    aux.append(
        sum(1 for crime_location in day_crimes
            if distance(crime_location, host_location) < 1)
    )

In [16]:
# Store the per-row counts computed above as the `near_crimes` column.
database = database.assign(near_crimes=aux)
database.head()

Unnamed: 0,index,id,date,available,price,location,month,day,year,crimes_locations,near_crimes
0,0,4505,05/17/2019,1,150.0,"(41.85495262496367, -87.69696178980682)",5,17,2019,"[(41.840632206, -87.658003656), (41.775429728,...",9
1,1,4505,05/16/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,16,2019,"[(41.75599883, -87.615069189), (41.966234586, ...",7
2,2,4505,05/15/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,15,2019,"[(41.691413679, -87.66882624600001), (41.97964...",4
3,3,4505,05/14/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,14,2019,"[(41.921021491, -87.69730355), (41.79473075699...",5
4,4,4505,05/13/2019,1,120.0,"(41.85495262496367, -87.69696178980682)",5,13,2019,"[(41.818775036, -87.696606754), (41.762302081,...",6


### Hosts

In [30]:
# Distinct listing ids that survived the merge with the crime data.
id_hosts = database.id.unique()
host_count = len(id_hosts)
print("Number of Hosts in database: " + str(host_count))

Number of Hosts in database: 3920


In [42]:
# Example: Relation between price and near_crimes for each host 
host_2384 = database[database.id==2384]
host_2384.reset_index().drop(['index','level_0'],axis=1).head()

Unnamed: 0,id,date,available,price,location,month,day,year,crimes_locations,near_crimes
0,2384,06/10/2019,1,90.0,"(41.788864900877655, -87.58670890962763)",6,10,2019,"[(41.947252263, -87.65350869), (41.742736056, ...",2
1,2384,06/09/2019,1,90.0,"(41.788864900877655, -87.58670890962763)",6,9,2019,"[(41.80004573, -87.695196828), (41.94466821, -...",3
2,2384,06/08/2019,1,90.0,"(41.788864900877655, -87.58670890962763)",6,8,2019,"[(41.864169598000004, -87.704380001), (41.8900...",0
3,2384,06/07/2019,1,90.0,"(41.788864900877655, -87.58670890962763)",6,7,2019,"[(41.836173903, -87.721893881), (41.777876516,...",2
4,2384,06/06/2019,1,90.0,"(41.788864900877655, -87.58670890962763)",6,6,2019,"[(41.783831376, -87.587493905), (41.737953878,...",4


In [39]:
# Correlation matrix between this host's price and its near-crime count.
np.corrcoef(host_2384['price'], host_2384['near_crimes'])

array([[ 1.        , -0.12629818],
       [-0.12629818,  1.        ]])

In [47]:
# Looking for evidence: regress price on near_crimes for this host and
# report the [negative-slope, significant-p-value] flags.
x, y = host_2384.near_crimes, host_2384.price
result(x, y)

[1, 0]

### Regression for each Host:

In [2]:
# One sub-frame per listing, keyed as "host_<id>".
hosts_pandas = {
    "host_"+str(host_id): database[database.id==host_id]
    for host_id in id_hosts
}

NameError: name 'id_hosts' is not defined

In [1]:
# Run the per-host regression and collect each [coef, pvalue] flag pair.
results = [
    np.array(result(host_frame.near_crimes, host_frame.price))
    for host_frame in hosts_pandas.values()
]
# Element-wise sum: how many hosts raised each flag.
final = sum(results)
n_hosts = len(id_hosts)
good_coef = final[0] / n_hosts
good_pvalue = final[1] / n_hosts
# NOTE(review): the printed values are fractions between 0 and 1 although
# the labels start with "%".
print("% Regressions with negative coefficient for crime: "+str(good_coef))
print("% Regressions with little p-value: "+str(good_pvalue))

NameError: name 'hosts_pandas' is not defined

### Map with unique hosts

In [37]:
from bokeh.embed import components
from bokeh.core import templates
N = 4000

# Random demo data: N circles with random positions and radii.
x = np.random.random(size=N)*100
y = np.random.random(size=N)*100
radii = np.random.random(size=N)*1.5

pl = figure()
pl.circle(x, y, radius=radii, fill_alpha=0.6, line_color=None)
# Embed the figure built in this cell. This used to pass `plot`, a name that
# is only created by a later cell, so a fresh kernel raised NameError.
pl_script, pl_div = components(pl)
# The cells that print the embed pieces refer to them as plot_script and
# plot_div, so expose both spellings.
plot_script, plot_div = pl_script, pl_div

In [41]:
# Route bokeh output to the notebook, then render the random-circles figure.
output_notebook()
show(pl)

In [42]:
# The components() cell stores the <script> part as `pl_script`;
# `plot_script` was never assigned there.
print(pl_script)


<script type="text/javascript">
  (function() {
    var fn = function() {
      Bokeh.safely(function() {
        (function(root) {
          function embed_document(root) {
            
          var docs_json = '{"521587c2-7e0c-4af0-93d1-e72934fa92b7":{"roots":{"references":[{"attributes":{"source":{"id":"1591","type":"ColumnDataSource"}},"id":"1595","type":"CDSView"},{"attributes":{"fill_alpha":{"value":0.6},"fill_color":{"value":"#1f77b4"},"line_color":{"value":null},"radius":{"field":"radius","units":"data"},"x":{"field":"x"},"y":{"field":"y"}},"id":"1592","type":"Circle"},{"attributes":{},"id":"1564","type":"LinearScale"},{"attributes":{},"id":"1576","type":"PanTool"},{"attributes":{"callback":null,"data":{"radius":{"__ndarray__":"tiplgry+4D/F6FenVELQP68N+ZpoRNE/4tqUJ/G+9D/clCsKeanCP/TCIbkmgeE/alSjPeTE8j8eslPKBgDxP3eNKVx6w9w/UFCZmkZKrz/Da8XWiNrpP3hAzLG6T7Y/At40C/kNyD/X5n3abVbvP/WHVVe/VfE/1oJDl0qE6z8CrXU/gAv0P2WP/S8BrNI/a2DV8bdw8j9IQ4yKGd3YP4yWh2Bm6rk/s3fgPTbI8j90952vquvgPwGqN05M

In [43]:
# The components() cell stores the <div> part as `pl_div`;
# `plot_div` was never assigned there.
print(pl_div)


<div class="bk-root" id="a99bb169-9dcc-43e7-a8bb-fe3a44dd93b9"></div>


In [48]:
from bokeh.plotting import figure, output_file, show

# Send bokeh output to a standalone HTML file.
output_file("toolbar.html")

# create a new plot with the toolbar below
pat = figure(
    plot_width=400,
    plot_height=400,
    title=None,
    toolbar_location="below",
)

demo_x = [1, 2, 3, 4, 5]
demo_y = [2, 5, 8, 2, 7]
pat.circle(demo_x, demo_y, size=10)

In [1]:
# Project every (lat, lon) tuple once and split the result into two columns
# (previously merc() ran twice per row).
projected = hosts['location'].apply(merc)
hosts['coords_x'] = projected.apply(lambda xy: xy[0])
hosts['coords_y'] = projected.apply(lambda xy: xy[1])

# `hosts` holds one row per listing and date, so its first rows all belong to
# the same listing. Keep one row per listing id so the map shows distinct
# hosts rather than one point drawn 50 times.
unique_hosts = hosts.drop_duplicates(subset='id')

p = figure(x_axis_type="mercator", y_axis_type="mercator")
p.add_tile(CARTODBPOSITRON)
p.circle(x = unique_hosts['coords_x'][:50],
        y = unique_hosts['coords_y'][:50])
output_notebook()
show(p)

NameError: name 'hosts' is not defined

In [8]:
from bokeh.plotting import figure
from bokeh.resources import CDN
from bokeh.embed import file_html

# Minimal two-point figure rendered to a complete HTML document string that
# loads BokehJS from the CDN.
plot = figure()
plot.circle(x=[1, 2], y=[3, 4])

html = file_html(plot, resources=CDN, title="my plot")

In [10]:
# Dump the standalone HTML document produced by file_html().
print(html)





<!DOCTYPE html>
<html lang="en">
  
  <head>
    
      <meta charset="utf-8">
      <title>my plot</title>
      
      
        
          
        <link rel="stylesheet" href="https://cdn.pydata.org/bokeh/release/bokeh-1.0.2.min.css" type="text/css" />
        
        
          
        <script type="text/javascript" src="https://cdn.pydata.org/bokeh/release/bokeh-1.0.2.min.js"></script>
        <script type="text/javascript">
            Bokeh.set_log_level("info");
        </script>
        
      
      
    
  </head>
  
  
  <body>
    
      
        
          
          
            
              <div class="bk-root" id="6085e41f-65f1-4dfd-8251-75ef6e6cd214"></div>
            
          
        
      
      
        <script type="application/json" id="1113">
          {"30e523ca-5d39-4cfd-a9f4-e28211f53bc5":{"roots":{"references":[{"attributes":{"overlay":{"id":"1029","type":"BoxAnnotation"}},"id":"1023","type":"BoxZoomTool"},{"attributes":{},"id":"1047","type":"Se