In [1]:
import pandas as pd
import numpy as np
import statsmodels.formula.api as smf
import matplotlib.pylab as plt
import math
from scipy import stats
import geopandas as gpd
%matplotlib inline
from functools import reduce

In [2]:
import importlib
import mr_word_count
importlib.reload(mr_word_count)
from mr_word_count import MRWordFrequencyCount
from mrjob.job import MRJob
import mapreduce as mr

In [3]:
import nltk
from bs4 import BeautifulSoup
import string
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

In [4]:
#LiveXYZ -- dataframe of XYZ survey data pulled from CSV
Yelp = pd.read_csv('yelp_business_zipcodes_with_reviews.csv')

In [7]:
#drop all na values from zipcode
Yelp = Yelp[Yelp['zipcode'].notna()]

4354

In [8]:
#drop all na values from all_reviews
Yelp = Yelp[Yelp['all_reviews'].notna()]

4354

In [10]:
#convert the zip code values to int so they can be filtered
Yelp['zipcode'] = Yelp['zipcode'].astype(int)

#filter the zip code values so that only NYC zipcodes remain
Yelp = Yelp[(Yelp['zipcode'] >= 10001) & (Yelp['zipcode'] <= 11697)]

#turn all_reviews column into a string type column
Yelp['all_reviews'] = Yelp['all_reviews'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [14]:
#these are the only zip codes that are being analyzed in the study
pertinent_zips = [11102, 11103, 11105, 11106, 11101, 11104, 11109, 11120, 11206, 11211, 11249, 11207, 11221, 11237, 
                 10026, 10027, 10037, 10030, 10039, 10001, 10011, 10018, 10019, 10020, 10036, 10002, 10034, 10040,
                 10454, 10455, 10459, 10474, 11205, 11216, 11221, 11233, 11238, 11385, 11386, 10109, 11373,
                 11379, 11372, 11354, 11355, 11358, 11222]

In [16]:
#filter the dataframe so that only the rows with pertinent zip code values remain
FilteredByZip = Yelp[Yelp['zipcode'].isin(pertinent_zips)]

In [18]:
#dictionary to map zips to neighborhoods
zip_dict ={ 11101:'Astoria/LIC', 11102:'Astoria/LIC', 11103:'Astoria/LIC', 11105:'Astoria/LIC', 11106:'Astoria/LIC', 11104:'LIC',
           11109:'LIC', 11120:'LIC', 11206:'Williamsburg/Bushwick/Bed-Stuy', 11211:'Williamsburg', 11249:'Williamsburg', 11207:'Bushwick',
           11221:'Bed-Stuy/Bushwick', 11237:'Bushwick', 10026:'Harlem', 10027:'Harlem', 10037:'Harlem', 10030:'Harlem', 10039:'Harlem',
           10001:'Chelsea', 10011:'Chelsea', 10018:'Chelsea', 10019:'Chelsea', 10020:'Chelsea', 10036:'Chelsea', 10002:'LES',
           10034: 'Inwood', 10040: 'Inwood', 10454: 'South Bronx', 10455: 'South Bronx', 10459: 'South Bronx', 10474: 'South Bronx', 
           11205: 'Bed-Stuy', 11216: 'Bed-Stuy', 11233: 'Bed-Stuy', 11238: 'Bed-Stuy',
           11385: 'Ridgewood', 11386: 'Ridgewood', 10109: 'Times Square', 
           11373: 'Elmhurst', 11379: 'Elmhurst', 11372: 'Jackson Heights', 
           11354: 'Flushing', 11355: 'Flushing', 11358: 'Flushing', 11222: 'Greenpoint'}

In [19]:
#create a column called Neighborhood that maps the zipcode key to the dictionary value (Neighborhood)
FilteredByZip['Neighborhood']= FilteredByZip['zipcode'].map(zip_dict)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [32]:
#turn each neighborhood's dataframe into its own csv file
ChelseaYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Chelsea']
WilliamsburgYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Williamsburg']
Astoria_LICYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Astoria/LIC']
LESYelp = FilteredByZip[FilteredByZip['Neighborhood']=='LES']
GreenpointYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Greenpoint']
BushwickYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Bushwick']
LICYelp = FilteredByZip[FilteredByZip['Neighborhood']=='LIC']
Williamsburg_Bushwick_BedStuyYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Williamsburg/Bushwick/Bed-Stuy']
BedStuyYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Bed-Stuy']
BedStuy_BushwickYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Bed-Stuy/Bushwick']
RidgewoodYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Ridgewood']
ElmhurstYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Elmhurst']
JacksonHeightsYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Jackson Heights']
FlushingYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Flushing']
SouthBronxYelp = FilteredByZip[FilteredByZip['Neighborhood']=='South Bronx']
HarlemYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Harlem']
InwoodYelp = FilteredByZip[FilteredByZip['Neighborhood']=='Inwood']

In [37]:
ChelseaYelp.to_csv ('ChelseaYelpDescription.csv', index = False, header=True)
WilliamsburgYelp.to_csv ('WilliamsburgYelpDescription.csv', index = False, header=True)
Astoria_LICYelp.to_csv ('Astoria_LICYelpDescription.csv', index = False, header=True)
LESYelp.to_csv ('LESYelpDescription.csv', index = False, header=True)
GreenpointYelp.to_csv ('GreenpointYelpDescription.csv', index = False, header=True)
BushwickYelp.to_csv ('BushwickYelpDescription.csv', index = False, header=True)
LICYelp.to_csv ('LICYelpDescription.csv', index = False, header=True)
Williamsburg_Bushwick_BedStuyYelp.to_csv ('Williamsburg_Bushwick_BedStuyYelpDescription.csv', index = False, header=True)
BedStuyYelp.to_csv ('BedStuyYelpYelpDescription.csv', index = False, header=True)
BedStuy_BushwickYelp.to_csv ('BedStuy_BushwickYelpDescription.csv', index = False, header=True)
RidgewoodYelp.to_csv ('RidgewoodYelpDescription.csv', index = False, header=True)
ElmhurstYelp.to_csv ('ElmhurstYelpDescription.csv', index = False, header=True)
JacksonHeightsYelp.to_csv ('JacksonHeightsYelpDescription.csv', index = False, header=True)
FlushingYelp.to_csv ('FlushingYelpDescription.csv', index = False, header=True)
SouthBronxYelp.to_csv ('SouthBronxYelpDescription.csv', index = False, header=True)
HarlemYelp.to_csv ('HarlemYelpDescription.csv', index = False, header=True)
InwoodYelp.to_csv ('InwoodYelpDescription.csv', index = False, header=True)