# Our first steps into exploring the data with Mongo

In [205]:
from pymongo import MongoClient
from dotenv import load_dotenv
import os
import requests
import json

In [206]:
# Connect to the MongoDB server on localhost and select the database used by every cell below.
client = MongoClient("mongodb://localhost")
db = client["companydb"]

### We are validating our connection to the database that we want.

In [207]:
# Connectivity check: pull every document of `companies` into a list and count them.
comp_find = list(db.companies.find())
len(comp_find)

18801

In [208]:
# Short handle to the `companies` collection; the aggregation cell below calls comp.aggregate(...).
comp = db.companies

### We are going to eliminate all the office dictionaries without lat/long values or city values, and limit the results to the US

In [209]:

# One output document per office: unwind the `offices` array, keep only US
# offices that have a latitude, a longitude and a non-empty city, then drop
# the original `_id`.
us_office_pipeline = [
    {"$unwind": "$offices"},
    {
        "$match": {
            "offices.latitude": {"$ne": None},
            "offices.longitude": {"$ne": None},
            # Neither null/missing nor the empty string.
            "offices.city": {"$nin": [None, ""]},
            "offices.country_code": "USA",
        }
    },
    {"$project": {"_id": 0}},
]

# NOTE: aggregate() hands back a cursor, which can only be iterated once.
res = comp.aggregate(us_office_pipeline)


In [37]:
# Inspect what aggregate() returned: a cursor object, not a list.
type(res)

pymongo.command_cursor.CommandCursor

In [32]:
# NOTE: list(res) reads the cursor to its end, so `res` yields nothing after this cell.
len(list(res))

10834

### The previous cell and the following cell were the steps that called our attention to the fact that `res` is an iterator type, which we now recognize as a `CommandCursor`: once it has been consumed it is empty, and that is what gave us an ERROR.

In [29]:
# Second pass over the same cursor: it is already exhausted, hence the 0 below.
len(list(res))

0

### This will create a new collection of the offices. (It should only be executed once, because every run keeps adding documents to the collection.)

In [78]:
# Materialise the cursor before touching the collection: `res` can only be
# read once, and an already-consumed cursor must not lead to wiping
# `us_offices` and then inserting nothing.
us_office_docs = list(res)
if not us_office_docs:
    raise RuntimeError(
        "`res` is empty or already consumed - re-run the aggregation cell first"
    )

# Clear earlier contents so that re-running this cell replaces the documents
# instead of appending a second copy of every office.
db.us_offices.delete_many({})
db.us_offices.insert_many(us_office_docs)


<pymongo.results.InsertManyResult at 0x12d3f0040>

## Error Resolved
- I was receiving error after error with the previous command, meaning I could not unwind the collection and filter it to create another collection.
#### Here were the steps I followed when I discovered the problem:
    - 1) execute ".agg"
    - 2) check "len"
    - 3) try to view [0] from the list in the third cell, which gave me an "out of range" error
    - 4) tried "len" again in the third cell and it gave me 0
#### SO I KNOW THIS IS AN ITERATOR THAT ITERATED TO ITS LIMIT
    - 5) execute ".agg" a second time
    - 6) instead of trying "len", I went directly to executing ".insert_many"

----------------------

## Objective
- find the cities that have the highest number of tech companies that have raised more than 1 Million

In [210]:
# Fetch name, funding string and office city/state for every document in
# `us_offices`, ordered by city name ascending.
# NOTE(review): the filter is empty, so no funding threshold is applied here.
filter_q = {}
project = {
    "_id": 0,
    "name": 1,
    "offices.city": 1,
    "offices.state_code": 1,
    "total_money_raised": 1,
}
res = list(db.us_offices.find(filter_q, project).sort("offices.city", 1))

In [211]:
# print([document["offices"]['city'] for document in res])

### We use this `for` loop to add up the numbers of companies in each city. 
- This will give us the 20 cities with the most companies, sorted from highest to lowest.

In [212]:
# Tally how many documents in `res` belong to each city.
# The loop variable is deliberately NOT called `comp`: that name is the handle
# to the `companies` collection, and overwriting it with a plain dict would
# break any later re-run of the aggregation cell.
companies_in_city = {}
for office_doc in res:
    city = office_doc["offices"]["city"]
    companies_in_city[city] = companies_in_city.get(city, 0) + 1

# Keep the 20 cities with the highest counts, largest first. sorted() is
# stable, so cities with equal counts stay in first-seen order.
top_cities = dict(
    sorted(companies_in_city.items(), key=lambda item: item[1], reverse=True)[:20]
)
top_cities

{'San Francisco': 659,
 'New York': 579,
 'Seattle': 179,
 'Los Angeles': 168,
 'Austin': 166,
 'Palo Alto': 163,
 'Sunnyvale': 148,
 'Mountain View': 147,
 'San Jose': 140,
 'Chicago': 137,
 'San Diego': 114,
 'San Mateo': 98,
 'Santa Clara': 98,
 'Boston': 89,
 'Atlanta': 85,
 'Cambridge': 76,
 'Redwood City': 71,
 'Santa Monica': 60,
 'Menlo Park': 57,
 'Boulder': 56}

### Look at that! That's nice! We also want to know what type of businesses are in each city.
- We will define a function **startups_in_city** that will take a city name and return a list of young companies. 

In [213]:
def startups_in_city(city_name, min_founded_year=2010):
    """Return the young companies that have an office in ``city_name``.

    Queries ``db.us_offices`` for documents whose ``offices.city`` equals
    ``city_name`` and whose ``founded_year`` is at least ``min_founded_year``.

    Args:
        city_name: City name matched (by equality) against ``offices.city``.
        min_founded_year: Earliest founding year to include, inclusive.
            Defaults to 2010, the cut-off this notebook uses for "young".

    Returns:
        A list of dicts with ``name``, ``founded_year``, ``category_code`` and
        ``total_money_raised``, sorted by ``category_code`` ascending. A
        company appears once per matching document, so it can be listed more
        than once (presumably once per office in the city - TODO confirm).
    """
    # Local names differ from the notebook-level `filter_q` / `project` / `res`
    # so the function never looks like it depends on them.
    query = {"offices.city": city_name, "founded_year": {"$gte": min_founded_year}}
    fields = {
        "_id": 0,
        "name": 1,
        "founded_year": 1,
        "category_code": 1,
        "total_money_raised": 1,
    }
    cursor = db.us_offices.find(query, fields).sort([("category_code", 1)])
    return list(cursor)



### Starting at the top of our list in San Francisco with 659 companies, we see that there are 3 young companies in San Fran that seem tech related that have raised more than 1 million.

In [219]:
# Young companies (as selected by startups_in_city) with an office in San Francisco.
San_Fran = startups_in_city("San Francisco")
San_Fran


[{'name': 'MyLikes',
  'category_code': 'advertising',
  'founded_year': 2010,
  'total_money_raised': '$6.23M'},
 {'name': 'YouNoodle',
  'category_code': 'analytics',
  'founded_year': 2010,
  'total_money_raised': '$0'},
 {'name': 'Shopseen',
  'category_code': 'ecommerce',
  'founded_year': 2013,
  'total_money_raised': '$0'},
 {'name': 'Advisor',
  'category_code': 'enterprise',
  'founded_year': 2011,
  'total_money_raised': '$0'},
 {'name': 'Fliptop',
  'category_code': 'enterprise',
  'founded_year': 2010,
  'total_money_raised': '$6.79M'},
 {'name': 'Indee',
  'category_code': 'games_video',
  'founded_year': 2010,
  'total_money_raised': '$0'},
 {'name': 'Fliggo',
  'category_code': 'games_video',
  'founded_year': 2012,
  'total_money_raised': '$0'},
 {'name': 'Widgetbox',
  'category_code': 'other',
  'founded_year': 2012,
  'total_money_raised': '$14.5M'}]

### Surprisingly, we have only 2 New York startups with significant money_raised.

In [180]:
# Young companies (as selected by startups_in_city) with an office in New York.
NY = startups_in_city("New York")
NY

[{'name': 'Advaliant',
  'category_code': 'advertising',
  'founded_year': 2013,
  'total_money_raised': '$100k'},
 {'name': 'Advaliant',
  'category_code': 'advertising',
  'founded_year': 2013,
  'total_money_raised': '$100k'},
 {'name': 'Kidos',
  'category_code': 'games_video',
  'founded_year': 2011,
  'total_money_raised': '$200k'},
 {'name': 'Kidos',
  'category_code': 'games_video',
  'founded_year': 2011,
  'total_money_raised': '$200k'},
 {'name': 'PeekYou',
  'category_code': 'search',
  'founded_year': 2012,
  'total_money_raised': '$1.83M'},
 {'name': 'Unison Technologies',
  'category_code': 'software',
  'founded_year': 2011,
  'total_money_raised': '$0'},
 {'name': 'Yipit',
  'category_code': 'web',
  'founded_year': 2010,
  'total_money_raised': '$7.55M'}]

### So far LA is the place to beat with 3 "start-ups" (two in E-commerce and one in the video game industry).

In [184]:
# Young companies (as selected by startups_in_city) with an office in Los Angeles.
LA = startups_in_city("Los Angeles")
LA

[{'name': 'RazorGator',
  'category_code': 'ecommerce',
  'founded_year': 2011,
  'total_money_raised': '$58.8M'},
 {'name': 'Magento',
  'category_code': 'ecommerce',
  'founded_year': 2010,
  'total_money_raised': '$22.5M'},
 {'name': 'Social Gaming Network',
  'category_code': 'games_video',
  'founded_year': 2011,
  'total_money_raised': '$17.1M'},
 {'name': 'FirstString',
  'category_code': 'search',
  'founded_year': 2011,
  'total_money_raised': '$200k'}]

# Let's go to LA for our API search. There should be no problem finding some vegan restaurants and Starbucks around there!!! We're off to the city of the Big Lebowski...

![Jeff Lebowski looking happy!](Lebowski.jpeg "Maauuuud!")

(Get [API's](/notebooks/Google%20API%20%20GeoCoding%20%26%20Places%20.ipynb#Looks-like-our-vegan-bet-paid-off) next)