### Importing libraries and connecting to MongoDB with pymongo

In [31]:
from pymongo import MongoClient
import pandas as pd

# Open a connection to the MongoDB server given by the URI below and select
# its `companies` database.
client = MongoClient(host='mongodb://localhost:27017/')
db = client['companies']

## Querying tech-industry companies with at least 1 office and 1 employee, founded from 1990 onwards (to exclude the oldest companies)

In [32]:
# Select companies that satisfy ALL of the following:
#   * category_code is one of the five tech categories listed below
#   * the `offices` array is not empty (at least 1 office)
#   * founded_year is 1990 or later
#   * number_of_employees is at least 1
# Only the fields needed downstream are projected (`_id` is returned by default).
# NOTE: `find` returns a cursor; it is consumed when the DataFrame is built, so
# re-run this cell before rebuilding the DataFrame.
tech_categories = ["web", "software", "games_video", "mobile", "network_hosting"]
filtered = db.companies.find(
    {
        # `$in` replaces an `$or` chain of equality tests on the same field.
        "category_code": {"$in": tech_categories},
        # Conditions on different fields of one filter document are ANDed implicitly.
        "offices": {"$not": {"$size": 0}},
        "founded_year": {"$gte": 1990},
        "number_of_employees": {"$gte": 1},
    },
    {"name": 1, "offices": 1, "category_code": 1, "founded_year": 1, "number_of_employees": 1},
)






In [33]:
# Load every document returned by the query into a DataFrame (one row per
# company), report its dimensions, and preview the first rows.
df = pd.DataFrame(data=filtered)
n_rows, n_cols = df.shape
print((n_rows, n_cols))
df.head(5)



(3876, 6)


Unnamed: 0,_id,category_code,founded_year,name,number_of_employees,offices
0,52cdef7c4bab8bd675297d8a,web,2005,Wetpaint,47,"[{'description': '', 'address1': '710 - 2nd Av..."
1,52cdef7c4bab8bd675297d8c,software,2005,Zoho,1600,"[{'description': 'Headquarters', 'address1': '..."
2,52cdef7c4bab8bd675297d91,web,2006,Geni,18,"[{'description': 'Headquarters', 'address1': '..."
3,52cdef7c4bab8bd675297da1,web,2002,Plaxo,50,"[{'description': 'HQ', 'address1': '1050 Enter..."
4,52cdef7c4bab8bd675297d9b,web,1995,eBay,15000,"[{'description': 'Headquarters', 'address1': '..."


## I obtain the geolocation and city of each selected company's main headquarters (taken to be its first listed office) and concatenate them to the previous dataframe

In [34]:
def getFirst(data):
    """Summarise the first listed office of a company record.

    Parameters
    ----------
    data : mapping or pandas.Series
        A record with an ``'offices'`` entry holding a list of office dicts.
        The keys read from the first office are ``'latitude'``,
        ``'longitude'``, ``'country_code'`` and ``'city'``.

    Returns
    -------
    dict
        ``lat`` / ``lng``  -- latitude and longitude of the first office
        ``headquarters``   -- ``{"type": "Point", "coordinates": [lng, lat]}``
                              when both coordinates are present, else ``None``
        ``country``        -- the office's ``country_code``
        ``city``           -- the office's ``city``
        Every value is ``None`` when the record has no usable first office
        (empty or non-list ``offices``) or when the corresponding key is absent.
    """
    offices = data['offices']

    # An empty or non-list `offices` value (e.g. NaN for a missing field) yields
    # an all-None row instead of aborting the whole DataFrame.apply.
    has_office = isinstance(offices, (list, tuple)) and len(offices) > 0
    first_office = offices[0] if has_office else {}

    latitude = first_office.get('latitude')
    longitude = first_office.get('longitude')

    principal = None
    # Compare against None rather than relying on truthiness: 0.0 is a valid
    # coordinate (equator / prime meridian) and must not be treated as missing.
    if latitude is not None and longitude is not None:
        principal = {
            "type": "Point",
            # Longitude first, then latitude.
            "coordinates": [longitude, latitude],
        }

    return {
        "lat": latitude,
        "lng": longitude,
        "headquarters": principal,
        "country": first_office.get('country_code'),
        "city": first_office.get('city'),
    }


# Apply getFirst to every row; result_type="expand" turns each returned dict
# into one column per key.
main_office = df[["offices"]].apply(
    getFirst,
    axis=1,
    result_type="expand",
)

In [35]:
# Place the main-office columns next to the company columns, then keep only
# the fields used from here on, in the order listed.
selected_columns = [
    "name", "lat", "lng", "country", "headquarters",
    "number_of_employees", "category_code", "founded_year", "city",
]
df_geo = pd.concat([df, main_office], axis="columns")[selected_columns]
df_geo.head()

Unnamed: 0,name,lat,lng,country,headquarters,number_of_employees,category_code,founded_year,city
0,Wetpaint,47.603122,-122.333253,USA,"{'type': 'Point', 'coordinates': [-122.333253,...",47,web,2005,Seattle
1,Zoho,37.692934,-121.904945,USA,"{'type': 'Point', 'coordinates': [-121.904945,...",1600,software,2005,Pleasanton
2,Geni,34.090368,-118.393064,USA,"{'type': 'Point', 'coordinates': [-118.393064,...",18,web,2006,West Hollywood
3,Plaxo,37.387845,-122.055197,USA,"{'type': 'Point', 'coordinates': [-122.055197,...",50,web,2002,Sunnyvale
4,eBay,37.295005,-121.930035,USA,"{'type': 'Point', 'coordinates': [-121.930035,...",15000,web,1995,San Jose


### I drop the rows whose headquarters is null, which happens when the office location is unknown

In [36]:
# The location is essential for the analysis, so discard every row whose
# 'headquarters' value is null.
df_geo = df_geo.dropna(axis="index", subset=['headquarters'])


## I bin companies as either Old or New by founding year: New means founded after 2007 (2008–2013, the last years covered by the database); Old means founded 1990–2007

In [37]:
# Bin companies into 'Old' / 'New' by founded_year.
# pd.cut builds right-closed intervals from the cutoffs, so:
#   Old -> (1989, 2007]  i.e. founded 1990-2007
#   New -> (2007, 2013]  i.e. founded 2008-2013 (the dataset is taken to end in 2013)
# A founded_year outside (1989, 2013] would receive a missing label.
age_labels = ['Old', 'New']
cutoffs = [1989, 2007, 2013]
bins = pd.cut(df_geo['founded_year'], bins=cutoffs, labels=age_labels)
# assign() returns a new frame rather than writing a column into df_geo in
# place, which keeps the cell re-runnable and avoids SettingWithCopyWarning.
df_geo = df_geo.assign(age=bins)
df_geo['age'].value_counts()

Old    2159
New     780
Name: age, dtype: int64

## I save it to a JSON file and continue in MongoDB Compass and the Geo-Queries_Near Jupyter notebook

In [38]:
# Export an independent copy of the final frame to disk as JSON, written as a
# list with one object per row (orient="records").
df_geob = df_geo.copy(deep=True)
df_geob.to_json(path_or_buf='df_geob.json', orient="records")

In [39]:
# Sanity check: (rows, columns) of the exported frame.
df_geob.shape

(2939, 10)