In [292]:
! pip install elasticsearch --quiet

[0m

In [293]:
!pip install pandas

[0m

In [294]:
! pip install firebase-admin faker --quiet

[0m

In [295]:
from datetime import datetime
import pandas as pd

# Load Firebasedata

In [296]:
import firebase_admin
from firebase_admin import credentials, firestore
from faker import Faker

In [297]:
cred = credentials.Certificate('serviceAccount.json')
firebase_admin.initialize_app(cred)

ValueError: The default Firebase app already exists. This means you called initialize_app() more than once without providing an app name as the second argument. In most cases you only need to call initialize_app() once. But if you do want to initialize multiple apps, pass a second argument to initialize_app() to give each app a unique name.

In [None]:
#get list of collections
db = firestore.client()
collections = [x.id for x in db.collections()]
print(collections)

In [None]:
master_classes = db.collection("Masterclasses").get()
users = db.collection("Users").get()
coaches = db.collection("Coaches").get()
events = db.collection("Events").get()
blogs = db.collection("Blogs").get()
snax = db.collection("Snax").get()

#convert to json
master_classes = [x.to_dict() for x in master_classes]
users = [x.to_dict() for x in users]
coaches = [x.to_dict() for x in coaches]
events = [x.to_dict() for x in events]
blogs = [x.to_dict() for x in blogs]
snax = [x.to_dict() for x in snax]

# Filter out Firebase fields

### Users

In [None]:
len(users)

In [None]:
#remove fields email, firstName, lifetime, refACID, role, surname from users
for user in users:
    user.pop('email', None)
    user.pop('firstName', None)
    user.pop('lifetime', None)
    user.pop('refACID', None)
    user.pop('role', None)
    user.pop('surname', None)

### Coaches

In [None]:
#remove about, blogs, books, facebook, image, instagram, link,linkedin, podcasts from coaches
for coach in coaches:
    coach.pop('about', None)
    coach.pop('blogs', None)
    coach.pop('books', None)
    coach.pop('facebook', None)
    coach.pop('image', None)
    coach.pop('instagram', None)
    coach.pop('link', None)
    coach.pop('linkedin', None)
    coach.pop('podcasts', None)

### Masterclasses

In [None]:
#remove fields, courseLessons, courseOverview, Description, filterTags, Progress, videoURL from master_classes
for master_class in master_classes:
    master_class.pop('courseLessons', None)
    master_class.pop('courseOverview', None)
    master_class.pop('Description', None)
    master_class.pop('filterTags', None)
    master_class.pop('Progress', None)
    master_class.pop('videoURL', None)

# Calculate additional metrics

# Write data to elastic

We will recalculate aggregate metrics

### Users

In [298]:
print(users[10]['Interest'])

[{'score': 44, 'classroom': 'Job'}, {'score': 77, 'classroom': 'Geld für morgen'}, {'score': 100, 'classroom': 'Partnerschaft'}, {'score': 57, 'classroom': 'Gründung'}, {'score': 77, 'classroom': 'Familie'}, {'score': 75, 'classroom': 'Investieren'}, {'score': 71, 'classroom': 'Ausbildung'}]


Total Watchtime

In [299]:
users[1]["Watched"]

[{'Masterclass': 5, 'progress': '154 min'},
 {'Masterclass': 6, 'progress': '470 min'}]

In [300]:
#Total Watchtime
#itertae through each users and calculate total watchtime by aggregating the Progress propertie of Watched
for user in users:
    watchtime = 0
    for watched in user['Watched']:
        watchtime += int(watched['progress'].split(" ")[0])
    user['TotalWatchtime'] = watchtime

Courses enrolled

In [301]:
# for each users go through events and filter Event-Type = enroll_course and calculate total enrollments
for user in users:
    enrollments = 0
    for event in events:
        if event['Event-Type'] == 'enroll_course':
            if event['User-Id'] == user['id']:
                enrollments += 1
    user['Age'] = datetime.today().year - int(user['Birthdate'].split("/")[0])
    user['Courses_enrolled'] = enrollments

Courses completed

In [302]:
# for each users go through events and filter Event-Type = complete_course and calculate "Courses_completed"
for user in users:
    courses_completed = 0
    for event in events:
        if event['Event-Type'] == 'complete_course':
            if event['User-Id'] == user['id']:
                courses_completed += 1
    user['Courses_completed'] = courses_completed

Courses saved

In [303]:
# for each users go through events and filter Event-Type = save_course and calculate "Courses_saved"
for user in users:
    courses_saved = 0
    for event in events:
        if event['Event-Type'] == 'save_course':
            if event['User-Id'] == user['id']:
                courses_saved += 1
    user['Courses_saved'] = courses_saved

Create Geospatial Index

In [304]:
geotable = 'plz_geocoord.csv'

df = pd.read_csv(geotable)
df.set_index('plz', inplace=True)
df.head()

Unnamed: 0_level_0,lat,lng
plz,Unnamed: 1_level_1,Unnamed: 2_level_1
1067,51.05755,13.717065
1069,51.039135,13.737675
1097,51.065908,13.736152
1099,51.087188,13.802804
1108,51.144324,13.799706


Create an index with 
mapping index_name = 'your_index' mapping = {
     "mappings": 
        {         
            "properties": 
                {             "location": {                 "type": "geo_point"             },             "other_field": {                 "type": "text"             }             # Add other fields as needed         }     } }  es.indices.create(index=index_name, body=mapping, ignore=400)  # Ignore 400 already exists  # Index a document with a geospatial location doc = {     "location": {         "lat": 40.7128,         "lon": -74.0060     },     "other_field": "some_value"     # Add other fields as needed }

In [312]:
for user in users:
    if user['postalCode'] not in df.index:
       print(user['postalCode'])
       continue
       
    postalcode = int(str(user['postalCode']))
    
    user['location'] = { 
                #"type": "Point",
                "lat": df.loc[postalcode]['lat'],
                "long": df.loc[postalcode]['lng']
                }

### Events

In [306]:
for event in events:
    date = datetime.strptime(event['Timestamp'], '%Y-%m-%d').date()
    event['Year'] = date.year
    event['Month'] = date.month
    event['Day'] = date.day

Average session duration (maybe remove)

It is important to note that here we can calculate additional field metrics

# Move data to elastic

In [316]:

from elasticsearch import Elasticsearch
# Found in the 'Manage this deployment' page
CLOUD_ID = "IP:dXMtY2VudHJhbDEuZ2NwLmNsb3VkLmVzLmlvOjQ0MyRkZTFjZjM2NjdkN2E0OTg2YTE1NDgzYTVkZmU5YmJkMyQwYTdhYWUyN2JhMmI0NWQ2YTVmOTVjNmM2ZjUxZDMzOA=="
# Found in the 'Management' page under the section 'Security'
API_KEY = "WmlxVFBvd0JGclg2OUZEY2NIZkU6cG5hSXQ2bjVTczY5eWl5VThLYzFaZw=="
# Create the client instance
client = Elasticsearch(
    cloud_id=CLOUD_ID,
    api_key=API_KEY,
)
          

In [317]:
#write ['Blogs', 'Coaches', 'Events', 'Masterclasses', 'Snax', 'Users'] to elastic search
for user in users:
    client.index(index="users", body=user)


In [311]:

for coach in coaches:
    client.index(index="coaches", body=coach)
for event in events:
    client.index(index="events", body=event)
for master_class in master_classes:
    client.index(index="masterclasses", body=master_class)
for blog in blogs:
    client.index(index="blogs", body=blog)
for snak in snax:
    client.index(index="snax", body=snak)

In [315]:
client.options(ignore_status=[400,404]).indices.delete(index='users')

ObjectApiResponse({'acknowledged': True})