In [1]:
import os
import pprint

import numpy as np
from pandas import json_normalize
from pymongo import MongoClient
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# Connection string for the course cluster (three-member replica set, SSL on,
# authenticating against the `admin` database).
# NOTE(review): the username/password embedded below look like shared course
# credentials -- confirm they are meant to be public. To point the notebook at a
# different cluster or account without editing it, set COURSE_CLUSTER_URI.
DEFAULT_COURSE_CLUSTER_URI = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_cluster_uri = os.environ.get("COURSE_CLUSTER_URI", DEFAULT_COURSE_CLUSTER_URI)
course_client = MongoClient(course_cluster_uri)

In [3]:
# Handle to the `titanic` collection inside the `coursera-agg` database
course_db = course_client['coursera-agg']
titanic = course_db['titanic']

In [4]:
# $group stage that collects the distinct values of the `gender` field.
# `_id` is a literal null (Python None) so that every incoming document falls
# into one single group; `$addToSet` then keeps each gender value once.
# (A string such as "$Null" would instead be read as a path to a field named
# "Null" and only groups everything together while that field is absent.)
unique_gender_stage = {
    "$group": {
        "_id": None,
        "genders": {"$addToSet": "$gender"}
    }
}

In [5]:
# Pipeline: keep only passengers with a numeric age and a non-empty
# point_of_embarkation, then apply the gender-collecting $group stage.
gender_values_pipeline = [
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_gender_stage,
]
possible_gender_values = titanic.aggregate(gender_values_pipeline)

In [6]:
# Materialise the aggregation result and pretty-print the distinct gender values
gender_values = list(possible_gender_values)
pprint.pprint(gender_values)

[{'_id': None, 'genders': ['male', 'female']}]


In [7]:
# $group stage that collects the distinct values of `point_of_embarkation`.
# `_id` is a literal null (Python None) so that every incoming document falls
# into one single group; `$addToSet` then keeps each value once.
# (A string such as "$Null" would instead be read as a path to a field named
# "Null" and only groups everything together while that field is absent.)
unique_point_of_embarkation_stage = {
    "$group": {
        "_id": None,
        "point_of_embarkations": {"$addToSet": "$point_of_embarkation"}
    }
}

In [8]:
# Pipeline: keep only passengers with a numeric age and a non-empty
# point_of_embarkation, then apply the embarkation-collecting $group stage.
embarkation_values_pipeline = [
    {"$match": {"age": {"$type": "number"}, "point_of_embarkation": {"$ne": ""}}},
    unique_point_of_embarkation_stage,
]
possible_point_of_embarkation_values = titanic.aggregate(embarkation_values_pipeline)

In [9]:
# Materialise the aggregation result and pretty-print the distinct
# point_of_embarkation values
embarkation_values = list(possible_point_of_embarkation_values)
pprint.pprint(embarkation_values)

[{'_id': None, 'point_of_embarkations': ['S', 'C', 'Q']}]


In [10]:
def _encode_as_int(field, codes):
    """Build a $switch expression that maps each string value of `field` to an integer.

    `codes` maps the string value to its integer code; one branch is emitted per
    entry, in insertion order. No `default` branch is added.
    """
    return {
        '$switch': {
            "branches": [
                {'case': {'$eq': ['$' + field, label]}, 'then': code}
                for label, code in codes.items()
            ]
        }
    }


# $addFields stage that overwrites the string-valued `gender` and
# `point_of_embarkation` fields with integer codes:
#   gender:               'female' -> 0, 'male' -> 1
#   point_of_embarkation: 'C' -> 0, 'S' -> 1, 'Q' -> 2
gender_and_point_of_embarkation_conversion_stage = {
    '$addFields': {
        'gender': _encode_as_int('gender', {'female': 0, 'male': 1}),
        'point_of_embarkation': _encode_as_int(
            'point_of_embarkation', {'C': 0, 'S': 1, 'Q': 2}
        ),
    }
}

In [11]:
# Pipeline that produces the modelling dataset:
#   1. keep passengers with a numeric age and a non-empty point_of_embarkation
#   2. convert gender / point_of_embarkation to integer codes
#   3. drop identifier-like fields that are not used as features
passenger_filter_stage = {
    "$match": {
        "age": {"$type": "number"},
        "point_of_embarkation": {"$ne": ""},
    }
}
drop_unused_fields_stage = {
    "$project": {
        "_id": 0,
        "ticket_number": 0,
        "name": 0,
        "passenger_id": 0,
        "cabin": 0,
    }
}
cursor = titanic.aggregate([
    passenger_filter_stage,
    gender_and_point_of_embarkation_conversion_stage,
    drop_unused_fields_stage,
])

In [12]:
# Read every document from the cursor into an in-memory list
titanic_data = [passenger for passenger in cursor]

In [13]:
# Turn the list of passenger documents into a flat DataFrame
df = json_normalize(data=titanic_data)

In [14]:
# Preview the first five rows of the full dataset
df.head(5)

Unnamed: 0,survived,class,gender,age,siblings_spouse,parents_children,fare_paid,point_of_embarkation
0,0,1,1,54.0,0,0,51.8625,1
1,0,2,1,21.0,0,0,73.5,1
2,1,3,0,17.0,4,2,7.925,1
3,0,1,1,45.0,1,0,83.475,1
4,1,2,0,17.0,0,0,10.5,1


In [15]:
# Feature matrix: every column except the `survived` target
df_x = df.drop(columns='survived')

In [16]:
# Preview the first five rows of the feature matrix
df_x.head(5)

Unnamed: 0,class,gender,age,siblings_spouse,parents_children,fare_paid,point_of_embarkation
0,1,1,54.0,0,0,51.8625,1
1,2,1,21.0,0,0,73.5,1
2,3,0,17.0,4,2,7.925,1
3,1,1,45.0,1,0,83.475,1
4,2,0,17.0,0,0,10.5,1


In [17]:
# Target vector: the `survived` column (the value we want to predict)
df_y = df.loc[:, 'survived']

In [18]:
# Create a Least Squares Linear Regression object (fitted in a later cell).
# Note: this model outputs continuous values rather than 0/1 labels -- the
# test-set predictions displayed further down include values above 1 and below 0.
reg = linear_model.LinearRegression()

In [19]:
# Split our dataset into a training set (90%) and a held-out test set (10%).
# NOTE(review): this comment previously said 80%/20% while the code passed
# test_size=0.1; the code is kept unchanged -- confirm which split is intended.
TEST_FRACTION = 0.1  # share of rows held out for evaluation
SPLIT_SEED = 0       # fixed seed so the split is the same on every run
x_train, x_test, y_train, y_test = train_test_split(
    df_x, df_y, test_size=TEST_FRACTION, random_state=SPLIT_SEED
)

In [21]:
# Train the linear model on the training features and targets
reg.fit(X=x_train, y=y_train)

LinearRegression()

In [22]:
# Run the trained model over the held-out test features
reg.predict(X=x_test)

array([ 1.06628374,  0.32940183,  0.71389642,  0.2015889 ,  0.31756326,
        0.15650059,  0.3727197 ,  0.14987115,  0.95498217,  0.22328388,
        0.80636573,  1.00366487,  0.01861426,  0.08140071,  0.50549204,
        0.56016882,  0.4147821 ,  0.13447484,  0.3620466 ,  0.06895507,
        0.06267401,  0.11301588,  0.1328493 ,  1.09786551,  0.07519731,
        0.22988136,  0.12509634,  0.1050614 ,  0.46710367,  0.11261187,
        0.46811984,  0.39527251,  0.07774353,  0.34125669,  0.40277506,
        0.51624027,  0.36960724,  0.14997083,  0.03770508, -0.01224326,
        0.90365757,  0.78871507,  0.04787175,  0.62047398,  0.37584032,
        0.62186403,  0.36729406,  0.09704512,  0.28600652,  0.08180471,
        0.15781823,  0.3418863 ,  0.97288029,  0.39357244,  0.18748824,
        0.12522599,  0.47720443,  0.08724323,  0.85775026,  0.48770207,
        0.44175667,  0.08765029,  0.33433823,  0.12809198,  0.87573124,
        0.1001274 ,  0.4508335 ,  0.41632071,  0.78880322, -0.04

In [23]:
# Mean squared error of the model on the held-out test set.
# Expected to land around 0.13-0.15 (a squared error, not a percentage).
test_predictions = reg.predict(x_test)
mean_squared_error(y_test, test_predictions)

0.14290176888242412

In [24]:
# Hypothetical passenger to score with the trained model. Values are keyed by
# feature name so they cannot be silently misaligned with the model's columns.
fake_passenger_features = {
    "class": 1,
    "gender": 1,                # integer assigned to 'male'
    "age": 25,
    "siblings_spouse": 1,
    "parents_children": 0,
    "fare_paid": 45,
    "point_of_embarkation": 0,  # integer assigned to 'C'
}

# Lay the values out in the column order of the training data; a feature
# missing from the dict raises KeyError instead of producing a wrong prediction.
fake_passenger = [[fake_passenger_features[column] for column in x_train.columns]]

In [25]:
# Predicted value for the hypothetical passenger; use this output to verify
# your completion of this exercise
reg.predict(X=fake_passenger)

array([0.53909756])