# Space flight price prediction

**Scenario**: It is 2160 and the space tourism industry is booming. Globally, thousands of space shuttle companies take tourists to the Moon and back. You have been able to source data on the amenities offered in each space shuttle, customer reviews, and company information.

**Project**: You want to construct a model that predicts the price for each trip to the Moon and the corresponding return flight.

In [1]:
import pandas as pd


In [2]:
# Locations of the three raw input datasets (relative paths).
companies_path = "data/companies.csv"
reviews_path = "data/reviews.csv"
shuttles_path = "data/shuttles.xlsx"

In [3]:
# Load the raw company records from the CSV file.
companies = pd.read_csv(companies_path)

In [4]:
# Load the raw shuttle review records from the CSV file.
reviews = pd.read_csv(reviews_path)

In [5]:
# Load the raw shuttle records from the Excel workbook using the openpyxl engine.
shuttles = pd.read_excel(shuttles_path, engine="openpyxl")

In [6]:
companies.head()

Unnamed: 0,id,company_rating,company_location,total_fleet_count,iata_approved
0,35029,100%,Niue,4.0,f
1,30292,67%,Anguilla,6.0,f
2,19032,67%,Russian Federation,4.0,f
3,8238,91%,Barbados,15.0,t
4,30342,,Sao Tome and Principe,2.0,t


In [7]:
reviews.head()

Unnamed: 0,shuttle_id,review_scores_rating,review_scores_comfort,review_scores_amenities,review_scores_trip,review_scores_crew,review_scores_location,review_scores_price,number_of_reviews,reviews_per_month
0,63561,97.0,10.0,9.0,10.0,10.0,9.0,10.0,133,1.65
1,36260,90.0,8.0,9.0,10.0,9.0,9.0,9.0,3,0.09
2,57015,95.0,9.0,10.0,9.0,10.0,9.0,9.0,14,0.14
3,14035,93.0,10.0,9.0,9.0,9.0,10.0,9.0,39,0.42
4,10036,98.0,10.0,10.0,10.0,10.0,9.0,9.0,92,0.94


In [8]:
shuttles.head()

Unnamed: 0,id,shuttle_location,shuttle_type,engine_type,engine_vendor,engines,passenger_capacity,cancellation_policy,crew,d_check_complete,moon_clearance_complete,price,company_id
0,63561,Niue,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,f,f,"$1,325.0",35029
1,36260,Anguilla,Type V5,Quantum,ThetaBase Services,1.0,2,strict,1.0,t,f,"$1,780.0",30292
2,57015,Russian Federation,Type V5,Quantum,ThetaBase Services,1.0,2,moderate,0.0,f,f,"$1,715.0",19032
3,14035,Barbados,Type V5,Plasma,ThetaBase Services,3.0,6,strict,3.0,f,f,"$4,770.0",8238
4,10036,Sao Tome and Principe,Type V2,Plasma,ThetaBase Services,2.0,4,strict,2.0,f,f,"$2,820.0",30342


# Preprocess data

In [9]:
def _is_true(x: pd.Series) -> pd.Series:
    return x == "t"


def _parse_percentage(x: pd.Series) -> pd.Series:
    x = x.str.replace("%", "")
    x = x.astype(float) / 100
    return x


def _parse_money(x: pd.Series) -> pd.Series:
    x = x.str.replace("$", "").str.replace(",", "")
    x = x.astype(float)
    return x


def preprocess_companies(companies: pd.DataFrame) -> pd.DataFrame:
    """Preprocesses the data for companies.

    The input frame is left untouched; a converted copy is returned.

    Args:
        companies: Raw data.
    Returns:
        Preprocessed data, with `company_rating` converted to a float and
        `iata_approved` converted to boolean.
    """
    # Work on a copy so the caller's raw frame is not modified. Converting in
    # place would make a second call on the same frame operate on already
    # converted columns (booleans compared to "t", `.str` on floats).
    companies = companies.copy()
    companies["iata_approved"] = _is_true(companies["iata_approved"])
    companies["company_rating"] = _parse_percentage(companies["company_rating"])
    return companies


def preprocess_shuttles(shuttles: pd.DataFrame) -> pd.DataFrame:
    """Preprocesses the data for shuttles.

    The input frame is left untouched; a converted copy is returned.

    Args:
        shuttles: Raw data.
    Returns:
        Preprocessed data, with `price` converted to a float and `d_check_complete`,
        `moon_clearance_complete` converted to boolean.
    """
    # Work on a copy so the caller's raw frame is not modified. Converting in
    # place would make a second call on the same frame operate on already
    # converted columns (booleans compared to "t", `.str` on floats).
    shuttles = shuttles.copy()
    shuttles["d_check_complete"] = _is_true(shuttles["d_check_complete"])
    shuttles["moon_clearance_complete"] = _is_true(shuttles["moon_clearance_complete"])
    shuttles["price"] = _parse_money(shuttles["price"])
    return shuttles

# Final input data for model

In [14]:
def create_model_input_table(
    shuttles: pd.DataFrame, companies: pd.DataFrame, reviews: pd.DataFrame
) -> pd.DataFrame:
    """Join shuttles, reviews and companies into one model input table.

    Args:
        shuttles: Preprocessed shuttle data; matched to `reviews` via `id` and
            to `companies` via `company_id`.
        companies: Preprocessed company data; matched via its `id` column.
        reviews: Raw review data; matched via its `shuttle_id` column.
    Returns:
        The merged table with every row that contains a missing value removed.

    """
    # Both merges use the pandas default join type; rows with any missing
    # value are dropped only after both joins are done.
    return (
        shuttles.merge(reviews, left_on="id", right_on="shuttle_id")
        .merge(companies, left_on="company_id", right_on="id")
        .dropna()
    )

In [10]:
# Columns of the model input table that are used as predictors.
final_features = [
    "engines",
    "passenger_capacity",
    "crew",
    "d_check_complete",
    "moon_clearance_complete",
    "iata_approved",
    "company_rating",
    "review_scores_rating",
]

# Model building

In [28]:
import logging
from typing import Dict, Tuple

import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import ElasticNet
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split


def split_data(data: pd.DataFrame, parameters: Dict) -> Tuple:
    """Separate features from the price target and split both into train and test sets.

    Args:
        data: Data containing the feature columns and a `price` column.
        parameters: Mapping with the keys `features` (columns used as
            predictors), `test_size` and `random_state` (both forwarded to
            `train_test_split`).
    Returns:
        The tuple `(X_train, X_test, y_train, y_test)`.
    """
    features = data[parameters["features"]]
    target = data["price"]
    # `train_test_split` hands back the four pieces in the documented order;
    # convert to a tuple to match the declared return type.
    return tuple(
        train_test_split(
            features,
            target,
            test_size=parameters["test_size"],
            random_state=parameters["random_state"],
        )
    )


def train_model(X_train: pd.DataFrame, y_train: pd.Series) -> LinearRegression:
    """Fit a linear regression on the training data.

    Args:
        X_train: Training data of independent features.
        y_train: Training data for price.

    Returns:
        The fitted `LinearRegression` instance.
    """
    # `fit` returns the estimator itself, so the fitted model can be returned directly.
    return LinearRegression().fit(X_train, y_train)


def evaluate_model(
    regressor: LinearRegression, X_test: pd.DataFrame, y_test: pd.Series
):
    """Calculates and prints the coefficient of determination.

    Args:
        regressor: Trained model.
        X_test: Testing data of independent features.
        y_test: Testing data for price.
    """
    y_pred = regressor.predict(X_test)
    score = r2_score(y_test, y_pred)
    # `print` does not interpolate logging-style arguments, so apply the `%`
    # operator; otherwise the literal "%.3f" is printed followed by the raw score.
    print("Model has a coefficient R^2 of %.3f on test data." % score)

# Steps for ML

In [12]:
# Step 1.2: preprocess the raw company and shuttle tables.
companies_df = preprocess_companies(companies)
shuttles_df = preprocess_shuttles(shuttles)

  x = x.str.replace("$", "").str.replace(",", "")


In [15]:
# Step 1.3: combine the preprocessed shuttles and companies with the raw
# reviews into the model input table.
input_data = create_model_input_table(shuttles_df, companies_df, reviews)

In [18]:
# Step 1.4: build the train-test split (20% of rows held out, fixed seed).
parameters = {
    "test_size": 0.2,
    "random_state": 3,
    "features": final_features,
}
X_train, X_test, y_train, y_test = split_data(input_data, parameters)

In [29]:
# Step 1.5: fit the regression model on the training split.
lr_model = train_model(X_train, y_train)

In [30]:
# Report the fitted model's R^2 on the held-out test split.
evaluate_model(lr_model, X_test, y_test)

Model has a coefficient R^2 of %.3f on test data. 0.4619314668167279
