In [27]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import splitting and imputing functions
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer

# turn off pink boxes for demo
import warnings
warnings.filterwarnings("ignore")

# import our own acquire module
import acquire
import env


In [28]:
# Using the Iris data

# Load the iris data with the function defined in acquire.py, then preview the first rows.

iris_df = acquire.get_iris_data()
iris_df.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


In [29]:
# Clean up the column names: replace periods with underscores and lowercase them.
# Index.str.replace returns a new Index, so the result has to be assigned back;
# regex=False makes '.' a literal period rather than the "any character" pattern.

iris_df.columns = iris_df.columns.str.replace('.', '_', regex=False).str.lower()
iris_df.columns

Index(['species_id', 'species_name', 'sepal_length', 'sepal_width',
       'petal_length', 'petal_width'],
      dtype='object')

In [30]:
# Drop the species_id and measurement_id columns.
# Rebinding the result (instead of inplace=True) together with errors='ignore'
# lets this cell be re-run without a KeyError, and covers sources of the iris
# data that do not carry a measurement_id column.

iris_df = iris_df.drop(columns=['species_id', 'measurement_id'], errors='ignore')

In [31]:
# Rename the species_name column to just species.
# assign + drop builds a new frame (species ends up as the last column) and
# leaves iris_df itself untouched, so no duplicate species column lingers on it.

new_iris_df = iris_df.assign(species=iris_df['species_name']).drop(columns='species_name')
new_iris_df

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [32]:
#Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

def prep_iris(iris_df):
    """
    Return a cleaned copy of the untransformed iris data.

    Transformations applied:
      1. Column names: periods replaced with underscores, then lowercased.
      2. The species_id and measurement_id columns are dropped when present.
      3. species_name is renamed to species and placed as the last column.

    Parameters
    ----------
    iris_df : pandas.DataFrame
        Iris data as returned by the acquire step. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformations above applied.
    """
    prepped = iris_df.copy()

    # str.replace returns a new Index, so it must be assigned back to take effect;
    # regex=False treats '.' as a literal period.
    prepped.columns = prepped.columns.str.replace('.', '_', regex=False).str.lower()

    # errors='ignore' because not every source of the data carries both id columns
    # (and an earlier notebook cell may already have removed species_id).
    prepped = prepped.drop(columns=['species_id', 'measurement_id'], errors='ignore')

    # Guarded so that calling prep_iris on already-prepared data is a no-op
    # instead of a KeyError.
    if 'species_name' in prepped.columns:
        prepped = prepped.assign(species=prepped['species_name']).drop(columns='species_name')

    return prepped

In [33]:
# Run prep_iris on the iris data and display the returned frame.
prep_iris(iris_df)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [45]:
# Using the Titanic dataset

# Load the Titanic data with the function defined in acquire.py.
# acquire is already imported in the first cell. `directory` is read from the
# acquire module here because the bare name is only imported in the later Telco
# cell, so using it unqualified fails on a top-to-bottom run.

titanic_query = 'select * from passengers'
titanic_df = acquire.get_titanic_data(titanic_query, acquire.directory, filename='titanic.csv')
titanic_df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


In [46]:
# List the available columns before deciding which ones to drop.
titanic_df.columns

Index(['passenger_id', 'survived', 'pclass', 'sex', 'age', 'sibsp', 'parch',
       'fare', 'embarked', 'class', 'deck', 'embark_town', 'alone'],
      dtype='object')

In [48]:
# Drop any unnecessary, unhelpful, or duplicated columns.

# When the goal is prediction, columns that carry no predictive value should go.

to_drop = ['passenger_id', 'parch', 'class', 'embarked']
# drop() returns a new frame by default, so titanic_df itself is left as loaded.
clean_titanic_df = titanic_df.drop(columns=to_drop)
clean_titanic_df

Unnamed: 0,survived,pclass,sex,age,sibsp,fare,deck,embark_town,alone
0,0,3,male,22.0,1,7.2500,,Southampton,0
1,1,1,female,38.0,1,71.2833,C,Cherbourg,0
2,1,3,female,26.0,0,7.9250,,Southampton,1
3,1,1,female,35.0,1,53.1000,C,Southampton,0
4,0,3,male,35.0,0,8.0500,,Southampton,1
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000,,Southampton,1
887,1,1,female,19.0,0,30.0000,B,Southampton,1
888,0,3,female,,1,23.4500,,Southampton,0
889,1,1,male,26.0,0,30.0000,C,Cherbourg,1


In [49]:
#Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

def prep_titanic(titanic_df):
    """
    Return a copy of the raw titanic data with the unneeded columns removed.

    The columns passenger_id, parch, class and embarked are dropped.

    Parameters
    ----------
    titanic_df : pandas.DataFrame
        Raw titanic data as returned by the acquire step. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the dropped columns.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from titanic_df.
    """
    # Kept local so the function does not depend on a notebook-level `to_drop`
    # variable, which a later cell reassigns to the telco column list.
    columns_to_drop = ['passenger_id', 'parch', 'class', 'embarked']

    # Without inplace, drop() returns a new frame, so calling this function
    # repeatedly on the same raw data keeps working.
    return titanic_df.drop(columns=columns_to_drop)

In [50]:
# Run prep_titanic on the titanic data and display the returned frame.
prep_titanic(titanic_df)

Unnamed: 0,survived,pclass,sex,age,sibsp,fare,deck,embark_town,alone
0,0,3,male,22.0,1,7.2500,,Southampton,0
1,1,1,female,38.0,1,71.2833,C,Cherbourg,0
2,1,3,female,26.0,0,7.9250,,Southampton,1
3,1,1,female,35.0,1,53.1000,C,Southampton,0
4,0,3,male,35.0,0,8.0500,,Southampton,1
...,...,...,...,...,...,...,...,...,...
886,0,2,male,27.0,0,13.0000,,Southampton,1
887,1,1,female,19.0,0,30.0000,B,Southampton,1
888,0,3,female,,1,23.4500,,Southampton,0
889,1,1,male,26.0,0,30.0000,C,Cherbourg,1


In [55]:
# Using the Telco dataset
# NOTE(review): acquire is already imported in the notebook's first cell, so the
# plain import below is redundant; the from-import is what brings telco_query and
# directory into the notebook namespace.
import acquire
from acquire import telco_query, directory

# Load the Telco data with the function defined in acquire.py and preview the first rows.
telco_df = acquire.get_telco_data(telco_query, directory)
telco_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [59]:
# Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping
# foreign key columns but keeping the corresponding string values, for example.

to_drop = [
    'customer_id',
    'tech_support',
    'streaming_tv',
    'streaming_movies',
    'paperless_billing',
    'dependents',
    'partner',
    'multiple_lines',
    'internet_service_type',
    'phone_service',
    'online_security',
    'online_backup',
    'device_protection',
]
# drop() returns a new frame by default, so telco_df itself is left as loaded.
clean_telco_df = telco_df.drop(columns=to_drop)
clean_telco_df.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,gender,senior_citizen,tenure,monthly_charges,total_charges,churn,contract_type,payment_type
0,2,1,2,Female,0,9,65.6,593.3,No,One year,Mailed check
1,2,1,1,Male,0,9,59.9,542.4,No,Month-to-month,Mailed check
2,1,2,1,Male,0,4,73.9,280.85,Yes,Month-to-month,Electronic check
3,1,2,1,Male,1,13,98.0,1237.85,Yes,Month-to-month,Electronic check
4,2,2,1,Female,1,3,83.9,267.4,Yes,Month-to-month,Mailed check


In [61]:
# Handle null values.
# Count the missing values per column of the cleaned telco frame.
# NOTE(review): total_charges looks like it may be stored as text (its decimals
# are formatted inconsistently in the previews) — isna() would not flag blank
# strings, so confirm the dtype before treating the column as complete.

clean_telco_df.isna().sum()

payment_type_id             0
internet_service_type_id    0
contract_type_id            0
gender                      0
senior_citizen              0
tenure                      0
monthly_charges             0
total_charges               0
churn                       0
contract_type               0
payment_type                0
dtype: int64

In [62]:
#Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

def prep_telco(telco_df):
    """
    Return a copy of the raw telco data with the unneeded columns removed.

    Parameters
    ----------
    telco_df : pandas.DataFrame
        Raw telco data as returned by the acquire step. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame without the columns listed in `to_drop` below.

    Raises
    ------
    KeyError
        If any of the columns to drop is missing from telco_df.
    """
    to_drop = [
        'customer_id',
        'tech_support',
        'streaming_tv',
        'streaming_movies',
        'paperless_billing',
        'dependents',
        'partner',
        'multiple_lines',
        'internet_service_type',
        'phone_service',
        'online_security',
        'online_backup',
        'device_protection',
    ]
    # NOTE(review): the *_type_id key columns are kept while the
    # internet_service_type label is dropped — confirm that this is intended.

    # Without inplace, drop() returns a new frame, so calling this function
    # repeatedly on the same raw data keeps working.
    return telco_df.drop(columns=to_drop)

In [63]:
# Run prep_telco on the telco data and display the returned frame.
prep_telco(telco_df)

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,gender,senior_citizen,tenure,monthly_charges,total_charges,churn,contract_type,payment_type
0,2,1,2,Female,0,9,65.60,593.3,No,One year,Mailed check
1,2,1,1,Male,0,9,59.90,542.4,No,Month-to-month,Mailed check
2,1,2,1,Male,0,4,73.90,280.85,Yes,Month-to-month,Electronic check
3,1,2,1,Male,1,13,98.00,1237.85,Yes,Month-to-month,Electronic check
4,2,2,1,Female,1,3,83.90,267.4,Yes,Month-to-month,Mailed check
...,...,...,...,...,...,...,...,...,...,...,...
7038,2,1,2,Female,0,13,55.15,742.9,No,One year,Mailed check
7039,1,2,1,Male,0,22,85.10,1873.7,Yes,Month-to-month,Electronic check
7040,2,1,1,Male,0,2,50.30,92.75,No,Month-to-month,Mailed check
7041,2,1,3,Male,0,67,67.85,4627.65,No,Two year,Mailed check


In [None]:
#Split your data

In [None]:
#Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.



In [None]:
#Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.



In [None]:
#Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.



In [None]:
#Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

