## Using the Iris Data:

### 1. Use the function defined in `acquire.py` to load the iris data.


In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
import scipy as sp
from pydataset import data
from env import user, password, host

import warnings
warnings.filterwarnings("ignore")

import acquire as acq

In [2]:
# Load the iris data through the project's acquire module and preview the first five rows
iris_df = acq.get_iris_data()
iris_df.head()

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2


### 2. Clean up the column names - replace the period with an underscore and lowercase.


In [3]:
# No renaming is performed in this cell: the acquired frame's column names are
# already lowercase and underscore-separated, so the frame is only displayed.
iris_df

Unnamed: 0,species_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,setosa,5.1,3.5,1.4,0.2
1,1,setosa,4.9,3.0,1.4,0.2
2,1,setosa,4.7,3.2,1.3,0.2
3,1,setosa,4.6,3.1,1.5,0.2
4,1,setosa,5.0,3.6,1.4,0.2
5,1,setosa,5.4,3.9,1.7,0.4
6,1,setosa,4.6,3.4,1.4,0.3
7,1,setosa,5.0,3.4,1.5,0.2
8,1,setosa,4.4,2.9,1.4,0.2
9,1,setosa,4.9,3.1,1.5,0.1


### 3. Drop the `species_id` and `measurement_id` columns.


In [4]:
# Only species_id has to be dropped: the initial query did not bring in
# measurement_id, so that column is not present in the acquired frame.
iris_df = iris_df.drop(columns=['species_id'])
iris_df.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


### 4. Rename the `species_name` column to just species.


In [5]:
# Shorten the species_name column to species; rename returns a new frame, which is rebound to iris_df
iris_df = iris_df.rename(columns={'species_name': 'species'})
iris_df.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


### 5. Create a function named `prep_iris` that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [6]:
def prep_iris(df):
    '''
    Takes in an iris dataframe and returns a cleaned copy of it.
    Arguments: df - a pandas dataframe containing the `species_id` and `species_name` columns
    Return: a new dataframe with fully duplicated rows removed, `species_id` dropped and
            `species_name` renamed to `species`; the dataframe passed in is left unmodified
    '''
    # Drop duplicates without inplace=True so the caller's dataframe is not mutated
    df = df.drop_duplicates()
    # Drop the foreign-key column; the species name string is kept
    columns_to_drop = ['species_id']
    df = df.drop(columns=columns_to_drop)
    df = df.rename(columns={'species_name': 'species'})
    return df

## Using the Titanic dataset

### 1. Use the function defined in `acquire.py` to load the Titanic data.

In [7]:
# Load the Titanic data through the project's acquire module and preview the first five rows
titanic_df = acq.get_titanic_data()
titanic_df.head()

Unnamed: 0.1,Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


### 2. Drop any unnecessary, unhelpful, or duplicated columns.

In [8]:
# Columns removed from the Titanic frame before modelling.
# NOTE: the drop below is in place, so titanic_df itself is changed; running this
# cell a second time raises KeyError because the columns are already gone.
to_drop = ['class', 'embarked', 'deck', 'passenger_id', 'age', "Unnamed: 0"]
titanic_df.drop(columns = to_drop, inplace=True)
titanic_df.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


### 3. Create a function named `prep_titanic` that accepts the raw titanic data, and returns the data with the transformations above applied.

In [55]:
def prep_titanic(df):
    '''
    Takes in a titanic dataframe and returns a cleaned copy of it.
    Arguments: df - a pandas dataframe containing the `embarked`, `class`, `passenger_id`,
               `deck`, `Unnamed: 0`, `sex` and `embark_town` columns
    Return: a new dataframe with fully duplicated rows removed, the columns above dropped and
            dummy columns for `sex` and `embark_town` appended; the dataframe passed in is
            left unmodified
    '''
    # Drop duplicates without inplace=True so the caller's dataframe is not mutated
    df = df.drop_duplicates()
    # Drop columns
    # NOTE(review): the exploratory cell in this notebook also drops 'age'; it is kept here
    # to preserve the columns this function has always returned - confirm which is intended.
    columns_to_drop = ['embarked', 'class', 'passenger_id', 'deck', "Unnamed: 0"]
    df = df.drop(columns=columns_to_drop)
    # Encode categorical variables. 'class' was dropped above, so selecting it here raised
    # KeyError; only the remaining categorical columns are encoded. drop_first takes a single
    # bool, not a list.
    dummy_df = pd.get_dummies(df[['sex', 'embark_town']], dummy_na=False, drop_first=True)
    df = pd.concat([df, dummy_df], axis=1)
    return df

## Using the Telco dataset

### 1. Use the function defined in `acquire.py` to load the Telco data.

In [10]:
# Load the Telco data through the project's acquire module and preview the first five rows
telco_df = acq.get_telco_data()
telco_df.head()

Unnamed: 0.1,Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,1,2,1,1,0003-MKNFE,Male,0,No,No,9,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,2,1,2,1,0004-TLHLJ,Male,0,No,No,4,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


### 2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [11]:
# Drop the index-like column, the three foreign-key id columns (their string
# counterparts contract_type / internet_service_type / payment_type are kept) and customer_id.
# NOTE: the drop is in place, so re-running this cell raises KeyError on the missing columns.
to_drop = ['Unnamed: 0', 'contract_type_id', 'internet_service_type_id', 'payment_type_id', 'customer_id']
telco_df.drop(columns = to_drop, inplace=True)
telco_df.head()

Unnamed: 0,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,device_protection,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,Female,0,Yes,Yes,9,Yes,No,No,Yes,No,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,Male,0,No,No,9,Yes,Yes,No,No,No,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,Male,0,No,No,4,Yes,No,No,No,Yes,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,Male,1,Yes,No,13,Yes,No,No,Yes,Yes,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,Female,1,Yes,No,3,Yes,No,No,No,No,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


In [12]:
# Names of the columns stored with object dtype
telco_df.select_dtypes(include='object').columns.to_list()

['gender',
 'partner',
 'dependents',
 'phone_service',
 'multiple_lines',
 'online_security',
 'online_backup',
 'device_protection',
 'tech_support',
 'streaming_tv',
 'streaming_movies',
 'paperless_billing',
 'total_charges',
 'churn',
 'contract_type',
 'internet_service_type',
 'payment_type']

In [13]:
# Names of the remaining (non-object dtype) columns
telco_df.select_dtypes(exclude='object').columns.to_list()

['senior_citizen', 'tenure', 'monthly_charges']

### 3. Handle null values.

In [14]:
# Per-column flag that is True when the column holds at least one null;
# the surrounding brackets only wrap the resulting Series in a list for display.
[telco_df.isnull().sum() > 0]

[gender                   False
 senior_citizen           False
 partner                  False
 dependents               False
 tenure                   False
 phone_service            False
 multiple_lines           False
 online_security          False
 online_backup            False
 device_protection        False
 tech_support             False
 streaming_tv             False
 streaming_movies         False
 paperless_billing        False
 monthly_charges          False
 total_charges            False
 churn                    False
 contract_type            False
 internet_service_type    False
 payment_type             False
 dtype: bool]

In [15]:
# Count of nulls per column.
# NOTE(review): total_charges has object dtype even though its previewed values look
# numeric; blank or whitespace strings would not be counted by isnull() - verify.
telco_df.isnull().sum()

gender                   0
senior_citizen           0
partner                  0
dependents               0
tenure                   0
phone_service            0
multiple_lines           0
online_security          0
online_backup            0
device_protection        0
tech_support             0
streaming_tv             0
streaming_movies         0
paperless_billing        0
monthly_charges          0
total_charges            0
churn                    0
contract_type            0
internet_service_type    0
payment_type             0
dtype: int64

### 4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [56]:
def prep_telco(df):
    '''
    Takes in a telco dataframe and returns a cleaned copy of it.
    Arguments: df - a pandas dataframe containing the `Unnamed: 0`, `contract_type_id`,
               `internet_service_type_id` and `payment_type_id` columns
    Return: a new dataframe with fully duplicated rows removed and the columns above dropped;
            the dataframe passed in is left unmodified
    '''
    # Drop duplicates without inplace=True so the caller's dataframe is not mutated
    df = df.drop_duplicates()
    # Drop the index-like column and the foreign-key id columns
    # NOTE(review): the exploratory cell in this notebook also drops 'customer_id'; it is kept
    # here to preserve the columns this function has always returned - confirm which is intended.
    columns_to_drop = ['Unnamed: 0', 'contract_type_id', 'internet_service_type_id', 'payment_type_id']
    df = df.drop(columns=columns_to_drop)
    return df

## Split your data
### 1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.

In [45]:
def split_data_iris(df, target='species', seed=3333):
    '''
    Splits a dataframe into train, validate and test sets, stratified on the target column.
    Arguments: df - a pandas dataframe containing the `target` column
               target - name of the column to stratify on (default 'species')
               seed - random_state used for both splits (default 3333)
    Return: train, validate, test - three dataframes holding roughly 60% / 20% / 20% of the rows
    '''
    # First cut: hold out 20% of the rows as the test set
    train_validate, test = train_test_split(df,
                                            train_size=0.8,
                                            stratify=df[target],
                                            random_state=seed)
    # Second cut: 75% of the remaining 80% becomes train, the rest validate
    train, validate = train_test_split(train_validate,
                                       train_size=0.75,
                                       stratify=train_validate[target],
                                       random_state=seed)
    return train, validate, test

In [39]:
# Imputer configured to replace np.nan with the most frequent value.
# NOTE(review): it is not fitted or applied in any cell visible here - confirm it is still needed.
imputer = SimpleImputer(strategy='most_frequent', missing_values=np.nan)

### 2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

In [46]:
# Split the prepared iris frame; the function returns the pieces in train, validate, test order
train_iris, validate_iris, test_iris = split_data_iris(iris_df)

In [47]:
# (rows, columns) of each split, to confirm the 60/20/20 proportions
train_iris.shape, validate_iris.shape, test_iris.shape

((90, 5), (30, 5), (30, 5))

### 3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

In [51]:
def split_data_titanic(df, target='survived', seed=3333):
    '''
    Splits a dataframe into train, validate and test sets, stratified on the target column.
    Arguments: df - a pandas dataframe containing the `target` column
               target - name of the column to stratify on (default 'survived')
               seed - random_state used for both splits (default 3333)
    Return: train, validate, test - three dataframes holding roughly 60% / 20% / 20% of the rows
    '''
    # First cut: hold out 20% of the rows as the test set
    train_validate, test = train_test_split(df,
                                            train_size=0.8,
                                            stratify=df[target],
                                            random_state=seed)
    # Second cut: 75% of the remaining 80% becomes train, the rest validate
    train, validate = train_test_split(train_validate,
                                       train_size=0.75,
                                       stratify=train_validate[target],
                                       random_state=seed)
    return train, validate, test

In [49]:
# Split the Titanic frame. Note that titanic_df here is the frame after the manual
# column drops earlier in the notebook, not the output of prep_titanic.
train_titanic, validate_titanic, test_titanic = split_data_titanic(titanic_df)

In [50]:
# (rows, columns) of each split, to confirm the 60/20/20 proportions
train_titanic.shape, validate_titanic.shape, test_titanic.shape

((534, 8), (178, 8), (179, 8))

### 4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [52]:
def split_data_telco(df, target='churn', seed=3333):
    '''
    Splits a dataframe into train, validate and test sets, stratified on the target column.
    Arguments: df - a pandas dataframe containing the `target` column
               target - name of the column to stratify on (default 'churn')
               seed - random_state used for both splits (default 3333)
    Return: train, validate, test - three dataframes holding roughly 60% / 20% / 20% of the rows
    '''
    # First cut: hold out 20% of the rows as the test set
    train_validate, test = train_test_split(df,
                                            train_size=0.8,
                                            stratify=df[target],
                                            random_state=seed)
    # Second cut: 75% of the remaining 80% becomes train, the rest validate
    train, validate = train_test_split(train_validate,
                                       train_size=0.75,
                                       stratify=train_validate[target],
                                       random_state=seed)
    return train, validate, test

In [53]:
# Split the Telco frame. Note that telco_df here is the frame after the manual
# column drops earlier in the notebook, not the output of prep_telco.
train_telco, validate_telco, test_telco = split_data_telco(telco_df)

In [54]:
# (rows, columns) of each split, to confirm the 60/20/20 proportions
train_telco.shape, validate_telco.shape, test_telco.shape

((4225, 20), (1409, 20), (1409, 20))