In [1]:
from env import host, user, password

def get_url(db):
    """Build the SQLAlchemy connection URL for the database named ``db``.

    The user name and password are percent-encoded before being placed in the
    URL, so credentials containing URL-reserved characters (``@``, ``:``,
    ``/``, spaces, ...) no longer corrupt it. Credentials made only of
    letters, digits and ``_.-~`` are emitted unchanged.

    Parameters
    ----------
    db : str
        Name of the database to put in the path component of the URL.

    Returns
    -------
    str
        ``mysql+pymysql://<user>:<password>@<host>/<db>`` built from the
        ``user``, ``password`` and ``host`` names imported from ``env``.
    """
    # Local import: keeps this fix self-contained without touching the
    # notebook's import cell.
    from urllib.parse import quote

    # safe="" also escapes "/" (quote leaves it alone by default).
    safe_user = quote(str(user), safe="")
    safe_password = quote(str(password), safe="")
    return f'mysql+pymysql://{safe_user}:{safe_password}@{host}/{db}'

In [2]:
import pandas as pd
import util

# In a new python module, acquire.py:
def get_data(query,db):
    """Run ``query`` against database ``db`` and return the result of ``pd.read_sql``.

    The connection string handed to pandas is whatever ``util.get_url(db)``
    produces.
    """
    connection_url = util.get_url(db)
    return pd.read_sql(query, connection_url)

def get_titanic_data():
    """Return the titanic data from the Codeup data science database as a pandas DataFrame.

    Selects every column of the ``passengers`` table in ``titanic_db``.
    The query is executed through ``get_data`` so the connection handling
    lives in one place instead of being repeated here.
    """
    return get_data("SELECT * FROM passengers", "titanic_db")

def get_iris_data():
    """Return the data from ``iris_db`` on the Codeup data science database as a pandas DataFrame.

    ``measurements`` is joined to ``species`` on ``species_id``, so each row
    carries the columns of the ``species`` table (presumably including the
    species name) in addition to the species id.
    The query is executed through ``get_data`` so the connection handling
    lives in one place instead of being repeated here.
    """
    query = """
    SELECT * FROM measurements
    JOIN species USING (species_id)
    """
    return get_data(query, "iris_db")

ModuleNotFoundError: No module named 'util'

In [None]:
import numpy as np

from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer

def prep_iris(df):
    """Tidy the iris frame and label-encode its species column.

    ``df`` is modified in place (the caller's frame changes):

    * ``species_id`` and ``measurement_id`` are dropped,
    * ``species_name`` is renamed to ``species``,
    * ``species`` is replaced by the output of a ``LabelEncoder`` fitted on it.

    Returns
    -------
    tuple
        ``(df, encoder)``: the same frame object that was passed in, and the
        fitted encoder.
    """
    id_columns = ["species_id", "measurement_id"]
    df.drop(columns=id_columns, inplace=True)
    df.rename(columns={"species_name": "species"}, inplace=True)

    species_encoder = LabelEncoder().fit(df.species)
    df.species = species_encoder.transform(df.species)
    return df, species_encoder

def prep_titanic(df):
    """Impute, encode and scale the titanic frame, returning the same object.

    ``df`` is modified in place (the caller's frame changes). Steps, in order:

    1. missing values are filled with ``np.nan``;
    2. ``embarked`` and ``embark_town`` are imputed with a ``SimpleImputer``
       using the ``most_frequent`` strategy;
    3. the ``deck`` column is dropped;
    4. ``embarked`` is replaced by the output of a ``LabelEncoder``;
    5. ``age`` and ``fare`` are replaced by the output of a ``MinMaxScaler``.

    The imputer, encoder and scaler are fitted on ``df`` itself and are not
    returned.
    """
    port_columns = ["embarked", "embark_town"]
    numeric_columns = ["age", "fare"]

    df.fillna(np.nan, inplace=True)

    mode_imputer = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
    mode_imputer.fit(df[port_columns])
    df[port_columns] = mode_imputer.transform(df[port_columns])

    df.drop(columns="deck", inplace=True)

    df.embarked = LabelEncoder().fit_transform(df.embarked)
    df[numeric_columns] = MinMaxScaler().fit_transform(df[numeric_columns])
    return df

In [3]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, RobustScaler, MinMaxScaler

# split_my_data(df, train_pct, random_state=123)
def split_my_data(df, train_pct, random_state=123):
    """Split ``df`` into a train part and a test part.

    Parameters
    ----------
    df : array-like or DataFrame
        Data handed to ``train_test_split``.
    train_pct : float or int
        Passed through as ``train_size``.
    random_state : int, optional
        Seed passed to ``train_test_split``. Defaults to 123, the value that
        used to be hard-coded, so existing two-argument calls produce the same
        split as before.

    Returns
    -------
    tuple
        ``(train, test)``.
    """
    train, test = train_test_split(df, train_size=train_pct, random_state=random_state)
    return train, test

def transform_scaler(train, test, scaler):
    """Apply an already-fitted ``scaler`` to ``train`` and ``test``.

    Each output is a new DataFrame holding ``scaler.transform(frame)``, with
    the column labels and index values taken from the corresponding input.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled)``.
    """
    def _scaled_copy(frame):
        # Rebuild a frame around the transformed values, then re-attach the
        # original row labels.
        values = scaler.transform(frame)
        rebuilt = pd.DataFrame(values, columns=frame.columns.values)
        return rebuilt.set_index([frame.index.values])

    return _scaled_copy(train), _scaled_copy(test)

# standard_scaler()
def standard_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``StandardScaler`` fitted on ``train`` only.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled, scaler)``; the two frames come from
        ``transform_scaler`` and ``scaler`` is the fitted scaler.
    """
    fitted = StandardScaler(copy=True, with_mean=True, with_std=True).fit(train)
    return (*transform_scaler(train, test, fitted), fitted)

# scale_inverse()
def scale_inverse(train_scaled, test_scaled, scaler):
    """Undo a scaling by applying ``scaler.inverse_transform`` to both frames.

    Each output is a new DataFrame holding the inverse-transformed values,
    with the column labels and index values taken from the corresponding
    scaled input.

    Returns
    -------
    tuple
        ``(train_unscaled, test_unscaled)``.
    """
    def _restore(scaled):
        values = scaler.inverse_transform(scaled)
        restored = pd.DataFrame(values, columns=scaled.columns.values)
        return restored.set_index([scaled.index.values])

    return _restore(train_scaled), _restore(test_scaled)

# uniform_scaler()
def uniform_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``QuantileTransformer`` fitted on ``train`` only.

    The transformer is configured with 100 quantiles, a ``'uniform'`` output
    distribution and ``random_state=123``.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled, scaler)``; the two frames come from
        ``transform_scaler`` and ``scaler`` is the fitted transformer.
    """
    fitted = QuantileTransformer(
        n_quantiles=100,
        output_distribution='uniform',
        random_state=123,
        copy=True,
    ).fit(train)
    return (*transform_scaler(train, test, fitted), fitted)

# gaussian_scaler()
def gaussian_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``PowerTransformer`` fitted on ``train`` only.

    The transformer uses the ``'yeo-johnson'`` method with
    ``standardize=False``.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled, scaler)``; the two frames come from
        ``transform_scaler`` and ``scaler`` is the fitted transformer.
    """
    fitted = PowerTransformer(
        method='yeo-johnson',
        standardize=False,
        copy=True,
    ).fit(train)
    return (*transform_scaler(train, test, fitted), fitted)
 
# min_max_scaler()
def min_max_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``MinMaxScaler`` fitted on ``train`` only.

    The scaler is configured with ``feature_range=(0, 1)``.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled, scaler)``; the two frames come from
        ``transform_scaler`` and ``scaler`` is the fitted scaler.
    """
    fitted = MinMaxScaler(copy=True, feature_range=(0,1)).fit(train)
    return (*transform_scaler(train, test, fitted), fitted)
 
# iqr_robust_scaler()
def iqr_robust_scaler(train, test):
    """Scale ``train`` and ``test`` with a ``RobustScaler`` fitted on ``train`` only.

    The scaler is configured with ``quantile_range=(25.0, 75.0)`` and both
    centering and scaling enabled.

    Returns
    -------
    tuple
        ``(train_scaled, test_scaled, scaler)``; the two frames come from
        ``transform_scaler`` and ``scaler`` is the fitted scaler.
    """
    fitted = RobustScaler(
        quantile_range=(25.0,75.0),
        copy=True,
        with_centering=True,
        with_scaling=True,
    ).fit(train)
    return (*transform_scaler(train, test, fitted), fitted)


In [4]:
import seaborn as sns
import matplotlib.pyplot as plt 

def object_subplots(df):
    """Draw one ``survived`` bar chart per low-cardinality object column.

    A column qualifies when its dtype is ``object`` and it has fewer than
    five distinct values. Each qualifying column gets its own subplot in a
    single row: a seaborn bar plot of ``survived`` by category, with a dashed
    grey horizontal line at the overall mean of ``survived``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``survived`` column.

    Returns
    -------
    None
        The figure is drawn as a side effect. Nothing is drawn when no column
        qualifies.
    """
    features = df.columns[(df.dtypes == object) & (df.nunique() < 5)]

    # plt.subplots rejects ncols=0, so there is nothing to do without
    # qualifying columns.
    if len(features) == 0:
        return

    # squeeze=False always returns a 2-D grid of axes; without it a single
    # qualifying column yields a bare Axes object that cannot be indexed.
    _, axes = plt.subplots(nrows=1, ncols=len(features), figsize=(16,5), squeeze=False)

    survival_rate = df.survived.mean()

    for axis, feature in zip(axes[0], features):
        # x and y are passed by keyword: seaborn 0.12+ no longer accepts them
        # positionally.
        sns.barplot(x=feature, y='survived', data=df, ax=axis, alpha=.5)
        axis.set_ylabel('Survival Rate')
        axis.axhline(survival_rate, ls='--', color='grey')
    return
