In [1]:
import wrangle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, MinMaxScaler, RobustScaler

In [2]:
# Load the telco data through the project's wrangle module.
telco = wrangle.wrangle_telco()
# Bare expression on the last line so the notebook renders the frame.
telco

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.70,71,7904.25
1,0014-BMAQU,84.65,63,5377.80
2,0016-QLJIS,90.45,65,5957.90
3,0017-DINOC,45.20,54,2460.55
4,0017-IUDMW,116.80,72,8456.75
...,...,...,...,...
1690,9964-WBQDJ,24.40,71,1725.40
1691,9972-EWRJS,19.25,67,1372.90
1692,9975-GPKZU,19.75,46,856.50
1693,9993-LHIEB,67.85,67,4627.65


# Need to make a function to split the data

We first need to break up our X and y and make sure we can get our train_test_split() working

In [4]:
# Features: the two predictor columns. The list-of-columns selection keeps a DataFrame.
X = telco[['monthly_charges', 'tenure']]
X

Unnamed: 0,monthly_charges,tenure
0,109.70,71
1,84.65,63
2,90.45,65
3,45.20,54
4,116.80,72
...,...,...
1690,24.40,71
1691,19.25,67
1692,19.75,46
1693,67.85,67


In [5]:
# Target: total_charges, kept as a one-column DataFrame (double brackets).
y = telco[['total_charges']]
y

Unnamed: 0,total_charges
0,7904.25
1,5377.80
2,5957.90
3,2460.55
4,8456.75
...,...
1690,1725.40
1691,1372.90
1692,856.50
1693,4627.65


In [6]:
# Fraction of rows assigned to the training set.
train_pct = 0.8
# A fixed random_state keeps the shuffle, and therefore the split, reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_pct, random_state=13)

In [7]:
X_train

Unnamed: 0,monthly_charges,tenure
173,19.55,43
420,103.20,56
979,115.80,72
1171,20.20,69
796,101.30,57
...,...,...
742,20.40,44
1552,19.65,6
74,85.25,66
176,92.15,69


In [8]:
X_test

Unnamed: 0,monthly_charges,tenure
240,110.45,54
720,105.50,72
1517,20.15,50
1402,59.75,66
716,25.75,1
...,...,...
1118,60.40,67
400,90.10,71
1563,19.75,16
159,20.25,47


In [9]:
y_train

Unnamed: 0,total_charges
173,876.15
420,5873.75
979,8424.90
1171,1412.65
796,5779.60
...,...
742,905.55
1552,116.85
74,5538.35
176,6480.90


In [10]:
y_test

Unnamed: 0,total_charges
240,6077.75
720,7544.00
1517,970.85
1402,3996.80
716,25.75
...,...
1118,3953.70
400,6310.90
1563,284.35
159,1029.80


## Now let's make a function that does all of that in one run

In [11]:
def split_my_data(X, y, train_pct, random_state=13):
    """Split features and target into train and test subsets.

    Parameters
    ----------
    X : array-like
        Feature columns (a DataFrame in this notebook).
    y : array-like
        Target values, row-aligned with ``X``.
    train_pct : float
        Fraction of rows placed in the training set; forwarded to
        ``train_test_split`` as ``train_size``.
    random_state : int, default 13
        Seed for the shuffle. The default is the value that used to be
        hard-coded, so existing three-argument calls produce the same split.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` exactly as returned by
        ``train_test_split``.
    """
    return train_test_split(X, y, train_size=train_pct, random_state=random_state)

In [12]:
# Same feature/target selection as before, this time routed through the helper.
telco_X = telco[['monthly_charges', 'tenure']]
telco_y = telco[['total_charges']]

# 0.8 is the training fraction passed as train_pct.
X_train, X_test, y_train, y_test = split_my_data(telco_X, telco_y, 0.8)

In [14]:
X_train

Unnamed: 0,monthly_charges,tenure
173,19.55,43
420,103.20,56
979,115.80,72
1171,20.20,69
796,101.30,57
...,...,...
742,20.40,44
1552,19.65,6
74,85.25,66
176,92.15,69


In [15]:
X_test

Unnamed: 0,monthly_charges,tenure
240,110.45,54
720,105.50,72
1517,20.15,50
1402,59.75,66
716,25.75,1
...,...,...
1118,60.40,67
400,90.10,71
1563,19.75,16
159,20.25,47


In [16]:
y_train

Unnamed: 0,total_charges
173,876.15
420,5873.75
979,8424.90
1171,1412.65
796,5779.60
...,...
742,905.55
1552,116.85
74,5538.35
176,6480.90


In [17]:
y_test

Unnamed: 0,total_charges
240,6077.75
720,7544.00
1517,970.85
1402,3996.80
716,25.75
...,...
1118,3953.70
400,6310.90
1563,284.35
159,1029.80


# Exercises

Our scenario continues:

>As a customer analyst, I want to know who has spent the most money with us over their lifetime. I have monthly charges and tenure, so I think I will be able to use those two attributes as features to estimate total_charges. I need to do this within an average of $5.00 per customer.

Create `split_scale.py` that will contain the functions that follow. Each scaler function should create the object, fit and transform both train and test. They should return the scaler, train dataframe scaled, test dataframe scaled. Be sure your indices represent the original indices from train/test, as those represent the indices from the original dataframe. Be sure to set a random state where applicable for reproducibility!

1. `split_my_data(X, y, train_pct)`

In [4]:
def split_my_data(df, train_pct=0.70, seed=123):
    """Shuffle ``df`` and cut it into a train part and a test part.

    ``train_pct`` is the share of rows that goes to the train part and
    ``seed`` fixes the shuffle so repeated runs give the same split.
    Returns the pair ``(train, test)``.

    NOTE: this reuses the name of the earlier
    ``split_my_data(X, y, train_pct)`` and replaces it once this cell runs.
    """
    split_options = {"train_size": train_pct, "random_state": seed}
    train_part, test_part = train_test_split(df, **split_options)
    return train_part, test_part

In [5]:
# The wrangled frame is bound to `telco`; no `df` is defined in this notebook.
train, test = split_my_data(telco)

In [6]:
train.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
1469,8661-BOYNW,84.4,72,6096.45
163,0960-HUWBM,104.1,65,6700.05
392,2346-LOCWC,20.5,58,1191.4
1546,9114-DPSIA,81.0,72,5750.0
797,4891-NLUBA,61.45,61,3751.15


In [7]:
# Sanity check: the row counts should differ while the column count matches.
print(train.shape)
print(test.shape)

(1186, 4)
(509, 4)


2. `standard_scaler()`

In [8]:
# Double brackets pass a list of column names, so each result stays a DataFrame.
X_train = train[['tenure']]
X_test = test[['tenure']]
# The targets have to be selected from the split frames; a bare
# [['total_charges']] is only a nested list of strings, not data.
y_train = train[['total_charges']]
y_test = test[['total_charges']]

In [9]:
X_train.head() # confirming row numbers are random

Unnamed: 0,tenure
1469,72
163,65
392,58
1546,72
797,61


In [10]:
X_test.head()

Unnamed: 0,tenure
252,10
632,63
472,70
1029,63
910,71


In [11]:
def standard_scaler(X_train, X_test):
    """Standardize train and test with a StandardScaler fitted on train only.

    Returns the fitted scaler followed by the scaled train and test frames.
    Each scaled frame carries the column labels and row index of the
    unscaled frame it was built from.
    """
    fitted = StandardScaler()
    fitted.fit(X_train)

    train_values = fitted.transform(X_train)
    test_values = fitted.transform(X_test)

    scaled_train = pd.DataFrame(train_values, index=X_train.index, columns=X_train.columns)
    scaled_test = pd.DataFrame(test_values, index=X_test.index, columns=X_test.columns)
    return fitted, scaled_train, scaled_test

In [18]:
# Exercise the standard_scaler defined in this notebook. `split_scale` is never
# imported here, so `split_scale.standard_scaler` would raise a NameError.
scaler, X_train_scaled, X_test_scaled = standard_scaler(X_train, X_test)

In [13]:
# NOTE: this rebinds `scaler` to a new, unfitted StandardScaler, replacing
# whatever was previously bound to that name.
scaler = StandardScaler(copy=True, with_mean=True, with_std=True)

In [14]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

3. `scale_inverse()`

In [None]:
def scale_inverse(scaler, train_scaled, test_scaled):
    """Undo a scaling transformation on the train and test frames.

    Parameters
    ----------
    scaler : object
        Fitted scaler exposing ``inverse_transform``.
    train_scaled, test_scaled : pandas.DataFrame
        Frames previously scaled with ``scaler``.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train_unscaled, test_unscaled)``; each keeps the column labels
        and row index of the scaled frame it was computed from.
    """
    train_unscaled = pd.DataFrame(
        scaler.inverse_transform(train_scaled),
        columns=train_scaled.columns,
        index=train_scaled.index,
    )
    # The index must come from the `test_scaled` argument; reading it from a
    # notebook-level variable ties the result to whatever that global holds.
    test_unscaled = pd.DataFrame(
        scaler.inverse_transform(test_scaled),
        columns=test_scaled.columns,
        index=test_scaled.index,
    )
    return train_unscaled, test_unscaled

4. `uniform_scaler()`

In [None]:
def uniform_scaler(train, test, seed=123):
    """Map features to a uniform distribution with a QuantileTransformer.

    The transformer is fitted on ``train`` only and then applied to both
    frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Unscaled frames with the same columns.
    seed : int, default 123
        ``random_state`` for the transformer, so any subsampling done while
        fitting is reproducible.

    Returns
    -------
    tuple
        ``(scaler, train_uniform_scaled, test_uniform_scaled)``; the scaled
        frames keep the columns and row index of their unscaled inputs.
    """
    scaler = QuantileTransformer(output_distribution='uniform', random_state=seed)
    scaler.fit(train)
    # Built inline: no `make_scaled_dataframe` helper is defined in this notebook.
    train_uniform_scaled = pd.DataFrame(
        scaler.transform(train), columns=train.columns, index=train.index
    )
    test_uniform_scaled = pd.DataFrame(
        scaler.transform(test), columns=test.columns, index=test.index
    )

    return scaler, train_uniform_scaled, test_uniform_scaled

5. `gaussian_scaler()`

In [None]:
def gaussian_scaler(train, test):
    """Apply a Yeo-Johnson PowerTransformer fitted on ``train`` to both frames.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Unscaled frames with the same columns.

    Returns
    -------
    tuple
        ``(scaler, train_gaussian_scaled, test_gaussian_scaled)``; the scaled
        frames keep the columns and row index of their unscaled inputs.
    """
    scaler = PowerTransformer(method='yeo-johnson')
    scaler.fit(train)
    # Built inline: no `make_scaled_dataframe` helper is defined in this notebook.
    train_gaussian_scaled = pd.DataFrame(
        scaler.transform(train), columns=train.columns, index=train.index
    )
    test_gaussian_scaled = pd.DataFrame(
        scaler.transform(test), columns=test.columns, index=test.index
    )

    return scaler, train_gaussian_scaled, test_gaussian_scaled

6. `min_max_scaler()`

In [None]:
def min_max_scaler(train, test):
    """Rescale features with a MinMaxScaler fitted on ``train`` only.

    Because the minimum and maximum are learned from ``train``, values in
    ``test`` can fall outside the range seen for ``train``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Unscaled frames with the same columns.

    Returns
    -------
    tuple
        ``(scaler, train_min_max_scaled, test_min_max_scaled)``; the scaled
        frames keep the columns and row index of their unscaled inputs.
    """
    scaler = MinMaxScaler()
    scaler.fit(train)
    # Built inline: no `make_scaled_dataframe` helper is defined in this notebook.
    train_min_max_scaled = pd.DataFrame(
        scaler.transform(train), columns=train.columns, index=train.index
    )
    test_min_max_scaled = pd.DataFrame(
        scaler.transform(test), columns=test.columns, index=test.index
    )

    return scaler, train_min_max_scaled, test_min_max_scaled

7. `iqr_robust_scaler()`

In [None]:
def iqr_robust_scaler(train, test):
    """Scale features with a RobustScaler fitted on ``train`` only.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Unscaled frames with the same columns.

    Returns
    -------
    tuple
        ``(scaler, train_robust_scaled, test_robust_scaled)``; the scaled
        frames keep the columns and row index of their unscaled inputs.
    """
    scaler = RobustScaler()
    scaler.fit(train)
    # Built inline: no `make_scaled_dataframe` helper is defined in this notebook.
    train_robust_scaled = pd.DataFrame(
        scaler.transform(train), columns=train.columns, index=train.index
    )
    test_robust_scaled = pd.DataFrame(
        scaler.transform(test), columns=test.columns, index=test.index
    )

    return scaler, train_robust_scaled, test_robust_scaled