# Splitting and Scaling Walkthrough

## First let's import our libraries, including the wrangle.py file that we made

In [1]:
import wrangle
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PowerTransformer, MinMaxScaler, RobustScaler

## Now let's use the wrangle_telco function to get our data

In [2]:
# Load the telco data through the project's wrangle module (the selection and
# cleaning rules live in wrangle.py), then display it as a sanity check.
telco = wrangle.wrangle_telco()
telco

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.70,71,7904.25
1,0014-BMAQU,84.65,63,5377.80
2,0016-QLJIS,90.45,65,5957.90
3,0017-DINOC,45.20,54,2460.55
4,0017-IUDMW,116.80,72,8456.75
...,...,...,...,...
1690,9964-WBQDJ,24.40,71,1725.40
1691,9972-EWRJS,19.25,67,1372.90
1692,9975-GPKZU,19.75,46,856.50
1693,9993-LHIEB,67.85,67,4627.65


## Now let's work on making a function to split our data

### First let's break up our X and y and then make sure we can get a train_test_split() to work on it

In [3]:
# Features: the two predictor columns. Selecting with a list keeps X a DataFrame.
X = telco[['monthly_charges', 'tenure']]
X

Unnamed: 0,monthly_charges,tenure
0,109.70,71
1,84.65,63
2,90.45,65
3,45.20,54
4,116.80,72
...,...,...
1690,24.40,71
1691,19.25,67
1692,19.75,46
1693,67.85,67


In [4]:
# Target: the double brackets keep y as a one-column DataFrame rather than a Series.
y = telco[['total_charges']]
y

Unnamed: 0,total_charges
0,7904.25
1,5377.80
2,5957.90
3,2460.55
4,8456.75
...,...
1690,1725.40
1691,1372.90
1692,856.50
1693,4627.65


In [5]:
# Value handed to train_test_split as train_size (0.8 -> 80% of rows go to train).
train_pct = 0.8

# random_state is pinned so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_pct, random_state=13)

In [6]:
X_train

Unnamed: 0,monthly_charges,tenure
1133,80.30,63
687,104.00,67
448,84.20,72
282,115.85,56
869,113.15,68
...,...,...
750,23.95,69
1562,85.25,72
74,85.25,66
176,92.15,69


In [7]:
X_test

Unnamed: 0,monthly_charges,tenure
1624,25.40,68
811,55.30,72
1503,25.20,39
809,24.15,56
867,24.80,40
...,...,...
615,101.15,60
762,99.70,45
529,20.05,12
569,74.00,67


In [8]:
y_train

Unnamed: 0,total_charges
1133,4995.35
687,7039.05
448,5986.55
282,6567.90
869,7856.00
...,...
750,1713.10
1562,6083.10
74,5538.35
176,6480.90


In [9]:
y_test

Unnamed: 0,total_charges
1624,1620.20
811,3983.60
1503,987.95
809,1402.25
867,1024.70
...,...
615,6067.40
762,4634.35
529,264.55
569,4868.40


### Great so now let's make a function that does the same thing

In [10]:
def split_my_data(X, y, train_pct, random_state=13):
    """Split features and target into train and test sets.

    Thin wrapper around ``train_test_split`` that pins the seed so the split
    is reproducible between runs.

    Parameters
    ----------
    X : features to split.
    y : target to split; must be row-aligned with ``X``.
    train_pct : forwarded to ``train_test_split`` as ``train_size``
        (e.g. ``0.8`` for an 80/20 split).
    random_state : seed forwarded to ``train_test_split``. Defaults to 13,
        the value that used to be hard-coded here, so existing three-argument
        calls produce exactly the same split as before.

    Returns
    -------
    Whatever ``train_test_split`` returns for two inputs, in the order
    ``X_train, X_test, y_train, y_test``.
    """
    return train_test_split(X, y, train_size=train_pct, random_state=random_state)

In [11]:
# Same feature/target selection as before, under new names so the earlier X / y
# variables are left untouched.
telco_X = telco[['monthly_charges', 'tenure']]
telco_y = telco[['total_charges']]

# Same 0.8 train fraction as the manual split above, so the displayed frames
# below should match the earlier ones.
X_train, X_test, y_train, y_test = split_my_data(telco_X, telco_y, 0.8)

In [12]:
X_train

Unnamed: 0,monthly_charges,tenure
1133,80.30,63
687,104.00,67
448,84.20,72
282,115.85,56
869,113.15,68
...,...,...
750,23.95,69
1562,85.25,72
74,85.25,66
176,92.15,69


In [13]:
X_test

Unnamed: 0,monthly_charges,tenure
1624,25.40,68
811,55.30,72
1503,25.20,39
809,24.15,56
867,24.80,40
...,...,...
615,101.15,60
762,99.70,45
529,20.05,12
569,74.00,67


In [14]:
y_train

Unnamed: 0,total_charges
1133,4995.35
687,7039.05
448,5986.55
282,6567.90
869,7856.00
...,...
750,1713.10
1562,6083.10
74,5538.35
176,6480.90


In [15]:
y_test

Unnamed: 0,total_charges
1624,1620.20
811,3983.60
1503,987.95
809,1402.25
867,1024.70
...,...
615,6067.40
762,4634.35
529,264.55
569,4868.40


## Now let's work on making a function for standard scaler

### First let's make a standard scaler that works

In [16]:
# Fit on the training split only, so no information from the test split
# influences the scaling parameters.
scaler = StandardScaler()
scaler.fit(X_train)
# Wrap the transformed values in a DataFrame and re-attach X_train's column
# names and row index so the scaled rows can be matched back to the originals.
train_scaled = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns.values).set_index([X_train.index.values])
train_scaled

Unnamed: 0,monthly_charges,tenure
1133,0.568628,0.327217
687,1.251050,0.554027
448,0.680925,0.837539
282,1.592262,-0.069700
869,1.514517,0.610730
...,...,...
750,-1.053925,0.667432
1562,0.711159,0.837539
74,0.711159,0.497325
176,0.909839,0.667432


In [17]:
# Reuse the scaler fitted on X_train (no re-fit) to transform the test split,
# again restoring the original column names and row index.
test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns.values).set_index([X_test.index.values])
test_scaled

Unnamed: 0,monthly_charges,tenure
1624,-1.012174,0.610730
811,-0.151227,0.837539
1503,-1.017933,-1.033643
809,-1.048167,-0.069700
867,-1.029450,-0.976940
...,...,...
615,1.168987,0.157110
762,1.127235,-0.693428
529,-1.166223,-2.564610
569,0.387224,0.554027


### Great, now that that works, let's build some functions to make it work

#### Since we are repeating the process of making the scaled dataframes, let's make a function that does that.

In [18]:
def make_scaled_dataframe(scaler, data):
    """Apply an already-fitted scaler to ``data`` and return a labelled DataFrame.

    Parameters
    ----------
    scaler : any object exposing ``transform(data)`` (e.g. a fitted
        scikit-learn transformer).
    data : pandas DataFrame to transform.

    Returns
    -------
    pandas.DataFrame holding the transformed values, with the same column
    labels and row index as ``data``.
    """
    # Hand the labels straight to the constructor instead of round-tripping
    # them through ``.values`` + ``set_index``: that detour discarded the index
    # name and any index dtype metadata.
    return pd.DataFrame(scaler.transform(data), columns=data.columns, index=data.index)

In [19]:
test_scaled = make_scaled_dataframe(scaler, X_test)
test_scaled

Unnamed: 0,monthly_charges,tenure
1624,-1.012174,0.610730
811,-0.151227,0.837539
1503,-1.017933,-1.033643
809,-1.048167,-0.069700
867,-1.029450,-0.976940
...,...,...
615,1.168987,0.157110
762,1.127235,-0.693428
529,-1.166223,-2.564610
569,0.387224,0.554027


#### Great, now let's use it in the standard scaler function

In [20]:
def standard_scaler(train, test):
    """Fit a StandardScaler on ``train`` and scale both splits with it.

    The scaler only ever sees ``train`` during fitting; ``test`` is transformed
    with the parameters learned from ``train``.

    Returns
    -------
    (fitted scaler, scaled train DataFrame, scaled test DataFrame)
    """
    fitted = StandardScaler().fit(train)
    scaled_train, scaled_test = (
        make_scaled_dataframe(fitted, split) for split in (train, test)
    )
    return fitted, scaled_train, scaled_test

In [21]:
scaler, X_train_scaled, X_test_scaled = standard_scaler(X_train, X_test)

In [22]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [23]:
X_train_scaled

Unnamed: 0,monthly_charges,tenure
1133,0.568628,0.327217
687,1.251050,0.554027
448,0.680925,0.837539
282,1.592262,-0.069700
869,1.514517,0.610730
...,...,...
750,-1.053925,0.667432
1562,0.711159,0.837539
74,0.711159,0.497325
176,0.909839,0.667432


In [24]:
X_test_scaled

Unnamed: 0,monthly_charges,tenure
1624,-1.012174,0.610730
811,-0.151227,0.837539
1503,-1.017933,-1.033643
809,-1.048167,-0.069700
867,-1.029450,-0.976940
...,...,...
615,1.168987,0.157110
762,1.127235,-0.693428
529,-1.166223,-2.564610
569,0.387224,0.554027


## Now let's work on making a function that lets us reverse the scaling process

### First let's make it work outside the function

In [25]:
scaler

StandardScaler(copy=True, with_mean=True, with_std=True)

In [26]:
X_train_scaled

Unnamed: 0,monthly_charges,tenure
1133,0.568628,0.327217
687,1.251050,0.554027
448,0.680925,0.837539
282,1.592262,-0.069700
869,1.514517,0.610730
...,...,...
750,-1.053925,0.667432
1562,0.711159,0.837539
74,0.711159,0.497325
176,0.909839,0.667432


In [27]:
X_test_scaled

Unnamed: 0,monthly_charges,tenure
1624,-1.012174,0.610730
811,-0.151227,0.837539
1503,-1.017933,-1.033643
809,-1.048167,-0.069700
867,-1.029450,-0.976940
...,...,...
615,1.168987,0.157110
762,1.127235,-0.693428
529,-1.166223,-2.564610
569,0.387224,0.554027


In [28]:
# Undo the scaling with the same fitted scaler, then re-attach the column names
# and row index so the result can be compared side by side with X_train.
X_train_unscaled = pd.DataFrame(scaler.inverse_transform(X_train_scaled), columns=X_train_scaled.columns.values).set_index([X_train_scaled.index.values])
X_train_unscaled

Unnamed: 0,monthly_charges,tenure
1133,80.30,63.0
687,104.00,67.0
448,84.20,72.0
282,115.85,56.0
869,113.15,68.0
...,...,...
750,23.95,69.0
1562,85.25,72.0
74,85.25,66.0
176,92.15,69.0


In [29]:
X_train

Unnamed: 0,monthly_charges,tenure
1133,80.30,63
687,104.00,67
448,84.20,72
282,115.85,56
869,113.15,68
...,...,...
750,23.95,69
1562,85.25,72
74,85.25,66
176,92.15,69


In [30]:
# Same round trip for the test split; the output should match X_test
# (tenure is displayed as float after the inverse transform).
X_test_unscaled = pd.DataFrame(scaler.inverse_transform(X_test_scaled), columns=X_test_scaled.columns.values).set_index([X_test_scaled.index.values])
X_test_unscaled

Unnamed: 0,monthly_charges,tenure
1624,25.40,68.0
811,55.30,72.0
1503,25.20,39.0
809,24.15,56.0
867,24.80,40.0
...,...,...
615,101.15,60.0
762,99.70,45.0
529,20.05,12.0
569,74.00,67.0


In [31]:
X_test

Unnamed: 0,monthly_charges,tenure
1624,25.40,68
811,55.30,72
1503,25.20,39
809,24.15,56
867,24.80,40
...,...,...
615,101.15,60
762,99.70,45
529,20.05,12
569,74.00,67


### Now let's make a function to do it

In [32]:
def scale_inverse(scaler, train_scaled, test_scaled):
    """Reverse a scaling step for both the train and test DataFrames.

    Parameters
    ----------
    scaler : the fitted object that produced the scaled frames; must expose
        ``inverse_transform(data)``.
    train_scaled : scaled training DataFrame.
    test_scaled : scaled test DataFrame.

    Returns
    -------
    (train_unscaled, test_unscaled) : DataFrames in the original units, each
    carrying the column labels and row index of its own scaled input.
    """
    def _unscale(scaled):
        # Labels come from the frame being unscaled -- never from a notebook
        # global -- so the function works for any train/test pair.
        return pd.DataFrame(
            scaler.inverse_transform(scaled),
            columns=scaled.columns,
            index=scaled.index,
        )

    return _unscale(train_scaled), _unscale(test_scaled)

In [33]:
X_train_unscaled, X_test_unscaled = scale_inverse(scaler, X_train_scaled, X_test_scaled)
X_train_unscaled

Unnamed: 0,monthly_charges,tenure
1133,80.30,63.0
687,104.00,67.0
448,84.20,72.0
282,115.85,56.0
869,113.15,68.0
...,...,...
750,23.95,69.0
1562,85.25,72.0
74,85.25,66.0
176,92.15,69.0


In [34]:
X_train

Unnamed: 0,monthly_charges,tenure
1133,80.30,63
687,104.00,67
448,84.20,72
282,115.85,56
869,113.15,68
...,...,...
750,23.95,69
1562,85.25,72
74,85.25,66
176,92.15,69


In [35]:
X_test_unscaled

Unnamed: 0,monthly_charges,tenure
1624,25.40,68.0
811,55.30,72.0
1503,25.20,39.0
809,24.15,56.0
867,24.80,40.0
...,...,...
615,101.15,60.0
762,99.70,45.0
529,20.05,12.0
569,74.00,67.0


In [36]:
X_test

Unnamed: 0,monthly_charges,tenure
1624,25.40,68
811,55.30,72
1503,25.20,39
809,24.15,56
867,24.80,40
...,...,...
615,101.15,60
762,99.70,45
529,20.05,12
569,74.00,67


## Now let's make a function for a uniform scaler

### We should be able to reuse our format for the standard scaler and just change the scaler class to a uniform scaler

In [37]:
def uniform_scaler(train, test):
    """Fit a default QuantileTransformer on ``train`` and scale both splits.

    The transformer is built with no arguments, so it uses the library
    defaults. It is fitted on ``train`` only and then applied to both splits.

    Returns
    -------
    (fitted transformer, scaled train DataFrame, scaled test DataFrame)
    """
    quantile_tf = QuantileTransformer()
    quantile_tf.fit(train)
    return (
        quantile_tf,
        make_scaled_dataframe(quantile_tf, train),
        make_scaled_dataframe(quantile_tf, test),
    )

In [38]:
scaler, X_train_uniform_scaled, X_test_uniform_scaled = uniform_scaler(X_train, X_test)
scaler

QuantileTransformer(copy=True, ignore_implicit_zeros=False, n_quantiles=1000,
                    output_distribution='uniform', random_state=None,
                    subsample=100000)

In [39]:
X_train_uniform_scaled

Unnamed: 0,monthly_charges,tenure
1133,0.635229,0.461962
687,0.838839,0.566066
448,0.674675,1.000000
282,0.984182,0.335335
869,0.951149,0.603604
...,...,...
750,0.252874,0.645646
1562,0.695696,1.000000
74,0.695696,0.536036
176,0.768590,0.645646


In [40]:
X_test_uniform_scaled

Unnamed: 0,monthly_charges,tenure
1624,0.340841,0.603604
811,0.443367,1.000000
1503,0.324713,0.159660
809,0.259259,0.335335
867,0.296296,0.164665
...,...,...
615,0.827891,0.398899
762,0.815839,0.199700
529,0.149149,0.033033
569,0.573662,0.566066


### Great, since that works we can build our last three with the same shape of function

In [41]:
def gaussian_scaler(train, test):
    """Fit a Yeo-Johnson PowerTransformer on ``train`` and scale both splits.

    Only ``train`` is used for fitting; ``test`` is transformed with the
    parameters learned from ``train``.

    Returns
    -------
    (fitted transformer, scaled train DataFrame, scaled test DataFrame)
    """
    power_tf = PowerTransformer(method='yeo-johnson').fit(train)
    frames = [make_scaled_dataframe(power_tf, split) for split in (train, test)]
    return (power_tf, *frames)

In [42]:
# Bind the fitted transformer to `scaler` (it was misspelled `scalar`), matching
# the earlier cells, so that `scaler` always refers to the transformer that
# produced the most recently scaled frames.
scaler, X_train_gaussian_scaled, X_test_gaussian_scaled = gaussian_scaler(X_train, X_test)

In [43]:
def min_max_scaler(train, test):
    """Fit a default MinMaxScaler on ``train`` and scale both splits with it.

    Only ``train`` is used for fitting; ``test`` is transformed with the
    parameters learned from ``train``.

    Returns
    -------
    (fitted scaler, scaled train DataFrame, scaled test DataFrame)
    """
    range_scaler = MinMaxScaler().fit(train)

    def _rescale(frame):
        return make_scaled_dataframe(range_scaler, frame)

    return range_scaler, _rescale(train), _rescale(test)

In [44]:
# Bind the fitted transformer to `scaler` (it was misspelled `scalar`), matching
# the earlier cells, so that `scaler` always refers to the transformer that
# produced the most recently scaled frames.
scaler, X_train_min_max_scaled, X_test_min_max_scaled = min_max_scaler(X_train, X_test)

In [45]:
def iqr_robust_scaler(train, test):
    """Fit a default RobustScaler on ``train`` and scale both splits with it.

    Only ``train`` is used for fitting; ``test`` is transformed with the
    parameters learned from ``train``.

    Returns
    -------
    (fitted scaler, scaled train DataFrame, scaled test DataFrame)
    """
    robust = RobustScaler().fit(train)
    scaled_splits = tuple(
        make_scaled_dataframe(robust, split) for split in (train, test)
    )
    return (robust,) + scaled_splits

In [46]:
# Bind the fitted transformer to `scaler` (it was misspelled `scalar`), matching
# the earlier cells, so that `scaler` always refers to the transformer that
# produced the most recently scaled frames.
scaler, X_train_robust_scaled, X_test_robust_scaled = iqr_robust_scaler(X_train, X_test)