# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Bringing in the Data

In [2]:
df = pd.read_csv('https://docs.google.com/spreadsheets/d/1jfU2oFSfhX1ywUbqETExDJuztO95r3h6pbWAm7xpwNY/gviz/tq?tqx=out:csv&sheet=users')
df.head()

Unnamed: 0,sex,length,diameter,height,whole_weight,shucked_weight,viscera_weight,shell_weight,rings
0,M,0.455,0.365,0.095,0.514,0.2245,0.101,0.15,15
1,M,0.35,0.265,0.09,0.2255,0.0995,0.0485,0.07,7
2,F,0.53,0.42,0.135,0.677,0.2565,0.1415,0.21,9
3,M,0.44,0.365,0.125,0.516,0.2155,0.114,0.155,10
4,I,0.33,0.255,0.08,0.205,0.0895,0.0395,0.055,7


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4177 entries, 0 to 4176
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   sex             4177 non-null   object 
 1   length          4177 non-null   float64
 2   diameter        4177 non-null   float64
 3   height          4177 non-null   float64
 4   whole_weight    4177 non-null   float64
 5   shucked_weight  4177 non-null   float64
 6   viscera_weight  4177 non-null   float64
 7   shell_weight    4177 non-null   float64
 8   rings           4177 non-null   int64  
dtypes: float64(7), int64(1), object(1)
memory usage: 293.8+ KB


* Data is clean. lots of numeric columns. Rings will be our target. Note, sex has 3 categories.

# Separating the data into X and y

In [4]:
X = df.drop(columns = ['rings'])
y = df['rings']

# Splitting the data with train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state = 42)
print(f'Training data set size: {X_train.shape}')
print(f'Test data set size: {X_test.shape}')
print(f'Training target set size: {y_train.shape}')
print(f'Test target set size: {y_test.shape}')

Training data set size: (3132, 8)
Test data set size: (1045, 8)
Training target set size: (3132,)
Test target set size: (1045,)


# Creating the column transformers

## Creating column selectors

In [6]:
cat_selector = make_column_selector(dtype_include = 'object')
num_selector = make_column_selector(dtype_include = 'number')

## Creating the StandardScaler

In [7]:
scaler = StandardScaler()

## Creating the One Hot Encoder

In [8]:
ohe = OneHotEncoder(handle_unknown='ignore')

## Matching the transformers

In [9]:
# Starting with the tuples:
num_tuple = (scaler, num_selector)
cat_tuple = (ohe, cat_selector)

# Matching to the Transformer:
col_transformer = make_column_transformer(num_tuple, cat_tuple, remainder = 'passthrough')

# Fitting and Transforming the Data

In [10]:
col_transformer.fit(X_train)

In [12]:
X_train_processed = col_transformer.transform(X_train)
X_test_processed = col_transformer.transform(X_test)

X_train_processed

array([[ 0.74929076,  0.46422584, -0.11886923, ...,  1.        ,
         0.        ,  0.        ],
       [-0.09025371, -0.14465442, -0.0016468 , ...,  1.        ,
         0.        ,  0.        ],
       [ 1.12708577,  1.22532616,  0.81891021, ...,  0.        ,
         0.        ,  1.        ],
       ...,
       [-0.13223093, -0.14465442, -0.35331409, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.41347297,  0.56570588, -0.47053652, ...,  0.        ,
         0.        ,  1.        ],
       [ 0.58138187,  0.66718592,  0.46724292, ...,  1.        ,
         0.        ,  0.        ]])

* I could be done here, but I'll create some dataframes from the resulting arrays to make the data easier to explore.

In [13]:
X_train_proc_df = pd.DataFrame(X_train_processed)
X_test_proc_df = pd.DataFrame(X_test_processed)

X_train_proc_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,0.749291,0.464226,-0.118869,0.457447,0.499098,0.743973,0.241135,1.0,0.0,0.0
1,-0.090254,-0.144654,-0.001647,-0.301655,-0.364269,-0.514040,-0.145838,1.0,0.0,0.0
2,1.127086,1.225326,0.818910,1.523852,1.692114,1.544526,1.179902,0.0,0.0,1.0
3,-0.593980,-0.449095,-1.056649,-0.651696,-0.617673,-0.738195,-0.647469,0.0,0.0,1.0
4,-0.258163,-0.093914,0.350020,-0.052352,-0.572823,-0.605532,0.785763,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...
3127,-0.300140,-0.093914,-0.587759,-0.539765,-0.476395,-0.449995,-0.683300,1.0,0.0,0.0
3128,1.211040,1.428286,1.170577,1.132090,0.808565,1.114515,1.144071,1.0,0.0,0.0
3129,-0.132231,-0.144654,-0.353314,-0.516361,-0.530215,-0.440846,-0.375155,0.0,0.0,1.0
3130,0.413473,0.565706,-0.470537,0.446253,0.689711,0.336834,-0.038345,0.0,0.0,1.0
