In [1]:
import pandas as pd

In [2]:
# Raw-file URL of the credit-approval dataset (crx.data) in the workshop's
# GitHub repository; the data itself is read in a later cell.
filename = (
    'https://raw.githubusercontent.com/PacktWorkshops/'
    'The-Data-Science-Workshop/master/Chapter15/Dataset/crx.data'
)



In [3]:
# Read the comma-separated file into a DataFrame. The file has no header row,
# so pandas labels the columns 0..15, and every "?" is parsed as a missing value.
credData = pd.read_csv(
    filename,
    sep=",",
    header=None,
    na_values="?",
)
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,+
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,+
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,+
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,+
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,+


In [4]:
# Recode the class label in column 15: '+' -> 1 and '-' -> 0.
#
# The mapping is applied to the whole column and the result is cast to int64
# explicitly, so the target is a genuine integer column instead of depending on
# pandas re-inferring the dtype after element-wise assignment into a string
# column. The identity entries (1 -> 1, 0 -> 0) make the cell safe to re-run.
# Any label outside the mapping becomes NaN and makes the int64 cast raise,
# so unexpected values are surfaced here rather than carried forward.
label_codes = {'+': 1, '-': 0, 1: 1, 0: 0}
credData[15] = credData[15].map(label_codes).astype('int64')
credData.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
0,b,30.83,0.0,u,g,w,v,1.25,t,t,1,f,g,202.0,0,1
1,a,58.67,4.46,u,g,q,h,3.04,t,t,6,f,g,43.0,560,1
2,a,24.5,0.5,u,g,q,h,1.5,t,f,0,f,g,280.0,824,1
3,b,27.83,1.54,u,g,w,v,3.75,t,t,5,t,g,100.0,3,1
4,b,20.17,5.625,u,g,w,v,1.71,t,f,0,f,s,120.0,0,1


In [5]:
# Keep only complete records: any row containing at least one missing value
# is removed. The result is a new frame; credData itself is left untouched.
newcred = credData.dropna(axis='index', how='any')
newcred.shape

(653, 16)

In [6]:
# Separating X and y variables
# X holds the predictors: every column except the class label in column 15,
# i.e. columns 0 through 14.
X = newcred.drop(columns=[15])
X.shape

(653, 15)

In [7]:
# y is the class label (column 15) for the same rows as X
y = newcred[15]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: 15, dtype: int64

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows as a test set; the fixed random_state makes the
# split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=123,
)

**Creating the preprocessing engine**

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [10]:
# Categorical columns are one-hot encoded. handle_unknown='ignore' makes the
# encoder tolerate categories at transform time that were absent during fit.
categorical_steps = [('onehot', OneHotEncoder(handle_unknown='ignore'))]
categorical_transformer = Pipeline(steps=categorical_steps)

In [11]:
# Numeric columns are standardised with StandardScaler.
numeric_steps = [('scaler', StandardScaler())]
numeric_transformer = Pipeline(steps=numeric_steps)

In [12]:
# Split the predictor columns by dtype so each group can be routed to its own
# transformer. Selecting on the generic 'number' kind (and its complement)
# instead of a hard-coded dtype list guarantees that every column of X lands in
# exactly one of the two groups, whatever integer/float width or string dtype
# pandas chose when reading the file.
numeric_features = X.select_dtypes(include='number').columns
categorical_features = X.select_dtypes(exclude='number').columns

In [13]:
from sklearn.compose import ColumnTransformer
# Route each group of columns to its own transformer: the numeric columns go
# through numeric_transformer and the categorical ones through
# categorical_transformer.
column_routes = [
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_routes)

**Dimensionality reduction after preprocessing, using a Pipeline**


In [14]:
# Importing PCA library
from sklearn.decomposition import PCA

In [15]:
# Chain the preprocessing step with PCA so that a single estimator both
# prepares the features and projects them onto 10 principal components.
estimator = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('dimred', PCA(n_components=10)),
    ]
)

In [16]:
# Fit the preprocessing + PCA pipeline on the training set only, then wrap the
# transformed training data in a DataFrame for inspection.
train_components = estimator.fit_transform(X_train)
Xtran_train = pd.DataFrame(train_components)

print(Xtran_train.shape)
Xtran_train.head()


(457, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-0.456911,0.857577,-1.231989,0.902396,1.604191,-0.284921,-0.595444,0.206836,0.027712,0.742267
1,-0.758102,-1.279315,1.162158,0.397572,0.031973,1.236864,0.353098,-0.020558,0.561482,0.613476
2,0.387754,-0.022255,-0.082482,-0.524931,0.0893,0.300113,-1.25766,-0.191124,-0.376516,-0.367365
3,-0.332061,-0.636192,0.825248,0.798001,0.435375,1.377995,-0.578766,0.030524,-0.900729,0.620234
4,-1.41278,-0.707406,0.607928,0.54958,1.582078,-0.11971,0.496112,0.597986,-0.133551,0.032972


In [17]:
# Apply the already-fitted pipeline to the test set (transform only, no
# refitting) and wrap the result in a DataFrame for inspection.
test_components = estimator.transform(X_test)
Xtran_test = pd.DataFrame(test_components)
print(Xtran_test.shape)
Xtran_test.head()

(196, 10)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,-1.299051,0.187772,-0.23137,0.112879,-0.484604,0.369499,0.28216,1.09115,-0.062456,0.077569
1,-1.494398,-0.200785,0.231369,-0.60963,1.235941,-1.063417,0.259277,0.779575,0.086378,0.07871
2,2.829701,-0.298786,-0.099139,0.24561,0.638466,0.991274,-0.769735,0.040185,-0.614251,0.164817
3,5.259748,-0.456795,0.789554,1.150056,-0.033996,0.487041,1.095085,-0.113758,0.515659,0.520806
4,-1.31073,-0.695854,0.14146,0.215672,-0.506067,0.058389,-0.324188,0.963671,0.032933,0.043535
