# DIMENSIONALITY REDUCTION & DATA PROJECTION WITH SAGEMAKER

## <<< DATA PREPROCESSING >>>

### 1 - SET DATA DIRECTORY

In [1]:
DATA_DIR = "./data/"
dataset_abalone = DATA_DIR + "abalone.csv"
dataset_auto_mpg = DATA_DIR + "auto-mpg.csv"

### 2 - IMPORT LIBRARIES

In [2]:
# Import scientific's libraries
import io
import time
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

!pip install -U pandas-profiling
from pandas_profiling import ProfileReport

# Import Machine Learning's libraries
import sklearn as sk
from sklearn import manifold
from sklearn import datasets
from sklearn import decomposition
from sklearn import ensemble
from sklearn import discriminant_analysis
from sklearn import random_projection
from sklearn import neighbors
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

# Print ML's libraries version
print("\nSKLEARN VERSION\n", sk.__version__)
print("\nNUMPY VERSION\n", np.__version__)
print("\nPANDAS VERSION\n", pd.__version__)

Requirement already up-to-date: pandas-profiling in d:\iabd\softwares\anaconda3_2019-10\envs\tf2gpu\lib\site-packages (2.8.0)



SKLEARN VERSION
 0.22.1

NUMPY VERSION
 1.17.4

PANDAS VERSION
 1.0.5


### 3 - METADATA

In [3]:
df_auto_mpg = pd.read_csv(dataset_auto_mpg, names=["Mpg", 
                                                   "Cylinders", 
                                                   "Displacement", 
                                                   "Horsepower", 
                                                   "Weight", 
                                                   "Acceleration", 
                                                   "Model year", 
                                                   "Origin", 
                                                   "Car name"])

In [4]:
df_auto_mpg

Unnamed: 0,Mpg,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model year,Origin,Car name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino
...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger


In [5]:
df_auto_mpg['Pollute'] = np.nan
df_auto_mpg

Unnamed: 0,Mpg,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model year,Origin,Car name,Pollute
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu,
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320,
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite,
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst,
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino,
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140.0,86.0,2790.0,15.6,82,1,ford mustang gl,
394,44.0,4,97.0,52.0,2130.0,24.6,82,2,vw pickup,
395,32.0,4,135.0,84.0,2295.0,11.6,82,1,dodge rampage,
396,28.0,4,120.0,79.0,2625.0,18.6,82,1,ford ranger,


In [6]:
df_auto_mpg.astype({"Displacement": int, 
                    "Horsepower": int,
                    "Weight" : int,
                    "Acceleration": int})

Unnamed: 0,Mpg,Cylinders,Displacement,Horsepower,Weight,Acceleration,Model year,Origin,Car name,Pollute
0,18.0,8,307,130,3504,12,70,1,chevrolet chevelle malibu,
1,15.0,8,350,165,3693,11,70,1,buick skylark 320,
2,18.0,8,318,150,3436,11,70,1,plymouth satellite,
3,16.0,8,304,150,3433,12,70,1,amc rebel sst,
4,17.0,8,302,140,3449,10,70,1,ford torino,
...,...,...,...,...,...,...,...,...,...,...
393,27.0,4,140,86,2790,15,82,1,ford mustang gl,
394,44.0,4,97,52,2130,24,82,2,vw pickup,
395,32.0,4,135,84,2295,11,82,1,dodge rampage,
396,28.0,4,120,79,2625,18,82,1,ford ranger,


In [7]:
print(df_auto_mpg['Mpg'].median())

for i in df_auto_mpg['Mpg']:
    if i >= df_auto_mpg['Mpg'].median():
        df_auto_mpg['Pollute'] = 1

        print(i, 'Pollue', df_auto_mpg['Pollute'][i])
    elif i < df_auto_mpg['Mpg'].median():
        df_auto_mpg['Pollute'] = 0
        print(i, 'NE POLLUE PAS !', df_auto_mpg['Pollute'][i])
    else:
        print("ERROR!")
    

23.0
18.0 NE POLLUE PAS ! 0
15.0 NE POLLUE PAS ! 0
18.0 NE POLLUE PAS ! 0
16.0 NE POLLUE PAS ! 0
17.0 NE POLLUE PAS ! 0
15.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
15.0 NE POLLUE PAS ! 0
15.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
15.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
24.0 Pollue 1
22.0 NE POLLUE PAS ! 0
18.0 NE POLLUE PAS ! 0
21.0 NE POLLUE PAS ! 0
27.0 Pollue 1
26.0 Pollue 1
25.0 Pollue 1
24.0 Pollue 1
25.0 Pollue 1
26.0 Pollue 1
21.0 NE POLLUE PAS ! 0
10.0 NE POLLUE PAS ! 0
10.0 NE POLLUE PAS ! 0
11.0 NE POLLUE PAS ! 0
9.0 NE POLLUE PAS ! 0
27.0 Pollue 1
28.0 Pollue 1
25.0 Pollue 1
25.0 Pollue 1
19.0 NE POLLUE PAS ! 0
16.0 NE POLLUE PAS ! 0
17.0 NE POLLUE PAS ! 0
19.0 NE POLLUE PAS ! 0
18.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
14.0 NE POLLUE PAS ! 0
12.0 NE POLLUE PAS ! 0
13.0 NE POLLUE PAS ! 0
13.0 NE POLLUE PAS ! 0
18.0 NE POLLUE PAS ! 0
22.0 NE POLLUE PAS ! 0
19.0 NE POLLUE

TypeError: cannot do label indexing on <class 'pandas.core.indexes.range.RangeIndex'> with these indexers [17.5] of <class 'float'>

In [None]:
one = 0
zero = 0

for i in df_auto_mpg['Pollute']:
    if i == 1:
        one += 1
    elif i == 0:
        zero += 1
        
print("Pollute (1):", one)
print("Don't Pollute (0):", zero)

In [None]:
df_auto_mpg

In [None]:
profile_df_auto_mpg = ProfileReport(df_auto_mpg, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile_df_auto_mpg.to_widgets()

### 1 - Analyze Dataset

In [8]:
df_abalone = pd.read_csv(dataset_abalone, names=["Sex", 
                                                   "Length", 
                                                   "Diameter", 
                                                   "Height", 
                                                   "Whole weight", 
                                                   "Shucked weight", 
                                                   "Viscera weight", 
                                                   "Shell weight", 
                                                   "Rings"])

In [9]:
df_abalone

Unnamed: 0,Sex,Length,Diameter,Height,Whole weight,Shucked weight,Viscera weight,Shell weight,Rings
0,M,0.455,0.365,0.095,0.5140,0.2245,0.1010,0.1500,15
1,M,0.350,0.265,0.090,0.2255,0.0995,0.0485,0.0700,7
2,F,0.530,0.420,0.135,0.6770,0.2565,0.1415,0.2100,9
3,M,0.440,0.365,0.125,0.5160,0.2155,0.1140,0.1550,10
4,I,0.330,0.255,0.080,0.2050,0.0895,0.0395,0.0550,7
...,...,...,...,...,...,...,...,...,...
4172,F,0.565,0.450,0.165,0.8870,0.3700,0.2390,0.2490,11
4173,M,0.590,0.440,0.135,0.9660,0.4390,0.2145,0.2605,10
4174,M,0.600,0.475,0.205,1.1760,0.5255,0.2875,0.3080,9
4175,F,0.625,0.485,0.150,1.0945,0.5310,0.2610,0.2960,10


In [10]:
profile_df_abalone = ProfileReport(df_abalone, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile_df_abalone.to_widgets()

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=23.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### 2 - Remove High correlation

In [11]:
df_abalone = df_abalone.drop(['Length', 'Whole weight', 'Viscera weight', 'Shell weight'], axis=1)

We'll drop **Length, Whole weight, Viscera weight** and **Shell weight** because there are too much **high correlation** between these features in our dataset.

In [12]:
df_abalone

Unnamed: 0,Sex,Diameter,Height,Shucked weight,Rings
0,M,0.365,0.095,0.2245,15
1,M,0.265,0.090,0.0995,7
2,F,0.420,0.135,0.2565,9
3,M,0.365,0.125,0.2155,10
4,I,0.255,0.080,0.0895,7
...,...,...,...,...,...
4172,F,0.450,0.165,0.3700,11
4173,M,0.440,0.135,0.4390,10
4174,M,0.475,0.205,0.5255,9
4175,F,0.485,0.150,0.5310,10


In [13]:
profile_df_abalone = ProfileReport(df_abalone, title='Pandas Profiling Report', html={'style':{'full_width':True}})
profile_df_abalone.to_widgets()

HBox(children=(FloatProgress(value=0.0, description='Summarize dataset', max=19.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=0.0, description='Generate report structure', max=1.0, style=ProgressStyle(…




HBox(children=(FloatProgress(value=0.0, description='Render widgets', max=1.0, style=ProgressStyle(description…

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

### 4 - Split dataset in matrix (x) & vector (y)

In [15]:
X = df_abalone.drop(['Rings'], axis=1)
y = df_abalone['Rings']

In [16]:
X

Unnamed: 0,Sex,Diameter,Height,Shucked weight
0,M,0.365,0.095,0.2245
1,M,0.265,0.090,0.0995
2,F,0.420,0.135,0.2565
3,M,0.365,0.125,0.2155
4,I,0.255,0.080,0.0895
...,...,...,...,...
4172,F,0.450,0.165,0.3700
4173,M,0.440,0.135,0.4390
4174,M,0.475,0.205,0.5255
4175,F,0.485,0.150,0.5310


In [17]:
y

0       15
1        7
2        9
3       10
4        7
        ..
4172    11
4173    10
4174     9
4175    10
4176    12
Name: Rings, Length: 4177, dtype: int64

### 3 - Label encode: Sex (var categorical)

In [24]:
labelencoder_X = LabelEncoder()
print(df_abalone.iloc[:, 0])

X[:, 0] = labelencoder_X.fit_transform(X[:, 0])
# onehotencoder = OneHotEncoder(categorical_features = [3])
# X = onehotencoder.fit_transform(X).toarray()
# X = X[:, 1:]

0       M
1       M
2       F
3       M
4       I
       ..
4172    F
4173    M
4174    M
4175    F
4176    M
Name: Sex, Length: 4177, dtype: object


TypeError: '(slice(None, None, None), 0)' is an invalid key

## <<< MACHINE LEARNING >>>

### 1 - Principle Component Analysis (PCA)

In [None]:
%%time
X_pca = decomposition.TruncatedSVD(n_components = 2).fit_transform(X)

# Attach the label for each 2-d data point
X_pca = np.vstack((X_pca.T, Y)).T

# Start time for processing
start_time = time.time()

dataset_pca = pd.DataFrame(X_pca, columns = ['First_Component',
                                             'Second_Component',
                                             'Churn'])
sns.FacetGrid(data = dataset_pca, hue = 'Churn', height = 6)\
   .map(plt.scatter, 'First_Component', 'Second_Component')\
   .add_legend()

print("--- Learning time : %s seconds ---\n" % (time.time() - start_time))