In [1]:
import pandas as pd
from io import StringIO
import sys


# Small in-memory CSV. The two empty fields (column C of the second data row,
# column D of the third data row) are parsed by pandas as NaN.
csv_data = '\n'.join([
    'A,B,C,D',
    '1.0,2.0,3.0,4.0',
    '5.0,6.0,,8.0',
    '10.0,11.0,12.0,',
])

# StringIO lets read_csv consume the string as if it were a file.
csv_buffer = StringIO(csv_data)
df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [2]:
# isnull() returns a DataFrame of Boolean values: False where a cell holds
# a value, True where the data is missing.
# Summing that mask column-wise gives the number of missing values per column.
missing_mask = df.isnull()
missing_mask.sum()


A    0
B    0
C    1
D    1
dtype: int64

In [3]:
# Access the underlying NumPy array via the 'values' attribute.
# Missing cells appear as nan in the resulting array.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [4]:
# Eliminating samples or features with missing values

# Drop every row (sample) that contains at least one missing value.
df.dropna(axis='index')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [5]:
# Drop every column (feature) that contains at least one missing value.
df.dropna(axis='columns')

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0


In [6]:
# Only drop rows in which every column is NaN
# (no such row exists here, so all rows are kept).
df.dropna(axis='index', how='all')

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [7]:
# Drop rows that have fewer than 4 real (non-NaN) values.
df.dropna(thresh=4)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0


In [8]:
# Only drop rows in which NaN appears in specific columns (here: 'C').
required_columns = ['C']
df.dropna(subset=required_columns)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [9]:
# Imputing missing values
# Inspect the raw values, including the nan entries that are to be imputed.
df.values

array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6., nan,  8.],
       [10., 11., 12., nan]])

In [10]:
# Impute missing values via the column mean.
#
# sklearn.preprocessing.Imputer was deprecated in scikit-learn 0.20 and
# removed in 0.22; sklearn.impute.SimpleImputer is its replacement.
# SimpleImputer always imputes column-wise (the old axis=0 behaviour) and has
# no `axis` option, and the missing-value marker is the float nan itself
# rather than the string 'NaN'.
import numpy as np
from sklearn.impute import SimpleImputer

imr = SimpleImputer(missing_values=np.nan, strategy='mean')
imr = imr.fit(df.values)                  # learn the per-column means
imputed_data = imr.transform(df.values)   # replace each nan by its column mean

imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [11]:
# Handling categorical data: nominal and ordinal features

import pandas as pd

# Toy dataset: 'color' is a nominal feature, 'size' an ordinal feature,
# 'price' a numeric feature, and 'classlabel' holds the class labels.
feature_rows = [
    ['green', 'M', 10.1, 'class1'],
    ['red', 'L', 13.5, 'class2'],
    ['blue', 'XL', 15.3, 'class1'],
]
df = pd.DataFrame(feature_rows,
                  columns=['color', 'size', 'price', 'classlabel'])

df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [12]:
# Mapping ordinal features
# Ordinal features are categorical values that can be sorted or ordered.
# There is no convenient function that derives the order automatically,
# so the mapping has to be defined manually.
size_mapping = dict(XL=3, L=2, M=1)

# NOTE: this overwrites the 'size' column. Re-running the cell would map the
# already-encoded integers again and turn them into NaN.
size_codes = df['size'].map(size_mapping)
df['size'] = size_codes
df



Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [13]:
# To transform the integers back to the original string representation,
# define a reverse-mapping dictionary (integer code -> size label).
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))
df['size'].map(inv_size_mapping)

0     M
1     L
2    XL
Name: size, dtype: object

In [15]:
# Encoding class labels

import numpy as np

# Build a dict that converts class labels from strings to integers.
# Class labels are not ordinal, so it does not matter which integer is
# assigned to a particular string label; here the unique labels returned by
# np.unique are simply numbered starting from 0.
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [16]:
# Convert the class labels from strings to integers using class_mapping.
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,0
1,red,2,13.5,1
2,blue,3,15.3,0


In [17]:
# Reverse the class label mapping: first inspect the (label, integer) pairs
# that are going to be inverted.
class_mapping.items()

dict_items([('class1', 0), ('class2', 1)])

In [18]:
# Swap keys and values to obtain an integer -> label dict, then use it to
# restore the original string labels in the DataFrame.
inv_class_mapping = dict(zip(class_mapping.values(), class_mapping.keys()))
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [23]:
# Alternatively, scikit-learn ships a convenient LabelEncoder class that
# achieves the same thing.
# fit_transform is just a shortcut for calling fit and transform separately.

from sklearn.preprocessing import LabelEncoder

class_le = LabelEncoder()
label_values = df['classlabel'].values
y = class_le.fit_transform(label_values)

y

array([0, 1, 0])

In [24]:
# Use the inverse_transform method to turn the integer class labels back
# into their original string representation.

class_le.inverse_transform(y)


  if diff:


array(['class1', 'class2', 'class1'], dtype=object)

In [25]:
# Performing one-hot encoding on nominal features
# First step: label-encode the nominal 'color' column (column 0 of X).

X = df[['color', 'size', 'price']].values

color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [27]:
# Although the color values don't come in any particular order, a learning
# algorithm will now assume that 'green' (1) is larger than 'blue' (0).
# That is misleading and the result may not be optimal.
# A workaround is one-hot encoding: create a new dummy feature for each
# unique value in the nominal feature column.

# Here the color feature is converted into three new features:
# blue, green, and red, using OneHotEncoder from sklearn.preprocessing.

from sklearn.preprocessing import OneHotEncoder

# The `categorical_features` argument of OneHotEncoder was deprecated in
# scikit-learn 0.20 and removed in 0.22. Instead, encode only the color
# column (kept 2-D via X[:, [0]]) and stack the untouched 'size' and 'price'
# columns back on the right, which yields the same column layout as before.
ohe = OneHotEncoder()
color_onehot = ohe.fit_transform(X[:, [0]].astype(int)).toarray()

np.hstack([color_onehot, X[:, 1:].astype(float)])

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [28]:
# By default, OneHotEncoder returns a sparse matrix from transform, which was
# converted into a regular (dense) NumPy array via toarray for the purpose of
# visualization.
# Sparse matrices are a more efficient way of storing large datasets, are
# supported by many scikit-learn functions, and are especially useful if an
# array contains a lot of zeros.
# Alternatively, the encoder can be initialized as
# OneHotEncoder(..., sparse_output=False) to return a regular NumPy array.
#
# Two API changes are accounted for here:
#   * `categorical_features` was removed in scikit-learn 0.22, so only the
#     color column is passed to the encoder and the remaining columns are
#     stacked back on afterwards;
#   * `sparse` was renamed to `sparse_output` in scikit-learn 1.2 (the old
#     name was removed in 1.4). On releases older than 1.2 use sparse=False.

ohe = OneHotEncoder(sparse_output=False)
color_onehot = ohe.fit_transform(X[:, [0]].astype(int))
np.hstack([color_onehot, X[:, 1:].astype(float)])

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [29]:
# Dummy features can also be created via one-hot encoding with the
# get_dummies function implemented in pandas.
# It only converts string columns and leaves all other columns unchanged.
dummy_input = df[['price', 'color', 'size']]
pd.get_dummies(dummy_input)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [30]:
# One-hot encoding introduces multicollinearity, which is bad for any method
# that requires matrix inversion.
# The solution is to remove one feature column from the one-hot encoded array.

# Multicollinearity guard in get_dummies: drop_first=True leaves out the
# first dummy column of each encoded feature.
dummy_input = df[['price', 'color', 'size']]
pd.get_dummies(dummy_input, drop_first=True)


Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


In [31]:
# Multicollinearity guard for the OneHotEncoder, done manually:
# drop the first one-hot column (the indicator for color code 0, 'blue').
#
# `categorical_features` was removed from OneHotEncoder in scikit-learn 0.22,
# so only the color column is encoded and the remaining 'size' and 'price'
# columns are stacked back on afterwards.

ohe = OneHotEncoder()
color_onehot = ohe.fit_transform(X[:, [0]].astype(int)).toarray()
np.hstack([color_onehot[:, 1:], X[:, 1:].astype(float)])



array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

In [40]:
# Partitioning a dataset into separate training and test sets
# A new dataset is prepared for this: the Wine dataset.
# After preprocessing it, different feature-selection techniques are explored
# to reduce the dimensionality.

# Using the pandas library, read the open-source Wine dataset directly from
# the UCI machine learning repository (requires network access).
WINE_DATA_URL = ('https://archive.ics.uci.edu/'
                 'ml/machine-learning-databases/'
                 'wine/wine.data')

# The file has no header row, so the column names are supplied by hand.
wine_columns = [
    'Class label', 'Alcohol', 'Malic acid', 'Ash',
    'Alcalinity of ash', 'Magnesium', 'Total phenols',
    'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins',
    'Color intensity', 'Hue', 'OD280/OD315 of diluted wines',
    'Proline',
]

df_wine = pd.read_csv(WINE_DATA_URL, header=None)
df_wine.columns = wine_columns

print('Class labels', np.unique(df_wine['Class label']))
df_wine.head()

Class labels [1 2 3]


Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [43]:
# The samples belong to one of three different classes (1, 2, 3), referring
# to three different types of grape grown in the same region in Italy but
# derived from different wine cultivars.

# A convenient way to randomly partition this dataset into separate test and
# training datasets is the train_test_split function from scikit-learn's
# model_selection submodule.

from sklearn.model_selection import train_test_split

# Column 0 holds the class label; all remaining columns are features.
X = df_wine.iloc[:, 1:].values
y = df_wine.iloc[:, 0].values

# 30% test data, 70% training data. stratify=y ensures that both training and
# test datasets have the same class proportions as the original dataset.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y)

In [None]:
#Bringing feat