In [69]:
import pandas as pd
from io import StringIO
from sklearn import preprocessing
from sklearn.compose import ColumnTransformer


In [26]:
cvs_data = \
'''A,B,C,D
1.0,2.0,3.0,4.0
5.0,6.0,,8.0
10.0,11.0,12.0,
7.0,4.0,9.0,8.0'''
df = pd.read_csv(StringIO(cvs_data))
df


Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,7.0,4.0,9.0,8.0


In [27]:
'''this gives us the number of missing values in each column'''
df.isnull().sum()

A    0
B    0
C    1
D    1
dtype: int64

In [28]:
df.dropna(axis=0)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
3,7.0,4.0,9.0,8.0


In [29]:
df.dropna(axis=1)

Unnamed: 0,A,B
0,1.0,2.0
1,5.0,6.0
2,10.0,11.0
3,7.0,4.0


In [30]:
'''there are some additional parameters for dropna like droping the rows or columns only if all the row/ columns
have na'''
df.dropna(how="all")

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,7.0,4.0,9.0,8.0


In [31]:
'''or you can have a threshold for how many less real value to drop'''
df.dropna(axis=1,thresh=3)

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,
3,7.0,4.0,9.0,8.0


In [32]:
'''only drop the rows if the missing data is in specific columns'''
df.dropna(subset=["D"])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
3,7.0,4.0,9.0,8.0


In [33]:
'''since sometimes we don't want to get rid of too many features or too many observations, we need to introduce 
some interpolation methods. like mean imputation aka replace the miss entries with the mean of entire feature column
we use sklearn to do that scikit learn'''
from sklearn.impute import SimpleImputer
import numpy as np
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
imputer_data =imputer.fit_transform(df.values)
imputer_data



array([[ 1.,  2.,  3.,  4.],
       [ 5.,  6.,  9.,  8.],
       [10., 11., 12.,  8.],
       [ 7.,  4.,  9.,  8.]])

In [34]:
""" this is the example dataset """
import pandas as pd 
df = pd.DataFrame([
    ['green','M',10.1,'class1'],
    ['red','L',13.5,'class2'],
    ['blue','XL',15.3,'class1']
])
df.columns = ['color','size','prize','classlabel']
df


Unnamed: 0,color,size,prize,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [35]:
size_mapping = {'XL':3,
               'L':2,
               'M':1}
invert_size_mapping = {v:k for k,v in size_mapping.items() }

df['size'] = df['size'].map(size_mapping)

df

Unnamed: 0,color,size,prize,classlabel
0,green,1,10.1,class1
1,red,2,13.5,class2
2,blue,3,15.3,class1


In [36]:
def invertMapping(mapping):
    mapping = {v:k for k,v in mapping.items()}
    return mapping
def mapping(panda_series):
    mapping = {k:v for v,k in enumerate(np.unique(panda_series))}
    return mapping

In [37]:

df['size']=df['size'].map(invert_size_mapping)
df

Unnamed: 0,color,size,prize,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [38]:
'''when we introduce numpy, then we can use the np enumerate function'''

classlabel_mapping = {label:idx for idx,label in enumerate(np.unique(df['classlabel']))}
classlabel_mapping

{'class1': 0, 'class2': 1}

In [39]:
df['classlabel']=df['classlabel'].map(classlabel_mapping)
df

Unnamed: 0,color,size,prize,classlabel
0,green,M,10.1,0
1,red,L,13.5,1
2,blue,XL,15.3,0


In [40]:
df['classlabel']=df['classlabel'].map(invertMapping(classlabel_mapping))
df

Unnamed: 0,color,size,prize,classlabel
0,green,M,10.1,class1
1,red,L,13.5,class2
2,blue,XL,15.3,class1


In [48]:
X = df[['color','prize','prize']].values
X


array([['green', 10.1, 10.1],
       ['red', 13.5, 13.5],
       ['blue', 15.3, 15.3]], dtype=object)

In [43]:
color_le = preprocessing.LabelEncoder()
X[:,0] = color_le.fit_transform(X[:,0])
X

array([[1, 10.1, 10.1],
       [2, 13.5, 13.5],
       [0, 15.3, 15.3]], dtype=object)

In [78]:
'''the problem with this LabelEncoder is that it automatically thinks 
blue is smallest and red is largest even though it is not ordinal. In order
to deal with this, we have to introduce one hot encoder'''


X


array([['green', 10.1, 10.1],
       ['red', 13.5, 13.5],
       ['blue', 15.3, 15.3]], dtype=object)

In [97]:
data = pd.get_dummies(df[['prize','color','size']]).values

In [96]:
data

Unnamed: 0,prize,color_blue,color_green,color_red,size_L,size_M,size_XL
0,10.1,0,1,0,0,1,0
1,13.5,0,0,1,1,0,0
2,15.3,1,0,0,0,0,1


In [98]:
df_wine = pd.read_csv('https://archive.ics.uci.edu/'
                             'ml/machine-learning-databases/'
                             'wine/wine.data', header=None)

In [102]:
df_wine.columns=['Class label', 'Alcohol','Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium','Total phenols', 'Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity', 'Hue','OD280/OD315 of diluted wines','Proline']

In [104]:
df_wine.head()

Unnamed: 0,Class label,Alcohol,Malic acid,Ash,Alcalinity of ash,Magnesium,Total phenols,Flavanoids,Nonflavanoid phenols,Proanthocyanins,Color intensity,Hue,OD280/OD315 of diluted wines,Proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


In [107]:
from sklearn.model_selection import train_test_split
X,y = df_wine.iloc[:,1:].values,df_wine.iloc[:,0].values


In [115]:
X
X.shape

(178, 13)

In [110]:
train_X,test_X,train_y,test_y = train_test_split(X,y,test_size=0.3,random_state=1,stratify=y)

In [116]:
train_X.shape
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
sd = StandardScaler()
mm = MinMaxScaler()
train_X_std = sd.fit_transform(train_X)
test_X_std = sd.fit_transform(test_X)
train_X_norm = mm.fit_transform(train_X)
test_X_norm = mm.fit_transform(test_X)
test_X_std

array([[ 5.08577188e-01,  1.03651802e+00,  1.34337248e+00,
         1.43887129e+00, -2.50026476e-01, -1.25320251e+00,
        -1.58657165e+00,  1.40189247e+00, -1.69424012e+00,
        -3.25611341e-01, -3.15520387e-01, -7.94492150e-01,
        -8.00019557e-01],
       [-6.61035048e-15, -2.09702278e-01,  1.45350727e+00,
         2.08934471e+00,  9.16006090e-01, -1.92242353e-01,
        -7.85674966e-01, -7.45687483e-01, -1.63055613e-02,
         8.98522466e-01, -1.54825120e+00, -1.84334950e+00,
        -1.11496855e+00],
       [ 1.30245377e+00,  3.88875975e-01,  4.62294173e-01,
         3.00542811e-01, -6.18247287e-01, -1.04101048e+00,
        -1.39688560e+00,  8.64997480e-01, -5.47928391e-01,
         2.05588534e+00, -1.50422510e+00, -1.29736896e+00,
        -3.35884199e-01],
       [-1.57534885e+00, -1.20079086e+00,  8.29410133e-01,
        -3.49930607e-01, -4.34136882e-01, -1.92242353e-01,
         1.94369664e-01, -1.10361747e+00,  1.29613830e+00,
        -9.04292777e-01, -7.55781393e

In [117]:
test_X_norm

array([[0.56622517, 0.46435845, 0.67105263, 0.86013986, 0.22619048,
        0.16877637, 0.0152439 , 0.78723404, 0.04545455, 0.21363636,
        0.3908046 , 0.29811321, 0.17333333],
       [0.43046358, 0.20570265, 0.69078947, 1.        , 0.45238095,
        0.44303797, 0.24695122, 0.27659574, 0.3986014 , 0.46363636,
        0.06896552, 0.02264151, 0.08888889],
       [0.7781457 , 0.3299389 , 0.51315789, 0.61538462, 0.1547619 ,
        0.22362869, 0.07012195, 0.65957447, 0.28671329, 0.7       ,
        0.08045977, 0.16603774, 0.29777778],
       [0.00993377, 0.        , 0.57894737, 0.47552448, 0.19047619,
        0.44303797, 0.5304878 , 0.19148936, 0.67482517, 0.09545455,
        0.27586207, 0.68301887, 0.17333333],
       [0.1589404 , 0.46435845, 0.33552632, 0.47552448, 0.11904762,
        0.71729958, 0.76829268, 0.68085106, 0.83566434, 0.02727273,
        1.        , 0.58867925, 0.072     ],
       [0.63245033, 0.10997963, 0.52631579, 0.38461538, 0.35714286,
        0.94936709, 0.97560

In [123]:
dataframe = pd.DataFrame([[1,3,4],
               [4,6,8]])
dataframe

Unnamed: 0,0,1,2
0,1,3,4
1,4,6,8


In [138]:
dataframe.columns=['age','diseases','score']
arr= dataframe.iloc[1:,1].values

arr

array([6])