### Data processing
- Data errors exploration
    - categorical outliers
    - numerical outliers
- Data transformation: 
    - Numerical data normalization
    - Text: tokenization, stemming and lemmatization
- Feature engineering: 
    - Numerical
    - Categorical
    - Text


### Data transformation

#### numerical data normalization

- Min-max scaling with `MinMaxScaler`

In [1]:
import sklearn
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Toy matrix: four samples (rows) by two features (columns).
data = [
    [-1, 2],
    [-0.5, 6],
    [0, 10],
    [1, 18],
]

# No arguments given, so the default feature_range=(0, 1) applies.
scaler = MinMaxScaler()

In [3]:
# Learn the per-column minimum and maximum of `data`; nothing is rescaled yet.
scaler.fit(data)

In [4]:
# Per-feature minima and maxima recorded by fit().
scaler.data_min_, scaler.data_max_

(array([-1.,  2.]), array([ 1., 18.]))

In [7]:
# Rescale each column from its fitted [min, max] range onto [0, 1],
# i.e. (x - min) / (max - min) per feature.
scaler.transform(data)

array([[0.  , 0.  ],
       [0.25, 0.25],
       [0.5 , 0.5 ],
       [1.  , 1.  ]])

In [8]:
# Unseen values are scaled with the fitted min/max and are not clipped:
# 2 lies above the column-0 maximum of 1, so it maps to 1.5 (outside [0, 1]).
scaler.transform([[2, 2]])

array([[1.5, 0. ]])

- Self-practice with StandardScaler 
    - https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
   

In [9]:
# Checkpoint: all packages used up to this point installed and imported successfully.

#### Data transformation: text data tokenization, stemming/lemmatization

In [17]:
import nltk
from nltk import sent_tokenize
from nltk import word_tokenize

# Fetch the NLTK resources used by the tokenization and lemmatization cells.
# Re-running is cheap: packages that are already up to date are not downloaded again.
# 'punkt' serves older NLTK releases; recent releases (3.9+) load the Punkt
# tokenizer from 'punkt_tab' instead, and sent_tokenize/word_tokenize raise
# LookupError without it, so both are requested.
nltk.download('punkt')
nltk.download('punkt_tab')
# WordNet data is required by WordNetLemmatizer. Kept last so the value of the
# cell is still the result of this call.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /Users/qxlin/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/qxlin/nltk_data...


True

In [13]:
# Sentence tokenization: split the text into a list of sentences.
nltk.sent_tokenize("At eight o'clock on Thursday morning. Arthur didn't feel very good.")

["At eight o'clock on Thursday morning.", "Arthur didn't feel very good."]

In [14]:
# Word tokenization: punctuation becomes its own token and the contraction
# "didn't" is split into 'did' + "n't", while "o'clock" stays whole.
nltk.word_tokenize("At eight o'clock on Thursday morning. Arthur didn't feel very good.")

['At',
 'eight',
 "o'clock",
 'on',
 'Thursday',
 'morning',
 '.',
 'Arthur',
 'did',
 "n't",
 'feel',
 'very',
 'good',
 '.']

In [15]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

# A stem need not be a dictionary word: 'leaves' is reduced to 'leav'.
word = "leaves"
ps.stem(word)

'leav'

In [18]:
from nltk.stem import WordNetLemmatizer 
# Lemmatization maps a word to a dictionary form; called without a
# part-of-speech argument, 'leaves' comes back as 'leaf'.
lemmatizer = WordNetLemmatizer()
lemmatizer.lemmatize("leaves")

'leaf'

### Feature engineering

#### Feature engineering: categorical data 

In [19]:
from sklearn.preprocessing import OneHotEncoder

In [20]:
# handle_unknown='ignore': per the scikit-learn docs, a category that was not
# seen during fit is encoded as all zeros at transform time instead of raising.
oh_enc = OneHotEncoder(handle_unknown='ignore')

# One categorical column, three samples.
X = [
    ['Male'],
    ['Female'],
    ['Female'],
]

In [21]:
# Learn the distinct categories present in each column of X.
oh_enc.fit(X)

In [22]:
# transform() yields a sparse matrix; .toarray() densifies it for display.
# One output column per learned category, in oh_enc.categories_ order.
oh_enc.transform(X).toarray()

array([[0., 1.],
       [1., 0.],
       [1., 0.]])

In [23]:
# Learned categories, one array per input column; their order fixes the
# order of the one-hot output columns.
oh_enc.categories_

[array(['Female', 'Male'], dtype=object)]

In [24]:
# Encode new rows with the already-fitted encoder (no refit).
oh_enc.transform([['Female'], ['Male']]).toarray()

array([[1., 0.],
       [0., 1.]])

In [25]:
# Names of the output columns, built as <input feature name>_<category>.
oh_enc.get_feature_names_out(['gender'])

array(['gender_Female', 'gender_Male'], dtype=object)

In [26]:
# Map one-hot rows back to the original category labels.
oh_enc.inverse_transform([[0, 1], [1, 0]])

array([['Male'],
       ['Female']], dtype=object)

- Ordinal encoder

In [27]:
from sklearn.preprocessing import OrdinalEncoder
# OrdinalEncoder replaces each category with a numeric code, column by column.
od_enc = OrdinalEncoder()

In [28]:
# Two columns per row: a string category and an integer level.
X = [
    ['Male', 1],
    ['Female', 3],
    ['Female', 2],
]

In [29]:
# Learn the category list of each column; the mixed str/int rows are accepted as-is.
od_enc.fit(X)

In [30]:
# One array of learned categories per column; a value's position in its
# array is the ordinal code it is encoded to.
od_enc.categories_

[array(['Female', 'Male'], dtype=object), array([1, 2, 3], dtype=object)]

In [31]:
import numpy as np
# Gotcha: converting the mixed str/int nested list to a NumPy array coerces
# every element to a string, so the integers 1/2/3 become '1'/'2'/'3'.
np.array(X)

array([['Male', '1'],
       ['Female', '3'],
       ['Female', '2']], dtype='<U21')

In [32]:
# Fit and encode in one call: each value is replaced by its index in the
# category list of its column.
od_enc.fit_transform(X)

array([[1., 0.],
       [0., 2.],
       [0., 1.]])

In [33]:
# Encode new rows with the already-fitted category lists.
od_enc.transform([['Female', 3], ['Male', 1]])

array([[0., 2.],
       [1., 0.]])

#### Feature engineering: text data

- CountVectorizer
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html

In [34]:
from sklearn.feature_extraction.text import CountVectorizer

In [35]:
# Toy corpus: four short documents with heavily overlapping vocabulary.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [36]:
# Token-count vectorizer with all-default settings.
vectorizer = CountVectorizer()

In [37]:
# Learn the vocabulary from `corpus` and build the document-term count matrix.
# Note: this rebinds X, which previously held the encoder input list.
X = vectorizer.fit_transform(corpus)
# Exercise: explore other parameters: lowercase, stop_words, ngram_range, max_df, min_df, binary

In [38]:
# Vocabulary terms, in the same order as the columns of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [39]:
# Dense view of the counts: one row per document, one column per vocabulary term
# (e.g. 'document' occurs twice in the second document).
print(X.toarray())

[[0 1 1 1 0 0 1 0 1]
 [0 2 0 1 0 1 1 0 1]
 [1 0 0 1 1 0 1 1 1]
 [0 1 1 1 0 0 1 0 1]]


In [40]:
# (number of documents, vocabulary size)
X.shape

(4, 9)

- TfidfVectorizer
    - https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    

In [41]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [42]:
# Same four documents as in the CountVectorizer example, restated so this
# section can be read on its own.
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

In [43]:
# Rebinds `vectorizer`: from this cell on it is a TfidfVectorizer
# (default settings), no longer the CountVectorizer.
vectorizer = TfidfVectorizer()

- Self-practice: explore other parameter settings of TfidfVectorizer:
    - E.g., `lowercase`, `stop_words`, `ngram_range`, `max_df`, `min_df`, `binary`, `use_idf`, `smooth_idf`

In [44]:
# Rebinds X to the TF-IDF weighted document-term matrix for `corpus`.
X = vectorizer.fit_transform(corpus)

In [45]:
# Vocabulary terms, in the same order as the columns of X.
vectorizer.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [46]:
# (number of documents, vocabulary size)
print(X.shape)

(4, 9)


In [47]:
# Dense TF-IDF weights, one row per document. The first and last rows are
# identical because those two documents contain exactly the same words.
X.toarray()

array([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524],
       [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
        0.53864762, 0.28108867, 0.        , 0.28108867],
       [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
        0.        , 0.26710379, 0.51184851, 0.26710379],
       [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
        0.        , 0.38408524, 0.        , 0.38408524]])

- Word embedding

In [48]:
import gensim.downloader as api
# Load the pretrained 'word2vec-google-news-300' vectors via gensim's downloader.
# NOTE(review): presumably this downloads the model on first use and may be
# large and slow — confirm before relying on Restart & Run All.
wv = api.load('word2vec-google-news-300')



In [52]:
# Offset vector from 'man' to 'king', rounded to 2 decimals for display only.
np.round(wv['king'] - wv['man'],2)

array([-0.2 , -0.1 , -0.03,  0.22, -0.12,  0.01,  0.31, -0.21, -0.09,
        0.36, -0.27, -0.05, -0.09,  0.13, -0.07, -0.21,  0.13, -0.05,
        0.02,  0.23,  0.27,  0.11,  0.03,  0.41,  0.04,  0.01, -0.17,
       -0.2 , -0.03,  0.01, -0.06, -0.07,  0.19,  0.42, -0.23, -0.17,
       -0.21,  0.13, -0.  , -0.15,  0.11, -0.19,  0.07,  0.15,  0.12,
        0.11, -0.03,  0.02, -0.  , -0.02, -0.03,  0.03, -0.32,  0.2 ,
       -0.24,  0.09, -0.07, -0.01, -0.04, -0.01,  0.01,  0.14,  0.05,
        0.15,  0.01, -0.18, -0.07, -0.02, -0.25,  0.31,  0.04, -0.09,
        0.1 ,  0.05,  0.09, -0.11, -0.12,  0.11,  0.15, -0.08, -0.16,
        0.18, -0.11,  0.17,  0.28, -0.06, -0.01,  0.03, -0.02,  0.09,
        0.38, -0.11, -0.19, -0.14, -0.09,  0.06,  0.11, -0.07,  0.04,
       -0.08,  0.17,  0.21, -0.13, -0.24, -0.51,  0.31, -0.64, -0.01,
        0.14,  0.13,  0.24,  0.22,  0.01, -0.12,  0.33, -0.23,  0.08,
       -0.18,  0.12,  0.07,  0.12,  0.13,  0.06, -0.06,  0.19, -0.02,
        0.05, -0.02,

In [53]:
# Offset vector from 'woman' to 'queen', rounded to 2 decimals for display only.
np.round(wv['queen'] - wv['woman'], 2)

array([-0.24, -0.07,  0.03,  0.23,  0.01,  0.02,  0.04, -0.24, -0.31,
        0.01,  0.02, -0.04,  0.17,  0.12,  0.16,  0.02,  0.26,  0.01,
        0.  ,  0.12,  0.22,  0.18, -0.18,  0.29,  0.06,  0.18, -0.14,
       -0.17, -0.16,  0.04, -0.3 , -0.12,  0.14,  0.32, -0.24, -0.25,
       -0.21,  0.31,  0.  , -0.02,  0.26, -0.39,  0.11, -0.  ,  0.05,
       -0.02,  0.07,  0.11,  0.19,  0.12, -0.07,  0.1 , -0.  , -0.07,
       -0.09,  0.01, -0.25, -0.1 , -0.09, -0.11, -0.16,  0.1 , -0.  ,
        0.03,  0.13, -0.3 , -0.11, -0.03, -0.38,  0.19, -0.02, -0.  ,
        0.03,  0.02, -0.08, -0.1 ,  0.11,  0.06, -0.06, -0.21, -0.31,
        0.12,  0.1 ,  0.04,  0.52, -0.11, -0.09,  0.06,  0.08,  0.14,
        0.59, -0.03, -0.03, -0.04,  0.07,  0.03,  0.01, -0.08, -0.03,
        0.03,  0.15,  0.13,  0.13, -0.03, -0.62,  0.22, -0.65,  0.11,
       -0.12,  0.04,  0.26,  0.1 , -0.03,  0.07,  0.36,  0.09,  0.25,
       -0.16,  0.12, -0.07,  0.04, -0.14,  0.19, -0.19,  0.24, -0.14,
       -0.14,  0.13,

In [54]:
# Import the function explicitly. On some scikit-learn versions a bare
# `import sklearn` does not load the `sklearn.metrics` submodule, so the
# attribute chain `sklearn.metrics.pairwise...` only works when another import
# happened to load it first.
from sklearn.metrics.pairwise import cosine_similarity

# Compare the two offset vectors (king - man) and (queen - woman).
# Each vector is wrapped in a list so the function receives 2-D inputs;
# the result is a 1x1 similarity matrix.
cosine_similarity([wv['king'] - wv['man']], [wv['queen'] - wv['woman']])

array([[0.7580352]], dtype=float32)