In [1]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import numpy as np
import pandas as pd
np.set_printoptions(suppress=True)

# Load sample data of video views

In [2]:
views = pd.DataFrame([1295., 25., 19000., 5., 1., 300.], columns=['views'])
views

Unnamed: 0,views
0,1295.0
1,25.0
2,19000.0
3,5.0
4,1.0
5,300.0


# Standard Scaler $\frac{x_i - \mu}{\sigma}$

In [3]:
ss = StandardScaler()
views['zscore'] = ss.fit_transform(views[['views']])
views

Unnamed: 0,views,zscore
0,1295.0,-0.307214
1,25.0,-0.489306
2,19000.0,2.231317
3,5.0,-0.492173
4,1.0,-0.492747
5,300.0,-0.449877


In [4]:
vw = np.array(views['views'])
(vw[0] - np.mean(vw)) / np.std(vw)

-0.30721413311687235

# Min-Max Scaler $\frac{x_i - min(x)}{max(x) - min(x)}$

In [5]:
mms = MinMaxScaler()
views['minmax'] = mms.fit_transform(views[['views']])
views

Unnamed: 0,views,zscore,minmax
0,1295.0,-0.307214,0.068109
1,25.0,-0.489306,0.001263
2,19000.0,2.231317,1.0
3,5.0,-0.492173,0.000211
4,1.0,-0.492747,0.0
5,300.0,-0.449877,0.015738


In [6]:
(vw[0] - np.min(vw)) / (np.max(vw) - np.min(vw))

0.06810884783409653

# Robust Scaler $\frac{x_i - median(x)}{IQR_{(1,3)}(x)}$

In [7]:
rs = RobustScaler()
views['robust'] = rs.fit_transform(views[['views']])
views

Unnamed: 0,views,zscore,minmax,robust
0,1295.0,-0.307214,0.068109,1.092883
1,25.0,-0.489306,0.001263,-0.13269
2,19000.0,2.231317,1.0,18.178528
3,5.0,-0.492173,0.000211,-0.15199
4,1.0,-0.492747,0.0,-0.15585
5,300.0,-0.449877,0.015738,0.13269


In [8]:
quartiles = np.percentile(vw, (25., 75.))
iqr = quartiles[1] - quartiles[0]
(vw[0] - np.median(vw)) / iqr

1.0928829915560916

# Import necessary dependencies and settings(Feature Engineering)

In [9]:
import pandas as pd
import numpy as np
import re
import nltk

# Sample corpus of text documents

In [10]:
corpus = ['The sky is blue and beautiful.',
          'Love this blue and beautiful sky!',
          'The quick brown fox jumps over the lazy dog.',
          'The brown fox is quick and the blue dog is lazy!',
          'The sky is very blue and the sky is very beautiful today',
          'The dog is lazy but the brown fox is quick!'    
]
labels = ['weather', 'weather', 'animals', 'animals', 'weather', 'animals']
corpus = np.array(corpus)
corpus_df = pd.DataFrame({'Document': corpus, 
                          'Category': labels})
corpus_df = corpus_df[['Document', 'Category']]
corpus_df

Unnamed: 0,Document,Category
0,The sky is blue and beautiful.,weather
1,Love this blue and beautiful sky!,weather
2,The quick brown fox jumps over the lazy dog.,animals
3,The brown fox is quick and the blue dog is lazy!,animals
4,The sky is very blue and the sky is very beaut...,weather
5,The dog is lazy but the brown fox is quick!,animals


# Simple text pre-processing

In [11]:
wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')

def normalize_document(doc):
    # lower case and remove special characters\whitespaces
    doc = re.sub(r'[^a-zA-Z0-9\s]', '', doc, re.I)
    doc = doc.lower()
    doc = doc.strip()
    # tokenize document
    tokens = wpt.tokenize(doc)
    # filter stopwords out of document
    filtered_tokens = [token for token in tokens if token not in stop_words]
    # re-create document from filtered tokens
    doc = ' '.join(filtered_tokens)
    return doc

normalize_corpus = np.vectorize(normalize_document)

LookupError: 
**********************************************************************
  Resource [93mstopwords[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('stopwords')
  [0m
  Attempted to load [93mcorpora/stopwords[0m

  Searched in:
    - 'C:\\Users\\sanchita/nltk_data'
    - 'C:\\Users\\sanchita\\Anaconda3\\nltk_data'
    - 'C:\\Users\\sanchita\\Anaconda3\\share\\nltk_data'
    - 'C:\\Users\\sanchita\\Anaconda3\\lib\\nltk_data'
    - 'C:\\Users\\sanchita\\AppData\\Roaming\\nltk_data'
    - 'C:\\nltk_data'
    - 'D:\\nltk_data'
    - 'E:\\nltk_data'
**********************************************************************


In [12]:
norm_corpus = normalize_corpus(corpus)
norm_corpus

NameError: name 'normalize_corpus' is not defined

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(min_df=0., max_df=1.)
cv_matrix = cv.fit_transform(norm_corpus)
cv_matrix = cv_matrix.toarray()
cv_matrix

NameError: name 'norm_corpus' is not defined