# Data Munging

## Data loading and preprocessing with pandas

### Fast and easy data loading

In [1]:
import pandas as pd
iris_filename = 'datasets-uci-iris.csv'
iris = pd.read_csv(iris_filename, sep=',', decimal='.', header=None,
        names= ['sepal_length', 'sepal_width', 'petal_length', 'petal_width','target'])

In [2]:
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
iris.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [4]:
iris.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,target
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa


In [5]:
iris.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'target'], dtype='object')

In [6]:
Y= iris['target']
Y

0         Iris-setosa
1         Iris-setosa
2         Iris-setosa
3         Iris-setosa
4         Iris-setosa
5         Iris-setosa
6         Iris-setosa
7         Iris-setosa
8         Iris-setosa
9         Iris-setosa
10        Iris-setosa
11        Iris-setosa
12        Iris-setosa
13        Iris-setosa
14        Iris-setosa
15        Iris-setosa
16        Iris-setosa
17        Iris-setosa
18        Iris-setosa
19        Iris-setosa
20        Iris-setosa
21        Iris-setosa
22        Iris-setosa
23        Iris-setosa
24        Iris-setosa
25        Iris-setosa
26        Iris-setosa
27        Iris-setosa
28        Iris-setosa
29        Iris-setosa
            ...      
120    Iris-virginica
121    Iris-virginica
122    Iris-virginica
123    Iris-virginica
124    Iris-virginica
125    Iris-virginica
126    Iris-virginica
127    Iris-virginica
128    Iris-virginica
129    Iris-virginica
130    Iris-virginica
131    Iris-virginica
132    Iris-virginica
133    Iris-virginica
134    Iri

In [7]:
X= iris[['sepal_length','sepal_width']]
X

Unnamed: 0,sepal_length,sepal_width
0,5.1,3.5
1,4.9,3.0
2,4.7,3.2
3,4.6,3.1
4,5.0,3.6
5,5.4,3.9
6,4.6,3.4
7,5.0,3.4
8,4.4,2.9
9,4.9,3.1


In [8]:
print(X.shape)

(150, 2)


In [9]:
print(Y.shape)

(150,)


### Dealing with problematic data

In [10]:
import pandas as pd
fake_dataset = pd.read_csv('a_loading_example_1.csv', sep=',')
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,20140910,80.0,32.0,40,1
1,20140911,100.0,50.0,36,2
2,20140912,102.0,55.0,46,1
3,20140913,60.0,20.0,35,3
4,20140914,60.0,,32,3
5,20140915,,57.0,42,2


In [11]:
fake_dataset = pd.read_csv('a_loading_example_1.csv', parse_dates=[0])
fake_dataset

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,,32,3
5,2014-09-15,,57.0,42,2


In [12]:
fake_dataset.fillna(50)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,50.0,32,3
5,2014-09-15,50.0,57.0,42,2


In [13]:
fake_dataset.fillna(-1)

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,-1.0,32,3
5,2014-09-15,-1.0,57.0,42,2


In [14]:
fake_dataset.fillna(fake_dataset.mean(axis=0))

Unnamed: 0,Date,Temperature_city_1,Temperature_city_2,Temperature_city_3,Which_destination
0,2014-09-10,80.0,32.0,40,1
1,2014-09-11,100.0,50.0,36,2
2,2014-09-12,102.0,55.0,46,1
3,2014-09-13,60.0,20.0,35,3
4,2014-09-14,60.0,42.8,32,3
5,2014-09-15,80.4,57.0,42,2


In [15]:
bad_dataset = pd.read_csv('a_loading_example_2.csv', error_bad_lines=False)
bad_dataset

b'Skipping line 4: expected 3 fields, saw 4\n'


Unnamed: 0,Val1,Val2,Val3
0,0,0,0
1,1,1,1
2,3,3,3


### Dealing with big datasets

In [16]:
import pandas as pd
iris_chunks = pd.read_csv(iris_filename, header=None,
              names=['C1', 'C2', 'C3', 'C4', 'C5'], chunksize=10)
for chunk in iris_chunks:
    print ('Shape:', chunk.shape)
    print (chunk,'\n')

Shape: (10, 5)
    C1   C2   C3   C4           C5
0  5.1  3.5  1.4  0.2  Iris-setosa
1  4.9  3.0  1.4  0.2  Iris-setosa
2  4.7  3.2  1.3  0.2  Iris-setosa
3  4.6  3.1  1.5  0.2  Iris-setosa
4  5.0  3.6  1.4  0.2  Iris-setosa
5  5.4  3.9  1.7  0.4  Iris-setosa
6  4.6  3.4  1.4  0.3  Iris-setosa
7  5.0  3.4  1.5  0.2  Iris-setosa
8  4.4  2.9  1.4  0.2  Iris-setosa
9  4.9  3.1  1.5  0.1  Iris-setosa 

Shape: (10, 5)
     C1   C2   C3   C4           C5
10  5.4  3.7  1.5  0.2  Iris-setosa
11  4.8  3.4  1.6  0.2  Iris-setosa
12  4.8  3.0  1.4  0.1  Iris-setosa
13  4.3  3.0  1.1  0.1  Iris-setosa
14  5.8  4.0  1.2  0.2  Iris-setosa
15  5.7  4.4  1.5  0.4  Iris-setosa
16  5.4  3.9  1.3  0.4  Iris-setosa
17  5.1  3.5  1.4  0.3  Iris-setosa
18  5.7  3.8  1.7  0.3  Iris-setosa
19  5.1  3.8  1.5  0.3  Iris-setosa 

Shape: (10, 5)
     C1   C2   C3   C4           C5
20  5.4  3.4  1.7  0.2  Iris-setosa
21  5.1  3.7  1.5  0.4  Iris-setosa
22  4.6  3.6  1.0  0.2  Iris-setosa
23  5.1  3.3  1.7  0.5  Ir

In [17]:
iris_iterator = pd.read_csv(iris_filename, header=None,
                names=['C1', 'C2', 'C3', 'C4', 'C5'], iterator=True)

In [18]:
print (iris_iterator.get_chunk(10).shape)

(10, 5)


In [19]:
print (iris_iterator.get_chunk(20).shape)

(20, 5)


In [20]:
piece = iris_iterator.get_chunk(2)
piece

Unnamed: 0,C1,C2,C3,C4,C5
30,4.8,3.1,1.6,0.2,Iris-setosa
31,5.4,3.4,1.5,0.4,Iris-setosa


In [21]:
import csv

In [22]:
with open(iris_filename, 'rt') as data_stream:
    # 'rt' mode
    for n, row in enumerate(csv.DictReader(data_stream,
        fieldnames = ['sepal_length', 'sepal_width','petal_length', 'petal_width', 'target'],
        dialect='excel')):
            if n== 0:
                print (n,row)
            else:
                break

0 OrderedDict([('sepal_length', '5.1'), ('sepal_width', '3.5'), ('petal_length', '1.4'), ('petal_width', '0.2'), ('target', 'Iris-setosa')])


In [23]:
with open(iris_filename, 'rt') as data_stream:
    for n, row in enumerate(csv.reader(data_stream,dialect='excel')):
            if n==0:
                print (row)
            else:
                break

['5.1', '3.5', '1.4', '0.2', 'Iris-setosa']


In [24]:
def batch_read(filename, batch=5):
    with open(filename, 'rt') as data_stream:
        batch_output = list()
        for n, row in enumerate(csv.reader(data_stream, dialect='excel')):
            if n > 0 and n % batch == 0:
                yield(np.array(batch_output))
                batch_output = list()
            batch_output.append(row)
        yield(np.array(batch_output))

In [25]:
import numpy as np
for batch_input in batch_read(iris_filename, batch=3):
    print (batch_input)
    break

[['5.1' '3.5' '1.4' '0.2' 'Iris-setosa']
 ['4.9' '3.0' '1.4' '0.2' 'Iris-setosa']
 ['4.7' '3.2' '1.3' '0.2' 'Iris-setosa']]


### Accessing other data formats

In [2]:
import pandas as pd
my_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':[1.0]*5,
                               'Col3': 1.0, 'Col4': 'Hello World!'})
my_own_dataset

Unnamed: 0,Col1,Col2,Col3,Col4
0,0,1.0,1.0,Hello World!
1,1,1.0,1.0,Hello World!
2,2,1.0,1.0,Hello World!
3,3,1.0,1.0,Hello World!
4,4,1.0,1.0,Hello World!


In [4]:
my_wrong_own_dataset = pd.DataFrame({'Col1': range(5), 'Col2':'string', 'Col3': range(2)})

ValueError: arrays must all be same length

In [5]:
my_own_dataset.dtypes

Col1      int64
Col2    float64
Col3    float64
Col4     object
dtype: object

In [6]:
my_own_dataset['Col1'] = my_own_dataset['Col1'].astype(float)
my_own_dataset.dtypes

Col1    float64
Col2    float64
Col3    float64
Col4     object
dtype: object

### Data preprocessing

In [None]:
mask_feature = iris['sepal_length'] > 6.0
mask_feature

In [None]:
mask_target = iris['target'] == 'Iris-virginica'
iris.loc[mask_target, 'target'] = 'New label'

In [None]:
iris['target'].unique()

In [None]:
grouped_targets_mean = iris.groupby(['target']).mean()
grouped_targets_mean

In [None]:
grouped_targets_var = iris.groupby(['target']).var()
grouped_targets_var

In [None]:
iris.sort_values(by='sepal_length').head()

In [None]:
# This is just an example, with no time_series data
# smooth_time_series = pd.rolling_mean(time_series, 5)

In [None]:
# This is just an example, with no time_series data
# median_time_series = pd.rolling_median(time_series, 5)

In [None]:
iris.apply(np.count_nonzero, axis=1).head()

In [None]:
iris.apply(np.count_nonzero, axis=0)

In [None]:
iris.applymap(lambda el:len(str(el))).head()

### Data selection

In [28]:
dataset = pd.read_csv('a_selection_example_1.csv')
dataset

Unnamed: 0,n,val1,val2,val3
0,100,10,10,C
1,101,10,20,C
2,102,10,30,B
3,103,10,40,B
4,104,10,50,A


In [29]:
dataset = pd.read_csv('a_selection_example_1.csv',index_col=0)
dataset

Unnamed: 0_level_0,val1,val2,val3
n,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
100,10,10,C
101,10,20,C
102,10,30,B
103,10,40,B
104,10,50,A


In [30]:
dataset['val3'][104]

'A'

In [31]:
dataset.loc[104, 'val3']

'A'

In [32]:
dataset.ix[104, 'val3']

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


'A'

In [33]:
dataset.ix[104, 2]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


'A'

In [34]:
dataset.iloc[4, 2]

'A'

In [35]:
dataset[['val3', 'val2']][0:2]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [36]:
dataset.loc[range(100, 102), ['val3', 'val2']]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [37]:
dataset.ix[range(100, 102), ['val3', 'val2']]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [38]:
dataset.ix[range(100, 102), [2, 1]]

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


In [39]:
dataset.iloc[range(2), [2,1]]

Unnamed: 0_level_0,val3,val2
n,Unnamed: 1_level_1,Unnamed: 2_level_1
100,C,10
101,C,20


## Working with categorical and text data

In [40]:
import pandas as pd
categorical_feature = pd.Series(['sunny', 'cloudy', 'snowy', 'rainy', 'foggy'])
mapping = pd.get_dummies(categorical_feature)
mapping

Unnamed: 0,cloudy,foggy,rainy,snowy,sunny
0,0,0,0,0,1
1,1,0,0,0,0
2,0,0,0,1,0
3,0,0,1,0,0
4,0,1,0,0,0


In [41]:
mapping['sunny']

0    1
1    0
2    0
3    0
4    0
Name: sunny, dtype: uint8

In [42]:
mapping['cloudy']

0    0
1    1
2    0
3    0
4    0
Name: cloudy, dtype: uint8

In [43]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
ohe = OneHotEncoder()
levels = ['sunny', 'cloudy', 'snowy', 'rainy', 'foggy']
fit_levs = le.fit_transform(levels)
ohe.fit([[fit_levs[0]], [fit_levs[1]], [fit_levs[2]], [fit_levs[3]],[fit_levs[4]]])
print (ohe.transform([le.transform(['sunny'])]).toarray())
print (ohe.transform([le.transform(['cloudy'])]).toarray())

[[0. 0. 0. 0. 1.]]
[[1. 0. 0. 0. 0.]]


### A special type of data – text

In [44]:
from sklearn.datasets import fetch_20newsgroups
categories = ['sci.med', 'sci.space']
twenty_sci_news = fetch_20newsgroups(categories=categories)

In [45]:
print(twenty_sci_news.data[0])

From: flb@flb.optiplan.fi ("F.Baube[tm]")
Subject: Vandalizing the sky
X-Added: Forwarded by Space Digest
Organization: [via International Space University]
Original-Sender: isu@VACATION.VENARI.CS.CMU.EDU
Distribution: sci
Lines: 12

From: "Phil G. Fraering" <pgf@srl03.cacs.usl.edu>
> 
> Finally: this isn't the Bronze Age, [..]
> please try to remember that there are more human activities than
> those practiced by the Warrior Caste, the Farming Caste, and the
> Priesthood.

Right, the Profiting Caste is blessed by God, and may 
 freely blare its presence in the evening twilight ..

-- 
* Fred Baube (tm)



In [46]:
 twenty_sci_news.filenames

array(['C:\\Users\\sandhyao\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\61116',
       'C:\\Users\\sandhyao\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.med\\58122',
       'C:\\Users\\sandhyao\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.med\\58903',
       ...,
       'C:\\Users\\sandhyao\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60774',
       'C:\\Users\\sandhyao\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.space\\60954',
       'C:\\Users\\sandhyao\\scikit_learn_data\\20news_home\\20news-bydate-train\\sci.med\\58911'],
      dtype='<U98')

In [47]:
print (twenty_sci_news.target[0])

1


In [48]:
print (twenty_sci_news.target_names[twenty_sci_news.target[0]])

sci.space


In [49]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
word_count = count_vect.fit_transform(twenty_sci_news.data)
word_count.shape

(1187, 25638)

In [50]:
print (word_count[0])

  (0, 10778)	1
  (0, 23849)	1
  (0, 9796)	1
  (0, 12716)	1
  (0, 18586)	1
  (0, 13384)	1
  (0, 5134)	1
  (0, 10785)	1
  (0, 15246)	1
  (0, 11330)	1
  (0, 5148)	1
  (0, 13318)	1
  (0, 18744)	1
  (0, 20110)	1
  (0, 18642)	1
  (0, 3808)	2
  (0, 10188)	1
  (0, 6017)	3
  (0, 24930)	1
  (0, 18474)	1
  (0, 23241)	1
  (0, 23129)	1
  (0, 3191)	1
  (0, 12362)	1
  (0, 15968)	1
  :	:
  (0, 7646)	1
  (0, 24547)	1
  (0, 24415)	1
  (0, 13359)	1
  (0, 20909)	1
  (0, 17235)	1
  (0, 24151)	1
  (0, 13158)	1
  (0, 24626)	1
  (0, 17217)	1
  (0, 8438)	1
  (0, 21686)	2
  (0, 5650)	3
  (0, 10713)	1
  (0, 3233)	1
  (0, 21382)	1
  (0, 23137)	7
  (0, 24461)	1
  (0, 22345)	1
  (0, 23381)	2
  (0, 4762)	2
  (0, 10341)	1
  (0, 17170)	1
  (0, 10501)	2
  (0, 10827)	2


In [51]:
word_list = count_vect.get_feature_names()
for n in word_count[0].indices:
    print ('Word "%s" appears %i times' % (word_list[n], word_count[0, n]))

Word "fred" appears 1 times
Word "twilight" appears 1 times
Word "evening" appears 1 times
Word "in" appears 1 times
Word "presence" appears 1 times
Word "its" appears 1 times
Word "blare" appears 1 times
Word "freely" appears 1 times
Word "may" appears 1 times
Word "god" appears 1 times
Word "blessed" appears 1 times
Word "is" appears 1 times
Word "profiting" appears 1 times
Word "right" appears 1 times
Word "priesthood" appears 1 times
Word "and" appears 2 times
Word "farming" appears 1 times
Word "caste" appears 3 times
Word "warrior" appears 1 times
Word "practiced" appears 1 times
Word "those" appears 1 times
Word "than" appears 1 times
Word "activities" appears 1 times
Word "human" appears 1 times
Word "more" appears 1 times
Word "are" appears 1 times
Word "there" appears 1 times
Word "that" appears 1 times
Word "remember" appears 1 times
Word "to" appears 1 times
Word "try" appears 1 times
Word "please" appears 1 times
Word "age" appears 1 times
Word "bronze" appears 1 times
Wor

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer
tf_vect = TfidfVectorizer(use_idf=False, norm='l1')
word_freq = tf_vect.fit_transform(twenty_sci_news.data)
word_list = tf_vect.get_feature_names()
for n in word_freq[0].indices:
    print ('Word "%s" has frequency %0.3f' % (word_list[n], word_freq[0, n]))

Word "fred" has frequency 0.011
Word "twilight" has frequency 0.011
Word "evening" has frequency 0.011
Word "in" has frequency 0.011
Word "presence" has frequency 0.011
Word "its" has frequency 0.011
Word "blare" has frequency 0.011
Word "freely" has frequency 0.011
Word "may" has frequency 0.011
Word "god" has frequency 0.011
Word "blessed" has frequency 0.011
Word "is" has frequency 0.011
Word "profiting" has frequency 0.011
Word "right" has frequency 0.011
Word "priesthood" has frequency 0.011
Word "and" has frequency 0.022
Word "farming" has frequency 0.011
Word "caste" has frequency 0.033
Word "warrior" has frequency 0.011
Word "practiced" has frequency 0.011
Word "those" has frequency 0.011
Word "than" has frequency 0.011
Word "activities" has frequency 0.011
Word "human" has frequency 0.011
Word "more" has frequency 0.011
Word "are" has frequency 0.011
Word "there" has frequency 0.011
Word "that" has frequency 0.011
Word "remember" has frequency 0.011
Word "to" has frequency 0.0

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vect = TfidfVectorizer() # Default: use_idf=True
word_tfidf = tfidf_vect.fit_transform(twenty_sci_news.data)
word_list = tfidf_vect.get_feature_names()
for n in word_tfidf[0].indices:
    print ('Word "%s" has tf-idf %0.3f' % (word_list[n], word_tfidf[0, n]))

Word "from" has tf-idf 0.043
Word "flb" has tf-idf 0.264
Word "optiplan" has tf-idf 0.132
Word "fi" has tf-idf 0.110
Word "baube" has tf-idf 0.264
Word "tm" has tf-idf 0.219
Word "subject" has tf-idf 0.022
Word "vandalizing" has tf-idf 0.103
Word "the" has tf-idf 0.158
Word "sky" has tf-idf 0.091
Word "added" has tf-idf 0.088
Word "forwarded" has tf-idf 0.096
Word "by" has tf-idf 0.120
Word "space" has tf-idf 0.098
Word "digest" has tf-idf 0.095
Word "organization" has tf-idf 0.022
Word "via" has tf-idf 0.083
Word "international" has tf-idf 0.081
Word "university" has tf-idf 0.045
Word "original" has tf-idf 0.085
Word "sender" has tf-idf 0.093
Word "isu" has tf-idf 0.099
Word "vacation" has tf-idf 0.099
Word "venari" has tf-idf 0.103
Word "cs" has tf-idf 0.055
Word "cmu" has tf-idf 0.081
Word "edu" has tf-idf 0.059
Word "distribution" has tf-idf 0.053
Word "sci" has tf-idf 0.067
Word "lines" has tf-idf 0.022
Word "12" has tf-idf 0.076
Word "phil" has tf-idf 0.102
Word "fraering" has tf

In [54]:
text_1 = 'we love data science'
text_2 = 'data science is hard'
documents = [text_1, text_2]
documents

['we love data science', 'data science is hard']

In [55]:
count_vect_1_grams = CountVectorizer(ngram_range=(1, 1),stop_words=[], min_df=1)
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print ("Word list = ", word_list)
print ("text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices])

Word list =  ['data', 'hard', 'is', 'love', 'science', 'we']
text_1 is described with ['science(1)', 'data(1)', 'love(1)', 'we(1)']


In [56]:
count_vect_1_grams = CountVectorizer(ngram_range=(2, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print ("Word list = ", word_list)
print ("text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices])

Word list =  ['data science', 'is hard', 'love data', 'science is', 'we love']
text_1 is described with ['data science(1)', 'love data(1)', 'we love(1)']


In [57]:
count_vect_1_grams = CountVectorizer(ngram_range=(1, 2))
word_count = count_vect_1_grams.fit_transform(documents)
word_list = count_vect_1_grams.get_feature_names()
print ("Word list = ", word_list)
print ("text_1 is described with", [word_list[n] + "(" +
str(word_count[0, n]) + ")" for n in word_count[0].indices])

Word list =  ['data', 'data science', 'hard', 'is', 'is hard', 'love', 'love data', 'science', 'science is', 'we', 'we love']
text_1 is described with ['data science(1)', 'love data(1)', 'we love(1)', 'science(1)', 'data(1)', 'love(1)', 'we(1)']


In [58]:
from sklearn.feature_extraction.text import HashingVectorizer
hash_vect = HashingVectorizer(n_features=1000)
word_hashed = hash_vect.fit_transform(twenty_sci_news.data)
word_hashed.shape

(1187, 1000)

### Scraping the Web 

In [59]:
import urllib.request
url = 'https://en.wikipedia.org/wiki/William_Shakespeare'
request = urllib.request.Request(url)
response = urllib.request.urlopen(request)

In [60]:
from bs4 import BeautifulSoup
soup = BeautifulSoup(response, 'html.parser')

In [61]:
soup.title

<title>William Shakespeare - Wikipedia</title>

In [62]:
section = soup.find_all(id='mw-normal-catlinks')[0]
for catlink in section.find_all("a")[1:]:
    print(catlink.get("title"), "->", catlink.get("href"))

Category:Sonnets by William Shakespeare -> /wiki/Category:Sonnets_by_William_Shakespeare
Category:William Shakespeare -> /wiki/Category:William_Shakespeare
Category:1564 births -> /wiki/Category:1564_births
Category:1616 deaths -> /wiki/Category:1616_deaths
Category:16th-century English male actors -> /wiki/Category:16th-century_English_male_actors
Category:English male stage actors -> /wiki/Category:English_male_stage_actors
Category:16th-century English writers -> /wiki/Category:16th-century_English_writers
Category:17th-century English writers -> /wiki/Category:17th-century_English_writers
Category:16th-century dramatists and playwrights -> /wiki/Category:16th-century_dramatists_and_playwrights
Category:17th-century English dramatists and playwrights -> /wiki/Category:17th-century_English_dramatists_and_playwrights
Category:16th-century English poets -> /wiki/Category:16th-century_English_poets
Category:Burials in Warwickshire -> /wiki/Category:Burials_in_Warwickshire
Category:Peopl

## Data processing with NumPy

### From lists to unidimensional arrays

In [63]:
import numpy as np
list_of_ints = [1,2,3]
Array_1 = np.array(list_of_ints)
Array_1

array([1, 2, 3])

In [64]:
Array_1[1]

2

In [65]:
type(Array_1)

numpy.ndarray

In [66]:
Array_1.dtype

dtype('int32')

### Controlling the memory size

In [67]:
import numpy as np
Array_1.nbytes

12

In [68]:
Array_1 = np.array(list_of_ints, dtype= 'int8')

In [69]:
Array_1b = Array_1.astype('float32')
Array_1b

array([1., 2., 3.], dtype=float32)

### Heterogeneous lists

In [70]:
import numpy as np

complex_list = [1,2,3] + [1.,2.,3.] + ['a','b','c']

Array_2 = np.array(complex_list[:3]) 
print ('complex_list[:3]', Array_2.dtype)

Array_2 = np.array(complex_list[:6]) 
print ('complex_list[:6]', Array_2.dtype)

Array_2 = np.array(complex_list) 
print ('complex_list[:] ',Array_2.dtype)

complex_list[:3] int32
complex_list[:6] float64
complex_list[:]  <U32


In [71]:
print (isinstance(Array_2[0],np.number))

False


### From lists to multidimensional arrays

In [72]:
import numpy as np

a_list_of_lists = [[1,2,3],[4,5,6],[7,8,9]]
Array_2D = np.array(a_list_of_lists )
Array_2D

array([[1, 2, 3],
       [4, 5, 6],
       [7, 8, 9]])

In [73]:
Array_2D[1,1]

5

In [74]:
a_list_of_lists_of_lists = [[[1,2],[3,4],[5,6]],[[7,8],[9,10],[11,12]]]
Array_3D = np.array(a_list_of_lists_of_lists)
Array_3D

array([[[ 1,  2],
        [ 3,  4],
        [ 5,  6]],

       [[ 7,  8],
        [ 9, 10],
        [11, 12]]])

In [75]:
Array_3D[0,2,0]

5

In [76]:
np.array({1:2,3:4,5:6}.items())

array(dict_items([(1, 2), (3, 4), (5, 6)]), dtype=object)

### Resizing arrays

In [77]:
import numpy as np

original_array = np.array([1, 2, 3, 4, 5, 6, 7, 8])
Array_a = original_array.reshape(4,2)
Array_b = original_array.reshape(4,2).copy()
Array_c = original_array.reshape(2,2,2)

original_array[0] = -1

In [78]:
Array_a

array([[-1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8]])

In [79]:
Array_c

array([[[-1,  2],
        [ 3,  4]],

       [[ 5,  6],
        [ 7,  8]]])

In [80]:
Array_b

array([[1, 2],
       [3, 4],
       [5, 6],
       [7, 8]])

In [81]:
original_array.resize(4,2)

In [82]:
original_array

array([[-1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8]])

In [83]:
original_array.shape = (4,2)
original_array

array([[-1,  2],
       [ 3,  4],
       [ 5,  6],
       [ 7,  8]])

### Arrays derived from NumPy functions

In [84]:
import numpy as np
ordinal_values = np.arange(9).reshape(3,3)
ordinal_values

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

In [85]:
np.arange(9)[::-1]

array([8, 7, 6, 5, 4, 3, 2, 1, 0])

In [86]:
np.random.randint(low=1,high=10,size=(3,3)).reshape(3,3)

array([[2, 3, 7],
       [8, 3, 8],
       [3, 4, 7]])

In [87]:
np.zeros((3,3))

array([[0., 0., 0.],
       [0., 0., 0.],
       [0., 0., 0.]])

In [88]:
np.ones((3,3))

array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [89]:
 np.eye(3)

array([[1., 0., 0.],
       [0., 1., 0.],
       [0., 0., 1.]])

In [90]:
fractions = np.linspace(start=0, stop=1, num=10)
fractions

array([0.        , 0.11111111, 0.22222222, 0.33333333, 0.44444444,
       0.55555556, 0.66666667, 0.77777778, 0.88888889, 1.        ])

In [91]:
growth = np.logspace(start=0, stop=1, num=10, base=10.0)
growth

array([ 1.        ,  1.29154967,  1.66810054,  2.15443469,  2.7825594 ,
        3.59381366,  4.64158883,  5.9948425 ,  7.74263683, 10.        ])

In [92]:
std_gaussian = np.random.normal(size=(3,3))
std_gaussian

array([[-0.82393328, -1.84652917,  1.34528427],
       [-1.24531382, -0.11917922,  0.52643134],
       [-0.18037   ,  0.90771631,  0.45041336]])

In [93]:
gaussian = np.random.normal(loc=1.0, scale= 3.0, size=(3,3))
gaussian

array([[ 2.12890952,  1.6062752 , -0.30181925],
       [ 0.43292372,  1.82224036, -0.29198829],
       [-3.27167516,  0.21175283, -3.694554  ]])

In [94]:
rand = np.random.uniform(low=0.0, high=1.0, size=(3,3))
rand

array([[0.50979009, 0.33400527, 0.6826128 ],
       [0.49119094, 0.83299134, 0.19607752],
       [0.42899812, 0.94369131, 0.61419907]])

### Getting an array directly from a file

In [95]:
import numpy as np
housing = np.loadtxt('regression-datasets-housing.csv',delimiter=',', dtype=float)

In [96]:
np.loadtxt('datasets-uci-iris.csv',delimiter=',',dtype=float)

ValueError: could not convert string to float: 'Iris-setosa'

### Extracting data from pandas

In [97]:
import pandas as pd
import numpy as np
housing_filename = 'regression-datasets-housing.csv'
housing = pd.read_csv(housing_filename, header=None)

In [98]:
housing_array = housing.values
housing_array.dtype

dtype('float64')

In [99]:
housing.dtypes

0     float64
1       int64
2     float64
3       int64
4     float64
5     float64
6     float64
7     float64
8       int64
9       int64
10      int64
11    float64
12    float64
13    float64
dtype: object

## NumPy's fast operations and computations

In [1]:
import numpy as np
a = np.arange(5).reshape(1,5)
a += 1
a*a

array([[ 1,  4,  9, 16, 25]])

In [2]:
a = np.arange(5).reshape(1,5) + 1
b = np.arange(5).reshape(5,1) + 1
a * b

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

In [3]:
a2 = np.array([1,2,3,4,5] * 5).reshape(5,5)
b2 = a2.T
a2 * b2

array([[ 1,  2,  3,  4,  5],
       [ 2,  4,  6,  8, 10],
       [ 3,  6,  9, 12, 15],
       [ 4,  8, 12, 16, 20],
       [ 5, 10, 15, 20, 25]])

In [4]:
print (a2)

[[1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]
 [1 2 3 4 5]]


In [5]:
np.sum(a2, axis=0)

array([ 5, 10, 15, 20, 25])

In [6]:
np.sum(a2, axis=1)

array([15, 15, 15, 15, 15])

In [7]:
%timeit -n 1 -r 3 [i+1.0 for i in range(10**6)]
%timeit -n 1 -r 3 np.arange(10**6)+1.0

1 loop, best of 3: 120 ms per loop
1 loop, best of 3: 6.54 ms per loop


In [8]:
import math
%timeit -n 1 -r 3 [math.sqrt(i) for i in range(10**6)]

1 loop, best of 3: 252 ms per loop


In [9]:
%timeit -n 1 -r 3 np.sqrt(np.arange(10**6))

1 loop, best of 3: 12.6 ms per loop


### Matrix operations

In [10]:
import numpy as np
M = np.arange(5*5, dtype=float).reshape(5,5)
M

array([[ 0.,  1.,  2.,  3.,  4.],
       [ 5.,  6.,  7.,  8.,  9.],
       [10., 11., 12., 13., 14.],
       [15., 16., 17., 18., 19.],
       [20., 21., 22., 23., 24.]])

In [11]:
coefs = np.array([1., 0.5, 0.5, 0.5, 0.5])
coefs_matrix = np.column_stack((coefs,coefs[::-1]))
print (coefs_matrix)

[[1.  0.5]
 [0.5 0.5]
 [0.5 0.5]
 [0.5 0.5]
 [0.5 1. ]]


In [13]:
np.dot(M,coefs)

array([ 5., 20., 35., 50., 65.])

In [14]:
np.dot(coefs,M)

array([25., 28., 31., 34., 37.])

In [15]:
np.dot(M,coefs_matrix)

array([[ 5.,  7.],
       [20., 22.],
       [35., 37.],
       [50., 52.],
       [65., 67.]])

### Slicing and indexing with NumPy arrays

In [16]:
import numpy as np
M = np.arange(100, dtype=int).reshape(10,10)

In [17]:
M

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
       [90, 91, 92, 93, 94, 95, 96, 97, 98, 99]])

In [18]:
# [start_index_included:end_index_exclude:steps]

In [19]:
M[2:9:2,:]

array([[20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
       [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
       [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
       [80, 81, 82, 83, 84, 85, 86, 87, 88, 89]])

In [20]:
M[2:9:2,5:]

array([[25, 26, 27, 28, 29],
       [45, 46, 47, 48, 49],
       [65, 66, 67, 68, 69],
       [85, 86, 87, 88, 89]])

In [21]:
 M[2:9:2,5::-1]

array([[25, 24, 23, 22, 21, 20],
       [45, 44, 43, 42, 41, 40],
       [65, 64, 63, 62, 61, 60],
       [85, 84, 83, 82, 81, 80]])

In [22]:
row_index = (M[:,0]>=20) & (M[:,0]<=80)
col_index = M[0,:]>=5
M[row_index,:][:,col_index]

array([[25, 26, 27, 28, 29],
       [35, 36, 37, 38, 39],
       [45, 46, 47, 48, 49],
       [55, 56, 57, 58, 59],
       [65, 66, 67, 68, 69],
       [75, 76, 77, 78, 79],
       [85, 86, 87, 88, 89]])

In [23]:
mask = (M>=20) & (M<=90) & ((M / 10.) % 1 >= 0.5)
M[mask]

array([25, 26, 27, 28, 29, 35, 36, 37, 38, 39, 45, 46, 47, 48, 49, 55, 56,
       57, 58, 59, 65, 66, 67, 68, 69, 75, 76, 77, 78, 79, 85, 86, 87, 88,
       89])

In [24]:
row_index = [1,1,2,7]
col_index = [0,2,4,8]

In [25]:
M[row_index,col_index]

array([10, 12, 24, 78])

In [26]:
M[row_index,:][:,col_index]

array([[10, 12, 14, 18],
       [10, 12, 14, 18],
       [20, 22, 24, 28],
       [70, 72, 74, 78]])

In [27]:
N = M[2:9:2,5:].copy()

In [28]:
N

array([[25, 26, 27, 28, 29],
       [45, 46, 47, 48, 49],
       [65, 66, 67, 68, 69],
       [85, 86, 87, 88, 89]])

### Stacking NumPy arrays

In [29]:
import numpy as np
dataset = np.arange(50).reshape(10,5)

In [30]:
dataset

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49]])

In [31]:
single_line = np.arange(1*5).reshape(1,5)
a_few_lines = np.arange(3*5).reshape(3,5)

In [34]:
np.vstack((dataset,single_line))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [ 0,  1,  2,  3,  4]])

In [35]:
np.vstack((dataset,a_few_lines))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [36]:
np.vstack((dataset,single_line,single_line))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49],
       [ 0,  1,  2,  3,  4],
       [ 0,  1,  2,  3,  4]])

In [37]:
bias = np.ones(10).reshape(10,1)
np.hstack((dataset,bias))

array([[ 0.,  1.,  2.,  3.,  4.,  1.],
       [ 5.,  6.,  7.,  8.,  9.,  1.],
       [10., 11., 12., 13., 14.,  1.],
       [15., 16., 17., 18., 19.,  1.],
       [20., 21., 22., 23., 24.,  1.],
       [25., 26., 27., 28., 29.,  1.],
       [30., 31., 32., 33., 34.,  1.],
       [35., 36., 37., 38., 39.,  1.],
       [40., 41., 42., 43., 44.,  1.],
       [45., 46., 47., 48., 49.,  1.]])

In [39]:
bias = np.ones(10)
np.column_stack((dataset,bias))

array([[ 0.,  1.,  2.,  3.,  4.,  1.],
       [ 5.,  6.,  7.,  8.,  9.,  1.],
       [10., 11., 12., 13., 14.,  1.],
       [15., 16., 17., 18., 19.,  1.],
       [20., 21., 22., 23., 24.,  1.],
       [25., 26., 27., 28., 29.,  1.],
       [30., 31., 32., 33., 34.,  1.],
       [35., 36., 37., 38., 39.,  1.],
       [40., 41., 42., 43., 44.,  1.],
       [45., 46., 47., 48., 49.,  1.]])

In [40]:
np.dstack((dataset*1,dataset*2,dataset*3))

array([[[  0,   0,   0],
        [  1,   2,   3],
        [  2,   4,   6],
        [  3,   6,   9],
        [  4,   8,  12]],

       [[  5,  10,  15],
        [  6,  12,  18],
        [  7,  14,  21],
        [  8,  16,  24],
        [  9,  18,  27]],

       [[ 10,  20,  30],
        [ 11,  22,  33],
        [ 12,  24,  36],
        [ 13,  26,  39],
        [ 14,  28,  42]],

       [[ 15,  30,  45],
        [ 16,  32,  48],
        [ 17,  34,  51],
        [ 18,  36,  54],
        [ 19,  38,  57]],

       [[ 20,  40,  60],
        [ 21,  42,  63],
        [ 22,  44,  66],
        [ 23,  46,  69],
        [ 24,  48,  72]],

       [[ 25,  50,  75],
        [ 26,  52,  78],
        [ 27,  54,  81],
        [ 28,  56,  84],
        [ 29,  58,  87]],

       [[ 30,  60,  90],
        [ 31,  62,  93],
        [ 32,  64,  96],
        [ 33,  66,  99],
        [ 34,  68, 102]],

       [[ 35,  70, 105],
        [ 36,  72, 108],
        [ 37,  74, 111],
        [ 38,  76, 114],
        [ 3

In [41]:
np.insert(dataset, 3, bias, axis=1)

array([[ 0,  1,  2,  1,  3,  4],
       [ 5,  6,  7,  1,  8,  9],
       [10, 11, 12,  1, 13, 14],
       [15, 16, 17,  1, 18, 19],
       [20, 21, 22,  1, 23, 24],
       [25, 26, 27,  1, 28, 29],
       [30, 31, 32,  1, 33, 34],
       [35, 36, 37,  1, 38, 39],
       [40, 41, 42,  1, 43, 44],
       [45, 46, 47,  1, 48, 49]])

In [42]:
np.insert(dataset, 3, dataset.T, axis=1)

array([[ 0,  1,  2,  0,  1,  2,  3,  4,  3,  4],
       [ 5,  6,  7,  5,  6,  7,  8,  9,  8,  9],
       [10, 11, 12, 10, 11, 12, 13, 14, 13, 14],
       [15, 16, 17, 15, 16, 17, 18, 19, 18, 19],
       [20, 21, 22, 20, 21, 22, 23, 24, 23, 24],
       [25, 26, 27, 25, 26, 27, 28, 29, 28, 29],
       [30, 31, 32, 30, 31, 32, 33, 34, 33, 34],
       [35, 36, 37, 35, 36, 37, 38, 39, 38, 39],
       [40, 41, 42, 40, 41, 42, 43, 44, 43, 44],
       [45, 46, 47, 45, 46, 47, 48, 49, 48, 49]])

In [43]:
np.insert(dataset, 3, np.ones(5), axis=0)

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [ 1,  1,  1,  1,  1],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29],
       [30, 31, 32, 33, 34],
       [35, 36, 37, 38, 39],
       [40, 41, 42, 43, 44],
       [45, 46, 47, 48, 49]])