# Deep Learning for NLP


## Imports/Installations required:
1. NUMPY
2. PANDAS
3. SCIPY
4. TENSORFLOW
5. KERAS
6. SPACY
7. NLTK
8. TEXTBLOB
9. GENSIM
10. PYTORCH

## Numpy

In [1]:
import numpy as np

In [2]:
a = np.array([1,2,5,3], float)  # numpy array with float variable

In [3]:
a

array([1., 2., 5., 3.])

In [4]:
print(type(a))  # Type of variable

<class 'numpy.ndarray'>


In [5]:
# operations on array
a[2] = 8  # the value in the 2 index has changed to 8
print(a)

[1. 2. 8. 3.]


In [6]:
b = np.array([[1,3,2,4],[6,7,8,5]], float) # 2d numpy array 

In [7]:
b[1,1] # second element of second array

7.0

In [8]:
b[0,-1] # fetches last element of the first array

4.0

In [9]:
print(a.shape) # shape of array a

(4,)


In [10]:
print(b.shape) #shape of array with its tuple 

(2, 4)


In [11]:
a.dtype  # the type of value stored

dtype('float64')

In [12]:
b.dtype

dtype('float64')

In [13]:
c =  np.array([1,4,32,45,2])  # int variable

In [14]:
c

array([ 1,  4, 32, 45,  2])

In [15]:
type(c)

numpy.ndarray

In [16]:
c.dtype

dtype('int32')

In [17]:
c.shape

(5,)

In [18]:
print(len(a)) # length of a 
print(len(b)) # length of b
print(len(c)) # length of c

4
2
5


In [19]:
# membership test: `in` yields a boolean
8 in c  # False since 8 is not in c

False

In [20]:
32 in c # True since 32 is in c

True

In [21]:
# reshape demo: start from a 1-D float array holding 0..11 (reshape turns 1-D into 2-D)
d = np.arange(12, dtype=float)

In [22]:
print(d) # element with the range of 12

[ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11.]


In [23]:
print(d.shape)

(12,)


In [24]:
print('----')

----


In [25]:
d = d.reshape((2,6)) # the reshaping of d

In [26]:
d 

array([[ 0.,  1.,  2.,  3.,  4.,  5.],
       [ 6.,  7.,  8.,  9., 10., 11.]])

In [27]:
d.shape

(2, 6)

In [28]:
d = d.reshape((3,4))
print(d)
d.shape

[[ 0.  1.  2.  3.]
 [ 4.  5.  6.  7.]
 [ 8.  9. 10. 11.]]


(3, 4)

In [29]:
d.fill(1) # the whole array gets filled with 1
print(d)

[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [30]:
# 2x2 float matrix holding 0..3, built in one chained expression
e = np.arange(4, dtype=float).reshape((2, 2))
e

array([[0., 1.],
       [2., 3.]])

In [31]:
e.transpose() # creates transpose of arrays, not done in place
e

array([[0., 1.],
       [2., 3.]])

In [32]:
e.flatten() # flattens the whole array, not done in place 

array([0., 1., 2., 3.])

In [33]:
# Concatenation of 2 or more arrays
p = np.array([1,2], float)
q = np.array([3,4,5,6], float)
r = np.concatenate((p,q))
print(r)

[1. 2. 3. 4. 5. 6.]


In [34]:
print(r)

[1. 2. 3. 4. 5. 6.]


In [35]:
print(r.shape)

(6,)


In [36]:
# newaxis: increases the dimensionality of the array
k = np.array([1,2,3,4,5,2,45,2,3,2], float)
              
k[:,np.newaxis].shape

(10, 1)

In [37]:
# Arithmetic (addition, subtraction, multiplication) is done element-wise; `*` is not matrix multiplication.
# If the shapes of the arrays do not match but are compatible, the smaller one is repeated (broadcast)
# to perform the desired operation.

In [38]:
a1 = np.array([[1,2],[3,4],[5,6]], float)
a2 = np.array([-1,5], float)
a3 = a1+a2
print(a3)

[[ 0.  7.]
 [ 2.  9.]
 [ 4. 11.]]


In [39]:
# numpy and use of sum, prod, var, mean, sort, unique, argmax, argmin, std

a4 = np.array([[0,2],[4,-1],[3,7],[8,4]], float)
print(a4.mean(axis=0))   # Mean of elements column wise

[3.75 3.  ]


In [40]:
a4.mean(axis=1)   # Mean of elements row wise

array([1. , 1.5, 5. , 6. ])

In [41]:
a5 = np.array([1, 3, 2], float)
# Element-wise reciprocal of a5; entries equal to 0 (none here) are kept as they are.
np.where(a5 != 0, 1 / a5, a5)

array([1.        , 0.33333333, 0.5       ])

In [42]:
np.random.rand(2,3)  # random elements are pulled

array([[0.37392302, 0.9647317 , 0.21134717],
       [0.90830714, 0.10549539, 0.59587899]])

## Pandas

In [43]:
import pandas as pd

Pandas is used for data analysis and for preprocessing datasets. DataFrame and Series are its two major data structures. A Series is a 1-dimensional indexed array, and a DataFrame is a tabular data structure with column- and row-level indexes.

In [44]:
series1 = pd.Series([2,5,7,3,6]) # Creating a series object

In [45]:
print(series1)  # 0 1 2 3 4 are the default index labels

0    2
1    5
2    7
3    3
4    6
dtype: int64


In [46]:
print(series1.values) # print the values held by the series, without the index

[2 5 7 3 6]


In [47]:
series1.index   # Default index of the series object

RangeIndex(start=0, stop=5, step=1)

In [48]:
series1.index = ['a', 'b', 'c', 'd', 'e'] # setting the index of the series object

In [49]:
series1['e']  # gets the element of the index

6

In [50]:
## Creating a dataframe using pandas

class_data = {'Names':['Tom', 'Ken', 'Tak'],
              'Age':[27,26,23],
              'Occupation':['Musician','Voice Artist','Teacher']}

class_df = pd.DataFrame(class_data, index = ['Person1', 'Person2', 'Person3'],
                       columns = ['Names', 'Age', 'Occupation'])

In [51]:
print(class_df)

        Names  Age    Occupation
Person1   Tom   27      Musician
Person2   Ken   26  Voice Artist
Person3   Tak   23       Teacher


In [52]:
class_df.Names

Person1    Tom
Person2    Ken
Person3    Tak
Name: Names, dtype: object

In [53]:
class_df.Occupation

Person1        Musician
Person2    Voice Artist
Person3         Teacher
Name: Occupation, dtype: object

In [54]:
# Adding new entries to the DataFrame: build a second frame with the same columns

class_data2 = {'Names':['Meg', 'Bryce'],
               'Age': [28,29],
               'Occupation': ['Doctor', 'Researcher']}
class_df2 = pd.DataFrame(class_data2,index = ['Person4', 'Person5'],
                       columns = ['Names', 'Age', 'Occupation'])

In [55]:
print(class_df2)

         Names  Age  Occupation
Person4    Meg   28      Doctor
Person5  Bryce   29  Researcher


In [56]:
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the rows of
# class_df2 underneath class_df and keeps both frames' index labels.
class_df = pd.concat([class_df, class_df2])  # adding new entries

In [57]:
print(class_df)

         Names  Age    Occupation
Person1    Tom   27      Musician
Person2    Ken   26  Voice Artist
Person3    Tak   23       Teacher
Person4    Meg   28        Doctor
Person5  Bryce   29    Researcher


In [58]:
class_df.T # T is for transpose

Unnamed: 0,Person1,Person2,Person3,Person4,Person5
Names,Tom,Ken,Tak,Meg,Bryce
Age,27,26,23,28,29
Occupation,Musician,Voice Artist,Teacher,Doctor,Researcher


In [59]:
class_df.sort_values(by='Age') # Sorting of rows by one column

Unnamed: 0,Names,Age,Occupation
Person3,Tak,23,Teacher
Person2,Ken,26,Voice Artist
Person1,Tom,27,Musician
Person4,Meg,28,Doctor
Person5,Bryce,29,Researcher


In [60]:
# Adding one or more column to the dataframe as series object
column_entry = pd.Series(['Independent Artist', 'Animation Industry', 'Junior Level', 'Clinic', np.nan],
                        index = ['Person1', 'Person2', 'Person3', 'Person4', 'Person5'])
class_df['Position'] = column_entry

In [61]:
print(class_df)

         Names  Age    Occupation            Position
Person1    Tom   27      Musician  Independent Artist
Person2    Ken   26  Voice Artist  Animation Industry
Person3    Tak   23       Teacher        Junior Level
Person4    Meg   28        Doctor              Clinic
Person5  Bryce   29    Researcher                 NaN


In [62]:
# Filling the missing (NaN) entries of the DataFrame, modifying it in place
class_df.fillna('Phd Student', inplace = True)

In [63]:
print(class_df)

         Names  Age    Occupation            Position
Person1    Tom   27      Musician  Independent Artist
Person2    Ken   26  Voice Artist  Animation Industry
Person3    Tak   23       Teacher        Junior Level
Person4    Meg   28        Doctor              Clinic
Person5  Bryce   29    Researcher         Phd Student


In [64]:
# Concatenation of 2 dataframes

Gender  = pd.DataFrame(data = {'Gender': ['Male', 'Male', 'Male', 'Female', 'Female']}, 
                       index = ['Person1', 'Person2', 'Person3', 'Person4', 'Person5'])

In [65]:
print(Gender)

         Gender
Person1    Male
Person2    Male
Person3    Male
Person4  Female
Person5  Female


In [66]:
class_data = pd.concat([class_df, Gender], axis = 1)

In [67]:
class_data

Unnamed: 0,Names,Age,Occupation,Position,Gender
Person1,Tom,27,Musician,Independent Artist,Male
Person2,Ken,26,Voice Artist,Animation Industry,Male
Person3,Tak,23,Teacher,Junior Level,Male
Person4,Meg,28,Doctor,Clinic,Female
Person5,Bryce,29,Researcher,Phd Student,Female


In [68]:
class_data['Occupation'] = class_data['Occupation'].map(lambda x:x + '  Pro')

In [69]:
class_data['Occupation']

Person1        Musician  Pro
Person2    Voice Artist  Pro
Person3         Teacher  Pro
Person4          Doctor  Pro
Person5      Researcher  Pro
Name: Occupation, dtype: object

In [70]:
# APPLY Function

def age_add(x):
    """Return *x* incremented by one."""
    return x + 1


print('----Old Values----')
print(class_data['Age'])

print('||||||||||||||||||||')

print('----New Values----')
print(class_data['Age'].apply(age_add)) # apply the age function on top of the age column

----Old Values----
Person1    27
Person2    26
Person3    23
Person4    28
Person5    29
Name: Age, dtype: int64
||||||||||||||||||||
----New Values----
Person1    28
Person2    27
Person3    24
Person4    29
Person5    30
Name: Age, dtype: int64


In [71]:
# Changing datatype of the column

class_data['Age'] = class_data['Age'].astype('category')
class_data.Age.dtypes

CategoricalDtype(categories=[23, 26, 27, 28, 29], ordered=False)

In [72]:
# Storing the results
class_data.to_csv('class_dataset.csv', index=False)

In [73]:
# NOTE(review): `to_csv` is referenced without parentheses, so this cell only
# displays the bound method (including the frame's repr); it does not write a file.
class_data.to_csv

<bound method NDFrame.to_csv of          Names Age         Occupation            Position  Gender
Person1    Tom  27      Musician  Pro  Independent Artist    Male
Person2    Ken  26  Voice Artist  Pro  Animation Industry    Male
Person3    Tak  23       Teacher  Pro        Junior Level    Male
Person4    Meg  28        Doctor  Pro              Clinic  Female
Person5  Bryce  29    Researcher  Pro         Phd Student  Female>

## SciPy

In [74]:
import scipy

In [75]:
from scipy import linalg

In [76]:
matrix = np.array([[6,3,1], [3,8,2], [5,2,9]]) # matrix creation
print(matrix)

[[6 3 1]
 [3 8 2]
 [5 2 9]]


In [77]:
linalg.det(matrix) # determinant of the above matrix

322.99999999999994

In [78]:
linalg.inv(matrix)  # inverse of the above matrix

array([[ 0.21052632, -0.07739938, -0.00619195],
       [-0.05263158,  0.15170279, -0.02786378],
       [-0.10526316,  0.00928793,  0.12074303]])

In [79]:
matrix1 = np.array([[1,2],[4,5]])
print(matrix1)

[[1 2]
 [4 5]]


In [80]:
linalg.det(matrix1)

-3.0

In [81]:
inverse_matrix1 = linalg.inv(matrix1)  # inverse of a matrix
print(inverse_matrix1)

[[-1.66666667  0.66666667]
 [ 1.33333333 -0.33333333]]


In [82]:
# SVD = singular value decomposition
# linalg.svd returns three arrays (documented by SciPy as U, s, Vh), unpacked here in that order.

comp_1, comp_2, comp_3 = linalg.svd(matrix)
comp_1
comp_2
comp_3  # only this last bare expression is echoed as the cell's output

array([[-0.59103119, -0.53865849, -0.6004408 ],
       [-0.02091116, -0.73388316,  0.67895377],
       [-0.80637761,  0.41383877,  0.42248387]])

In [83]:
linalg.svd(matrix1)

(array([[-0.32453643, -0.9458732 ],
        [-0.9458732 ,  0.32453643]]),
 array([6.76782894, 0.44327362]),
 array([[-0.60699365, -0.79470668],
        [ 0.79470668, -0.60699365]]))

In [84]:
# The SciPy stats package is large: it consists of statistical distributions and functions for operating on different kinds of datasets.
# Scipy Stats module
from scipy import stats

In [85]:
# Generating a random sample of size 20 from normal dist with mean 3 and std 5

rvs_20 = stats.norm.rvs(3,5, size = 20)   # pattern is mean, std, sample size
print(rvs_20, '\n ---')

[ 0.0682866   7.06367621 -5.00860282 -3.8005583   3.07589963 -2.32056789
  5.94880771  6.71785044 -1.39387827  5.31387995  5.44932903  5.99110922
  4.42294243 -1.89317108 -4.7826805   5.35167207  0.38567439  2.47045403
  5.52847115  1.02660659] 
 ---


In [86]:
rvs1 = stats.norm.rvs(4,7, size = 50)
rvs1

array([  0.06604181,  10.32841501,  -0.52811056,  -6.86978651,
         3.32831194,   8.37539579,   9.2966105 ,  -1.74852638,
        14.50316936,  -6.02387686,   9.08367715,  -1.64207928,
         6.35250721,   4.85309784,   8.57725527,   7.36172024,
        -1.86157444,  -6.92529063,  -8.07750395,   7.60934104,
        -7.11340744,  -5.1441391 ,   5.41729455,   7.31540269,
        12.0440387 ,   3.62077434,  10.8545088 ,   8.52791829,
         7.93452144,   5.29453426,  -0.13504294,  -2.74796887,
        -8.73516956,  -4.84435989,   2.61159787,  -1.77217451,
         5.96296344,   5.34968657,   6.25462202,  10.81259643,
         4.42594395,  -1.36830633,  -6.84984688, -10.41424101,
        17.30540098,  11.73767122,   5.55668468,   5.64439053,
         9.04346329,  -4.89423015])

In [87]:
# Evaluate the CDF of a Beta distribution with shape parameters a = 100, b = 130 at x = 0.41

cdf1 = stats.beta.cdf(0.41, a=100, b=130)
print(cdf1)

0.22500957436155394


In [88]:
cdf2 = scipy.stats.beta.cdf(0.4,a=10,b = 30)
print(cdf2)

0.9795149123289616


In [89]:
# Scipy is a bit complex to handle and needs a bit more learning time.

# Please refer to Deep Learning For NLP by Palash Goyal, Sumit Pandey, Karan Jain.