# Chapter 7: Identifying Patterns in Text Using Machine Learning

## Data Preprocessing

### Importing the dataset and checking it to gain a basic understanding 

In [1]:
import seaborn as sns
tips_df = sns.load_dataset('tips')
tips_df.head()

handle: C:\Users\jeevi\seaborn-data\tips.csv
C:\Users\jeevi\seaborn-data\tips.csv


Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,Female,No,Sun,Dinner,2
1,10.34,1.66,Male,No,Sun,Dinner,3
2,21.01,3.5,Male,No,Sun,Dinner,3
3,23.68,3.31,Male,No,Sun,Dinner,2
4,24.59,3.61,Female,No,Sun,Dinner,4


### NaN values

#### Using is_null() to see if there are any NaN values in the dataset

In [2]:
tips_df.isnull().values.any()

False

#### Determining which column or which row contains the NaN values

In [3]:
tips_df.isnull().any()

total_bill    False
tip           False
sex           False
smoker        False
day           False
time          False
size          False
dtype: bool

#### Scanning the data for NaN values along the rows

In [4]:
tips_df.isnull().any(axis=1)

0      False
1      False
2      False
3      False
4      False
       ...  
239    False
240    False
241    False
242    False
243    False
Length: 244, dtype: bool

### Label encoding and one-hot encoding

#### Performing label encoding

In [5]:
#transforming non-numeric values into numeric
from sklearn.preprocessing import LabelEncoder
label_encoding = LabelEncoder()
tips_df.iloc[:,[2,3,4,5]] = tips_df.iloc[:,[2,3,4,5]].apply(label_encoding.fit_transform)

In [6]:
#checking the new dataset after encoding
tips_df

Unnamed: 0,total_bill,tip,sex,smoker,day,time,size
0,16.99,1.01,0,0,2,0,2
1,10.34,1.66,1,0,2,0,3
2,21.01,3.50,1,0,2,0,3
3,23.68,3.31,1,0,2,0,2
4,24.59,3.61,0,0,2,0,4
...,...,...,...,...,...,...,...
239,29.03,5.92,1,0,1,0,3
240,27.18,2.00,0,1,1,0,2
241,22.67,2.00,1,1,1,0,2
242,17.82,1.75,1,0,1,0,2


In [7]:
label_encoding = LabelEncoder()

#mapping non-numeric values to encoded value using the fit function on the relevant column
col_fit = label_encoding.fit(tips_df["day"])

#printing out the unique values for that column, as well as the corresponding encoding using the transform() method
dict(zip(col_fit.classes_, col_fit.transform(col_fit.classes_)))

{0: 0, 1: 1, 2: 2, 3: 3}

#### Performing one-hot encoding for proper representation of categorical data

In [8]:
#importing required libraries 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

#performing one hot encoding on column 2, 3, 4 and 5
oh_encoding = ColumnTransformer([('OneHotEncoding', OneHotEncoder(), [2,3,4,5])],remainder='passthrough')

#using fit_transform() function to the DataFrame and store the output as an array
tips_df_ohe = oh_encoding.fit_transform(tips_df)
tips_df_ohe

array([[ 1.  ,  0.  ,  1.  , ..., 16.99,  1.01,  2.  ],
       [ 0.  ,  1.  ,  1.  , ..., 10.34,  1.66,  3.  ],
       [ 0.  ,  1.  ,  1.  , ..., 21.01,  3.5 ,  3.  ],
       ...,
       [ 0.  ,  1.  ,  0.  , ..., 22.67,  2.  ,  2.  ],
       [ 0.  ,  1.  ,  1.  , ..., 17.82,  1.75,  2.  ],
       [ 1.  ,  0.  ,  1.  , ..., 18.78,  3.  ,  2.  ]])

### Data Standardization

#### Min-Max Standardization

In [9]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()
tips_df_std = minmax.fit_transform(tips_df_ohe)
tips_df_std

array([[1.        , 0.        , 1.        , ..., 0.29157939, 0.00111111,
        0.2       ],
       [0.        , 1.        , 1.        , ..., 0.1522832 , 0.07333333,
        0.4       ],
       [0.        , 1.        , 1.        , ..., 0.3757855 , 0.27777778,
        0.4       ],
       ...,
       [0.        , 1.        , 0.        , ..., 0.41055718, 0.11111111,
        0.2       ],
       [0.        , 1.        , 1.        , ..., 0.30896523, 0.08333333,
        0.2       ],
       [1.        , 0.        , 1.        , ..., 0.32907415, 0.22222222,
        0.2       ]])

#### Z-score standardization

In [10]:
from sklearn.preprocessing import StandardScaler
zs = StandardScaler()
tips_df_std = zs.fit_transform(tips_df_ohe)
tips_df_std

array([[ 1.34335316e+00, -1.34335316e+00,  7.84789169e-01, ...,
        -3.14711305e-01, -1.43994695e+00, -6.00192629e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
        -1.06323531e+00, -9.69205340e-01,  4.53382921e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
         1.37779900e-01,  3.63355539e-01,  4.53382921e-01],
       ...,
       [-7.44405889e-01,  7.44405889e-01, -1.27422758e+00, ...,
         3.24629502e-01, -7.22971264e-01, -6.00192629e-01],
       [-7.44405889e-01,  7.44405889e-01,  7.84789169e-01, ...,
        -2.21286504e-01, -9.04025732e-01, -6.00192629e-01],
       [ 1.34335316e+00, -1.34335316e+00,  7.84789169e-01, ...,
        -1.13228903e-01,  1.24660453e-03, -6.00192629e-01]])

## The Naive Bayes algorithm

### Building a sentiment analyzer using the Naive Bayes algorithm

#### 1) Importing the raw data into a DataFrame

In [11]:
import pandas as pd
data = pd.read_csv("amazon_cells_labelled.txt", sep='\t', header=None)
data.head()

handle: amazon_cells_labelled.txt
amazon_cells_labelled.txt


Unnamed: 0,0,1
0,So there is no way for me to plug it in here i...,0
1,"Good case, Excellent value.",1
2,Great for the jawbone.,1
3,Tied to charger for conversations lasting more...,0
4,The mic is great.,1


#### 2) Separating the columns that contain text reviews and sentiment labels

In [12]:
X = data.iloc[:,0] # extract column with reviews
y = data.iloc[:,-1] # extract column with sentiments

#### 3) Pre-processing the data using CountVectorizer

In [13]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
X_vec.todense() # convert sparse matrix into dense

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

#### 4) Transforming  the word count matrix into a matrix with corresponding tf-idf values

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_vec)
X_tfidf = X_tfidf.todense()

#### 5) Using train_test_split of Sklearn to split data into training and testing sets

In [15]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.25, random_state = 0)

#### 6) Importing the MultinomialNaive Bayes class from and fitting the training data to the model

In [19]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)

MultinomialNB()

#### 7) Predicting results of Test set (X_test)

In [20]:
y_pred = clf.predict(X_test)

#### 8) Determining the performance of the model

In [21]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 87,  33],
       [ 20, 110]], dtype=int64)

## The SVM algorithm

### Building a sentiment analyzer using SVM

#### 1) Importing the raw data into a DataFrame

In [22]:
import pandas as pd
data = pd.read_csv("amazon_cells_labelled.txt", sep='\t', header=None)
X = data.iloc[:,0] # extract column with review
y = data.iloc[:,-1] # extract column with sentiment

handle: amazon_cells_labelled.txt
amazon_cells_labelled.txt


#### 2) Performing necessary data preprocessing (same as above upto step 4)

In [23]:
# tokenize the news text and convert data in matrix format
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_vec = vectorizer.fit_transform(X)
X_vec = X_vec.todense() # convert sparse matrix into dense matrix
# Transform data by applying term frequency inverse document frequency (TF-IDF)
from sklearn.feature_extraction.text import TfidfTransformer
tfidf = TfidfTransformer()
X_tfidf = tfidf.fit_transform(X_vec)
X_tfidf = X_tfidf.todense()

#### 3) Using train_test_split of Sklearn to split data into training and testing sets

In [24]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size = 0.25, random_state = 0)

#### 4) Importing the SVM class and fitting the training data into the module

In [25]:
from sklearn.svm import SVC
classifier = SVC(kernel='linear')
classifier.fit(X_train, y_train)

SVC(kernel='linear')

#### 5) Predicting results of Test set (X_test)

In [26]:
y_pred = classifier.predict(X_test)

#### 7) Determining the performance of the model

In [27]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[102,  18],
       [ 33,  97]], dtype=int64)

## Productionizing a trained sentiment analyzer

#### Saving the trained classifier model and the feature matrix created as part of the training process in the local machine

In [28]:
import pickle
pickle.dump(vectorizer, open("vectorizer_sa", 'wb')) # Save vectorizer for reuse
pickle.dump(classifier, open("nb_sa", 'wb')) # Save classifier for reuse

#### Creating function to predict sentiment of new review

In [30]:
def sentiment_pred(classifier, training_matrix, doc):
    """function to predict the sentiment of a product review
    classifier : pre trained model
    training_matrix : matrix of features associated with the trained
    model
    doc = product review whose sentiment needs to be identified"""
    
    X_new = training_matrix.transform(pd.Series(doc))
    #don't use fit_transform here because the model is already fitted
    X_new = X_new.todense() #convert sparse matrix to dense
    
    from sklearn.feature_extraction.text import TfidfTransformer
    tfidf = TfidfTransformer()
    X_tfidf_new = tfidf.fit_transform(X_new)
    X_tfidf_new = X_tfidf_new.todense()
    
    y_new = classifier.predict(X_tfidf_new)
    if y_new[0] == 0:
        return "negative sentiment"
    elif y_new[0] == 1:
        return "positive sentiment"

#### Unpickling objects by using the load() function and checking sentiment of a ficticuous review

In [31]:
#unplickling the objects
nb_clf = pickle.load(open("nb_sa", 'rb'))
vectorizer = pickle.load(open("vectorizer_sa", 'rb'))

#passing the review into the fucntion to check its sentiment
new_doc = "The gadget works like a charm. Very satisfied with the product"
sentiment_pred(nb_clf, vectorizer, new_doc)

'positive sentiment'

In [32]:
#trying with different review
new_doc = "Not even close to the quality one would expect"
sentiment_pred(nb_clf, vectorizer, new_doc)

'negative sentiment'