# Choice of ML models For Text Analysis

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem import SnowballStemmer
from nltk.tokenize import TweetTokenizer

from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, roc_auc_score
from sklearn.metrics import f1_score, make_scorer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, auc
from sklearn.svm import SVC

In [2]:
data = pd.read_csv('../../../Data/Scrapping/Reddit/Twitter_Data.csv')
data.head()

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


# Algorithm 1: Naive Bayes

## Clean Twitter Data

In [3]:
#data.replace([np.inf, -np.inf], np.nan, inplace=True)

In [4]:
def preprocess_data(data):  
    # Convert text to lowercase
    data['clean_text'] = data['clean_text'].str.strip().str.lower()
    return data

In [5]:
import re    # RegEx for removing non-letter characters

import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
from nltk.stem.porter import *


def tweet_to_words(tweet):
    ''' Convert tweet text into a sequence of words '''
    
    # convert to lowercase
    text = tweet.lower()
    # remove non letters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    # tokenize
    words = text.split()
    # remove stopwords
    words = [w for w in words if w not in stopwords.words("english")]
    # apply stemming
    words = [PorterStemmer().stem(w) for w in words]
    # return list
    return words

print("\nOriginal tweet ->", data['clean_text'][0])
print("\nProcessed tweet ->", tweet_to_words(data['clean_text'][0]))


Original tweet -> when modi promised “minimum government maximum governance” expected him begin the difficult job reforming the state why does take years get justice state should and not business and should exit psus and temples

Processed tweet -> ['modi', 'promis', 'minimum', 'govern', 'maximum', 'govern', 'expect', 'begin', 'difficult', 'job', 'reform', 'state', 'take', 'year', 'get', 'justic', 'state', 'busi', 'exit', 'psu', 'templ']


[nltk_data] Downloading package stopwords to /Users/sonia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
data = preprocess_data(data)

In [7]:
data = data.dropna(axis=0, how='any', thresh=None, subset=None, inplace=False)

In [8]:
data['category_uint8'] = data['category'].astype(np.uint8, errors = 'ignore')

In [9]:
data['category'] = data['category'].astype('int64')

In [10]:
data.head()

Unnamed: 0,clean_text,category,category_uint8
0,when modi promised “minimum government maximum...,-1,255
1,talk all the nonsense and continue all the dra...,0,0
2,what did just say vote for modi welcome bjp t...,1,1
3,asking his supporters prefix chowkidar their n...,1,1
4,answer who among these the most powerful world...,1,1


## Split into training and testing data

In [11]:
x_nb = data['clean_text']
y_nb = data['category']
x_nb_train, x_nb_test, y_nb_train, y_nb_test = train_test_split(x_nb, y_nb, stratify=y_nb, test_size=0.25, random_state=42)

## Vectorize text reviews to numbers

In [12]:
vec = CountVectorizer(stop_words='english', ngram_range = (1,5), max_features = 5000)
x_nb_train_vec = vec.fit_transform(x_nb_train).toarray()
x_nb_test_vec = vec.transform(x_nb_test).toarray()

## Generate Model  

In [13]:
model = MultinomialNB()
model.fit(x_nb_train_vec, y_nb_train)

MultinomialNB()

In [14]:
y_pred = model.predict(x_nb_test_vec)
acc = accuracy_score(y_nb_test, y_pred)
f1 = f1_score(y_nb_test, y_pred, average='micro')
prec = precision_score(y_nb_test, y_pred, average='micro')
rec = recall_score(y_nb_test, y_pred, average='micro')

print('Accuracy: ', round(100 * acc, 2),'%',
      '\nPrecision: ', round(100 * prec, 2),'%',
      '\nRecall: ', round(100 * rec, 2),'%',
      '\nF1 Score: ', round(f1*100, 2),'%')

Accuracy:  73.53 % 
Precision:  73.53 % 
Recall:  73.53 % 
F1 Score:  73.53 %


In [15]:
#SVMs

In [16]:
train_svm, test_svm = train_test_split(data, test_size=0.2, random_state=1)
x_train_svm = train_svm['clean_text'].values
x_test_svm = test_svm['clean_text'].values
y_train_svm = train_svm['category']
y_test_svm = test_svm['category']

In [17]:
def tokenize(text): 
    tknzr = TweetTokenizer()
    return tknzr.tokenize(text)

def stem(doc):
    return (stemmer.stem(w) for w in analyzer(doc))

en_stopwords = set(stopwords.words("english")) 

vectorizer = CountVectorizer(
    analyzer = 'word',
    tokenizer = tokenize,
    lowercase = True,
    ngram_range=(1, 1),
    stop_words = en_stopwords)

In [18]:
kfolds = StratifiedKFold(n_splits=5, shuffle=True, random_state=1)

In [None]:
np.random.seed(1)

pipeline_svm = make_pipeline(vectorizer, 
                            SVC(probability=True, kernel="linear", class_weight="balanced"))

grid_svm = GridSearchCV(pipeline_svm,
                    param_grid = {'svc__C': [0.01, 0.1, 1]}, 
                    cv = kfolds,
                    scoring="roc_auc",
                    verbose=1,   
                    n_jobs=-1) 

grid_svm.fit(x_train_svm, y_train_svm)
grid_svm.score(x_test_svm, y_test_svm)

In [None]:
y_pred_svm_proba = model.predict_proba(x_test_svm)[:, 1]
y_pred_svm = model.predict(x_test_svm)

auc = roc_auc_score(y_test_svm, y_pred_svm_proba)
acc = accuracy_score(y_test_svm, y_pred_svm)
f1 = f1_score(y_test_svm, y_pred_svm)
prec = precision_score(y_test_svm, y_pred_svm)
rec = recall_score(y_test_svm, y_pred_svm)

print('Accuracy: ', round(100 * acc, 2),'%',
      '\nPrecision: ', round(100 * prec, 2),'%',
      '\nRecall: ', round(100 * rec, 2),'%',
      '\nF1 Score: ', round(f1*100, 2),'%')

In [None]:
# Evaluation

from sklearn.metrics import accuracy_score, precision_score, recall_score, plot_confusion_matrix
from sklearn.metrics import f1_score

y_pred = svm.predict(X_test)
acc = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
prec = precision_score(y_test, y_pred)
rec = recall_score(y_test, y_pred)

acc, f1, prec

In [20]:
#Logistic regression

In [21]:
x_lr = data['clean_text']
y_lr =data['category']

x_train_lr, x_test_lr, y_train_lr, y_test_lr = train_test_split(x_lr, y_lr, test_size=0.2, random_state=42)

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

vect = CountVectorizer(min_df=2, ngram_range=(1, 1))
x_train_lr = vect.fit(x_train_lr).transform(x_train_lr) 
x_test_lr = vect.transform(x_test_lr)

In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

c_val = [0.75, 1, 2, 3, 4, 5, 10]

for c in c_val:
    logreg = LogisticRegression(C=c)
    logreg.fit(x_train_lr, y_train_lr)
    print ("Accuracy for C=%s: %s" % (c, accuracy_score(y_test_lr, logreg.predict(x_test_lr))))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=0.75: 0.9445296680370621


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=1: 0.9443762655703504


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=2: 0.944345585077008


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=3: 0.9454194023439897


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=4: 0.9446523900104313


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Accuracy for C=5: 0.9433944897833957
Accuracy for C=10: 0.9422593115297294


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [25]:
y_pred_lr = clf.predict(x_test_lr)
acc_lr = accuracy_score(y_test_lr, y_pred_lr)
f1 = f1_score(y_test_lr, y_pred_lr, average = 'micro')
prec = precision_score(y_test_lr, y_pred_lr, average = 'micro)
rec = recall_score(y_test_lr, y_pred_lr, average = 'micro)

print('Accuracy: ', round(100 * acc, 2),'%',
      '\nPrecision: ', round(100 * prec, 2),'%',
      '\nRecall: ', round(100 * rec, 2),'%',
      '\nF1 Score: ', round(f1*100, 2),'%')

SyntaxError: EOL while scanning string literal (<ipython-input-25-b2e005ac2c36>, line 4)

# Vader

In [None]:
import nltk
nltk.download('vader_lexicon')

from nltk.sentiment.vader import SentimentIntensityAnalyzer
sia = SentimentIntensityAnalyzer()

# Shuffle data, not really necesary, just for healthy practice
df_slice = data.sample(frac=1.0).reset_index(drop=True)

# Create prediction column based on Polarity Score
# -1: Negative Compound Scores
# 1: Positive Compound Scores
df_slice['Prediction'] = df_slice['clean_text'].apply(lambda x: 1 if sia.polarity_scores(x)['compound'] >= 0.5 else(-1 if sia.polarity_scores(x)['compound'] <= -0.5  else 0))

# Edit category column: 1 for positive, -1 or Negative , 0 for Neutral
#df_slice['category'] = df_slice['category'].apply(lambda x: -1 if x == "Negative" else(1 if x == "Positive" else 0))

# Check if Category and prediction column match for accuracy calculation
df_slice['Accuracy'] = df_slice.apply(lambda x: 1 if x[1] == x[2] else 0, axis = 1)

# Create confusion matrix
def conf_matrix(x):
    if x[1] == 1 and x[2] == 1:
        return 'TP'
    elif x[1] == 1 and x[2] == -1:
        return 'FN'
    elif x[1] == -1 and x[2] == 1:
        return 'FP'
    elif x[1] == -1 and x[2] == -1:
        return 'TN'
    else:
        return 0

df_slice['Conf_Matrix'] = df_slice.apply(lambda x: conf_matrix(x), axis = 1)
df_slice.tail(10)


conf_vals = df_slice.Conf_Matrix.value_counts().to_dict()
print(conf_vals)

accuracy = (conf_vals['TP'] + conf_vals['TN']) / (conf_vals['TP'] + conf_vals['TN'] + conf_vals['FP'] + conf_vals['FN'])
precision = conf_vals['TP'] / (conf_vals['TP'] + conf_vals['FP'])
recall = conf_vals['TP'] / (conf_vals['TP'] + conf_vals['FN'])
score = f1_score(precision, recall)
print('Accuracy: ', round(100 * accuracy, 2),'%',
      '\nPrecision: ', round(100 * precision, 2),'%',
      '\nRecall: ', round(100 * recall, 2),'%',
      '\nF1 Score: ', round(score, 2),'%')

# LSTM

In [None]:
# drop missing rows
df.dropna(axis=0, inplace=True)

# Map tweet categories
df['category'] = df['category'].map({-1.0:'Negative', 0.0:'Neutral', 1.0:'Positive'})
# Output first five rows
df.head()

In [None]:
# Apply data processing to each tweet
X = list(map(tweet_to_words, df['clean_text']))

max_words = 5000
max_len=50

def tokenize_pad_sequences(text):
    '''
    This function tokenize the input text into sequnences of intergers and then
    pad each sequence to the same length
    '''
    # Text tokenization
    tokenizer = Tokenizer(num_words=max_words, lower=True, split=' ')
    tokenizer.fit_on_texts(text)
    # Transforms text to a sequence of integers
    X = tokenizer.texts_to_sequences(text)
    # Pad sequences to the same length
    X = pad_sequences(X, padding='post', maxlen=max_len)
    # return sequences
    return X, tokenizer

print('Before Tokenization & Padding \n', df['clean_text'][0])
X, tokenizer = tokenize_pad_sequences(df['clean_text'])
print('After Tokenization & Padding \n', X[0])

In [None]:
# Convert categorical variable into dummy/indicator variables.
y = pd.get_dummies(df['category'])
# Train and Test split
X_train, X_test,y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 42)
# Extracting validation set from the train set
valid_size=1000
X_valid, y_valid = X_train[-valid_size:], y_train[-valid_size:]
X_test, y_test = X_train[:-valid_size], y_train[:-valid_size]

print('Train Set ->', X_train.shape, y_train.shape)
print('Validation Set ->', X_valid.shape, y_valid.shape)
print('Test Set ->', X_test.shape, y_test.shape)

In [None]:
import keras.backend as K

def f1_score(precision, recall):
    ''' Function to calculate f1 score '''
    
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val*100

In [None]:
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Dense, Dropout
from keras.metrics import Precision, Recall

vocab_size = 5000
embedding_size = 32

# Build model
model3 = Sequential()
model3.add(Embedding(vocab_size, embedding_size, input_length=max_len))
model3.add(Conv1D(filters=32, kernel_size=3, padding='same', activation='relu'))
model3.add(MaxPooling1D(pool_size=2))
model3.add(Bidirectional(LSTM(32)))
model3.add(Dropout(0.4))
model3.add(Dense(3, activation='softmax'))

print(model3.summary())

# Compile model
model3.compile(loss='categorical_crossentropy', optimizer='adam', 
               metrics=['accuracy', Precision(), Recall()])

# Train model
num_epochs = 1
batch_size = 32
history3 = model3.fit(X_train, y_train,
                      validation_data=(X_valid, y_valid),
                      batch_size=batch_size, epochs=num_epochs)

# Evaluate model on the test set
loss, accuracy, precision, recall = model3.evaluate(X_test, y_test, verbose=0)
# Print metrics
print('')
print('CNN + LSTM Accuracy  : {:.2f}'.format(100 * accuracy), '%')
print('CNN + LSTM Precision : {:.2f}'.format(100 * precision), '%')
print('CNN + LSTM Recall    : {:.2f}'.format(100 * recall), '%')
print('CNN + LSTM F1 Score  : {:.2f}'.format(f1_score(precision, recall)), '%')