In [2]:
from google.colab import files
import io

uploaded = files.upload()

Saving twitter_training.csv to twitter_training.csv
Saving twitter_validation.csv to twitter_validation.csv


In [4]:
!pip install keras_preprocessing

Collecting keras_preprocessing
  Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl.metadata (1.9 kB)
Downloading Keras_Preprocessing-1.1.2-py2.py3-none-any.whl (42 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/42.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.6/42.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: keras_preprocessing
Successfully installed keras_preprocessing-1.1.2


In [5]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_text as tf_text
from sklearn.linear_model import LogisticRegression
import keras
from keras_preprocessing.sequence import pad_sequences

In [482]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


# Twitter sentiment analysis

### Project 1a: simple sentiment analysis wih trimmme data and logitic regression
predicting sentiment based on tweet

Data structure:
- tokenise tweets
  - keras tokenization deprecated
  - tensorflow tokenization ✅
    - input pipeline:
      - pandas column 'tweets' - mixed length sentences
      - tf.data.Dataset.from_tensor_slices (tensorflow data API - multiple elements)
      - Tensors immutable, byte strings
      - convert to iterable to access data - iter() or .as_numpy_iterator() or .tolist()

    - Tokenize:
      - white space tokenizer:
        - returns space seperated strings
      - wordpiece tokenizer:
        - After the string is split into tokens, the WordpieceTokenizer can be used to split into subtokens.
      - Bert tokenizer - wordpiece + additional tasks:
        - implement lookup table
        - returns numerical tokens:
          - some tokens have multiple values!

      - When tokenizing strings, it is often desired to know where in the original string the token originated from. For this reason, each tokenizer which implements TokenizerWithOffsets has a tokenize_with_offsets method that will return the byte offsets along with the tokens. The start_offsets lists the bytes in the original string each token starts at, and the end_offsets lists the bytes immediately after the point where each token ends. To refrase, the start offsets are inclusive and the end offsets are exclusive.




- sentiment - multiclass:
    - positive = 2
    - neutral = 1
    - negative = 0

### Project 2: aspect-based sentiment analysis
predicting sentiment based on entity and tweets


# DATA

In [7]:
df = pd.read_csv(io.BytesIO(uploaded['twitter_training.csv']))
df.head()

Unnamed: 0,id,entity,sentiment,tweet
0,2401,Borderlands,Positive,im getting on borderlands and i will murder yo...
1,2402,Borderlands,Positive,So I spent a few hours making something for fu...
2,2403,Borderlands,Neutral,"Rock-Hard La Varlope, RARE & POWERFUL, HANDSOM..."
3,2404,Borderlands,Positive,that was the first borderlands session in a lo...
4,2405,Borderlands,Negative,the biggest dissappoinment in my life came out...


In [492]:
df_test = pd.read_csv(io.BytesIO(uploaded['twitter_validation.csv']))
df_test.head()

Unnamed: 0,id,entity,sentiment,tweet
0,3364,Facebook,Irrelevant,I mentioned on Facebook that I was struggling ...
1,352,Amazon,Neutral,BBC News - Amazon boss Jeff Bezos rejects clai...
2,8312,Microsoft,Negative,@Microsoft Why do I pay for WORD when it funct...
3,4371,CS-GO,Negative,"CSGO matchmaking is so full of closet hacking,..."
4,4433,Google,Neutral,Now the President is slapping Americans in the...


In [520]:
df_test.isnull().any()

Unnamed: 0,0
id,False
entity,False
sentiment,False
tweet,False


In [522]:
def prep_data(dataframe, trim):

  data = dataframe['tweet'].str.replace(r'http\S+|www\S+|\d+|[^A-Za-z\s#@]', '', regex=True)

  if trim == 1:
    data = data[data.str.split().str.len() >= 10]
    label = df['sentiment'][data.index].replace(['Positive','Neutral','Irrelevant', 'Negative'], [3,2,1,0])
    return data,label

  else:
    pass




In [523]:
data,label = prep_data(df,trim=True)
data_test, label_test = prep_data(df_test,trim=True)

  label = df['sentiment'][data.index].replace(['Positive','Neutral','Irrelevant', 'Negative'], [3,2,1,0])
  label = df['sentiment'][data.index].replace(['Positive','Neutral','Irrelevant', 'Negative'], [3,2,1,0])


### Project 1: simple sentiment analysis
predicting sentiment based on tweet

Data structure:
- tokenise tweets
- sentiment:
    - positive = 3
    - neutral = 2
    - irrelevant = 1
    - negative = 0

In [84]:
max_len = data.apply(len).max()
data['padded_text'] = data.apply(lambda x: x + ' o' * (max_len - len(x)))

# TESTING
examining data structure of tf.Tensor from Dataset structure documentation

In [144]:
dataset1 = tf.data.Dataset.from_tensor_slices(tf.random.uniform([4, 10]))
dataset1

<_TensorSliceDataset element_spec=TensorSpec(shape=(10,), dtype=tf.float32, name=None)>

In [145]:
tf.random.uniform([4, 10])

<tf.Tensor: shape=(4, 10), dtype=float32, numpy=
array([[0.8487389 , 0.7213385 , 0.06014907, 0.40562618, 0.6354599 ,
        0.4310453 , 0.93249714, 0.6720426 , 0.9995816 , 0.273103  ],
       [0.1545012 , 0.4828093 , 0.24877155, 0.36434257, 0.48571467,
        0.6608845 , 0.7454139 , 0.7145097 , 0.81512475, 0.03247488],
       [0.59880674, 0.05708671, 0.97759616, 0.31378686, 0.70346916,
        0.9856764 , 0.17099094, 0.07033682, 0.14623415, 0.15333664],
       [0.31654012, 0.8497118 , 0.78671885, 0.4074607 , 0.4659654 ,
        0.8509089 , 0.17735934, 0.6656431 , 0.28717208, 0.13896632]],
      dtype=float32)>

In [146]:
next(iter(dataset1))

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([0.42339242, 0.6744585 , 0.5251161 , 0.18562543, 0.32478   ,
       0.372859  , 0.9612279 , 0.79048645, 0.98815143, 0.5291933 ],
      dtype=float32)>

In [9]:
tf.convert_to_tensor(data.to_numpy())

<tf.Tensor: shape=(5,), dtype=string, numpy=
array([b'im getting on borderlands and i will murder you all ,',
       b"So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg",
       b'Rock-Hard La Varlope, RARE & POWERFUL, HANDSOME JACKPOT, Borderlands 3 (Xbox) dlvr.it/RMTrgF  ',
       b'that was the first borderlands session in a long time where i actually had a really satisfying combat experience. i got some really good kills',
       b'the biggest dissappoinment in my life came out a year ago fuck borderlands 3'],
      dtype=object)>

# FUNCTIONS

In [58]:
def tokeniser(doc, tokenizer, output):

  tokenized_docs = doc.map(lambda x: tokenizer.tokenize(x))

  if output == 'ls':
    return [tokens for tokens in tokenized_docs]

  if output == 'np':
    return np.array(tokenized_docs)

  if output == 'tf':
    return tokenized_docs


In [138]:
 # TO USE LIST FOR TF TOKENIZATION
 list_data = [[d] for d in data]
 docs = tf.data.Dataset.from_tensor_slices(list_data)

In [353]:
# TO USE NUMPY ARRAY FOR TF TOKENIZATION
tensor_data = tf.convert_to_tensor(data.to_numpy())
docs = tf.data.Dataset.from_tensor_slices(tensor_data)


In [354]:
np.array(next(docs.as_numpy_iterator()))

array(b'im getting on borderlands and i will murder you all ',
      dtype='|S52')

In [11]:
# AMBIGUOUS SHAPE
docs.element_spec

TensorSpec(shape=(), dtype=tf.string, name=None)

In [157]:
# TO INSPECT DOCS
next(iter(docs))

<tf.Tensor: shape=(), dtype=string, numpy=b'im getting on borderlands and i will murder you all ,'>

In [110]:
def tensor_to_mutable(tensor):
  data = []

  for dat in tensor:
    data.append(dat.numpy())

  return data

In [300]:
data = df['tweet'][0:5]
data = data.apply(lambda x: [x]).tolist()

trimmed_tokens = trimmer(data)
trimmed_tokens[0:5]

[<tf.RaggedTensor [[b'im getting on borderlands and i will murder you all ,'],
  [b"So I spent a few hours making something for fun. . . If you don't know I am a HUGE @Borderlands fan and Maya is one of my favorite characters. So I decided to make myself a wallpaper for my PC. . Here is the original image versus the creation I made :) Enjoy! pic.twitter.com/mLsI5wf9Jg"],
  [b'Rock-Hard La Varlope, RARE & POWERFUL, HANDSOME JACKPOT, Borderlands 3 (Xbox) dlvr.it/RMTrgF  '],
  [b'that was the first borderlands session in a long time where i actually had a really satisfying combat experience. i got some really good kills'],
  [b'the biggest dissappoinment in my life came out a year ago fuck borderlands 3']]>]

In [323]:
def list_trimmer(ls):

    min_len = min(len(sentence) for sentence in ls)

    trimmed_list = [sentence[:min_len] for sentence in ls]

    return [[' '.join(sentence)] for sentence in trimmed_list]

In [None]:
new_list = list_trimmer(ws_data)
new_list

[['im getting on borderlands and i will murder you all ,'],
 ['So I spent a few hours making something for fun. .'],
 ['Rock-Hard La Varlope, RARE & POWERFUL, HANDSOME JACKPOT, Borderlands 3 (Xbox)'],
 ['that was the first borderlands session in a long time where'],
 ['the biggest dissappoinment in my life came out a year ago']]

In [None]:
# may use for padding
for data in tokenised_data1.take(0):
  dat = data
  for i in range((maxx - data.shape[0])):
    dat = tf.concat([dat, tf.constant([b'o'], dtype=tf.string)], axis=0)
  test.append(dat)

In [None]:
 # seems that a list of lists still works but will use tensor
 list_data = [[d] for d in data]
 docs = tf.data.Dataset.from_tensor_slices(list_data)



In [None]:
bert_tokenizer = tf_text.BertTokenizer(lookup_table)
tokenised_data_bert = tokeniser(docs, bert_tokenizer).as_numpy_iterator()

In [None]:
for arr in tokenised_data_bert:
  print(arr.shape)

(1, 11, 1)
(1, 12, 1)
(1, 18, 1)
(1, 11, 1)
(1, 11, 1)


In [None]:
x = []

for dat in tokenised_data_bert.tolist():

  datapoint = []
  for num in dat.numpy()[0]:
    datapoint.append(num[0])

  x.append(np.array(datapoint))


In [None]:
x = []

for dat in tokenised_data_bert.tolist():
  x.append(dat.numpy())

In [None]:
x = []

for dat in tokenised_data_bert.tolist():
  numpy_data = dat.numpy()

  datapoint = []

  if numpy_data.ndim == 3:
    datapoint =numpy_data.flatten()



  if numpy_data.ndim == 1:
    for i,e in enumerate(numpy_data):
      for j,f in enumerate(e):
        datapoint.append(f[0])
    datapoint = np.array(datapoint)
  x.append(datapoint)




# SENTIMENT ANALYSIS WITH WHITESPACE TOKENIZED DATA

In [151]:
# I have used this instead of whitespace tokenizer as it returns more useable data structure
def whitespace_converter(input):

  nump = input.to_numpy().astype(str)
  return np.char.split(nump)

In [507]:
ws_data = whitespace_converter(data_trim)
ws_test = whitespace_converter(data_test_trim)
ws_data[0:5]

array([list(['im', 'getting', 'on', 'borderlands', 'and', 'i', 'will', 'murder', 'you', 'all']),
       list(['So', 'I', 'spent', 'a', 'few', 'hours', 'making', 'something', 'for', 'fun', 'If', 'you', 'dont', 'know', 'I', 'am', 'a', 'HUGE', '@Borderlands', 'fan', 'and', 'Maya', 'is', 'one', 'of', 'my', 'favorite', 'characters', 'So', 'I', 'decided', 'to', 'make', 'myself', 'a', 'wallpaper', 'for', 'my', 'PC', 'Here', 'is', 'the', 'original', 'image', 'versus', 'the', 'creation', 'I', 'made', 'Enjoy', 'pictwittercommLsIwfJg']),
       list(['RockHard', 'La', 'Varlope', 'RARE', 'POWERFUL', 'HANDSOME', 'JACKPOT', 'Borderlands', 'Xbox', 'dlvritRMTrgF']),
       list(['that', 'was', 'the', 'first', 'borderlands', 'session', 'in', 'a', 'long', 'time', 'where', 'i', 'actually', 'had', 'a', 'really', 'satisfying', 'combat', 'experience', 'i', 'got', 'some', 'really', 'good', 'kills']),
       list(['the', 'biggest', 'dissappoinment', 'in', 'my', 'life', 'came', 'out', 'a', 'year', 'ago', 'fuck

In [371]:
#ws_tokenizer = tf_text.WhitespaceTokenizer()
#ws_toks = tokeniser(docs, ws_tokenizer)
#ws_toks

In [429]:
def trimmer(data):

    min_len = min(len(ls) for ls in data)
    ragged_tensor = tf.ragged.constant(data)
    trimmerr = tf_text.RoundRobinTrimmer(max_seq_length=min_len)
    trimmed = trimmerr.trim([ragged_tensor])

    return trimmed

In [508]:
ws_trimmed_tokens = trimmer(ws_data)
ws_trimmed_tokens_test = trimmer(ws_test)
ws_trimmed_tokens[0:5]

[<tf.RaggedTensor [[b'im', b'getting', b'on', ..., b'murder', b'you', b'all'],
  [b'So', b'I', b'spent', ..., b'something', b'for', b'fun'],
  [b'RockHard', b'La', b'Varlope', ..., b'Borderlands', b'Xbox',
   b'dlvritRMTrgF'],
  ...,
  [b'Nvidia', b'doesnt', b'want', ..., b'crypto', b'craze', b'docs'],
  [b'Let', b'no', b'elim', ..., b'automatically', b'records', b'your'],
  [b'Just', b'realized', b'the', ..., b'Mac', b'is', b'like']]>]

In [431]:
_VOCAB = []

for data in ws_trimmed_tokens[0]:
  for string in data:
    _VOCAB.append(string.numpy())

# SET FOR UNIQUE VOCAB
_VOCAB = list(set(_VOCAB))


In [509]:
_VOCAB = []

for data in ws_trimmed_tokens_test[0]:
  for string in data:
    _VOCAB.append(string.numpy())

# SET FOR UNIQUE VOCAB
_VOCAB = list(set(_VOCAB))

In [433]:
len(_VOCAB)

13802

In [510]:
lookup_table = tf.lookup.StaticVocabularyTable(
    tf.lookup.KeyValueTensorInitializer(
      keys=_VOCAB,
      key_dtype=tf.string,
      values=tf.range(
          tf.size(_VOCAB, out_type=tf.int64), dtype=tf.int64),
          value_dtype=tf.int64
        ),
      num_oov_buckets=1
)

In [511]:
reverse_lookup = tf.lookup.StaticHashTable(
    tf.lookup.KeyValueTensorInitializer(
        keys=tf.range(len(_VOCAB), dtype=tf.int64),
        values=tf.constant(_VOCAB, dtype=tf.string),
        key_dtype=tf.int64,
        value_dtype=tf.string
    ),
    default_value="<OOV>"
)

logistic regression input shape (n_samples, n_features)

In [436]:
ws_trimmed_tokens[0].shape

TensorShape([8309, None])

In [437]:
ws_trimmed_tokens[0].numpy().shape

(8309, 10)

In [512]:
numpy_tokens = lookup_table.lookup(ws_trimmed_tokens[0]).numpy()
numpy_tokens_test = lookup_table.lookup(ws_trimmed_tokens_test[0]).numpy()
numpy_tokens[0:5]

array([[2918, 1057, 2495, 2918, 1955, 2371, 1466, 2918, 2188, 1878],
       [   8, 1934,  156,  986, 2918, 2367,   34, 1342, 1297, 2159],
       [2918, 2918, 2918, 2918, 2918, 2918, 2918, 2020,  879, 2918],
       [ 881, 1177,  275, 1785, 2918, 2918,  787,  986, 2131,  605],
       [ 275,  830, 2918,  787, 2604, 2294, 2274,  548,  986, 1253]])

In [440]:
reverse_lookup.lookup(lookup_table.lookup(ws_trimmed_tokens[0])).numpy()

array([[b'im', b'getting', b'on', ..., b'murder', b'you', b'all'],
       [b'So', b'I', b'spent', ..., b'something', b'for', b'fun'],
       [b'RockHard', b'La', b'Varlope', ..., b'Borderlands', b'Xbox',
        b'dlvritRMTrgF'],
       ...,
       [b'Nvidia', b'doesnt', b'want', ..., b'crypto', b'craze', b'docs'],
       [b'Let', b'no', b'elim', ..., b'automatically', b'records',
        b'your'],
       [b'Just', b'realized', b'the', ..., b'Mac', b'is', b'like']],
      dtype=object)

In [513]:
x = numpy_tokens
x_test = numpy_tokens_test
x

array([[2918, 1057, 2495, ..., 2918, 2188, 1878],
       [   8, 1934,  156, ..., 1342, 1297, 2159],
       [2918, 2918, 2918, ..., 2020,  879, 2918],
       ...,
       [1727, 2101, 2376, ..., 2918, 2918, 2918],
       [ 831,  764, 2918, ..., 2918, 2918,  291],
       [1772, 2918,  275, ..., 2918, 1806, 2183]])

In [505]:
y = label.to_numpy()
y_test = label_test.to_numpy()
y

array([3, 3, 2, ..., 2, 3, 3])

In [474]:
clf = LogisticRegression(random_state=0,max_iter=200)
clf.fit(x, y)

In [514]:
y_pred = clf.predict(x_test)

In [516]:
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [518]:
print(f'Accuracy: {accuracy:.4f}')
print(f'Precision: {precision:.4f}')
print(f'Recall: {recall:.4f}')
print(f'F1 Score: {f1:.4f}')

Accuracy: 0.2596
Precision: 0.0674
Recall: 0.2596
F1 Score: 0.1070


In [524]:
for num_iter in range(100,1000,50):
  clf = LogisticRegression(max_iter=num_iter).fit(x, y)
  y_pred = clf.predict(x_test)
  print(f"results for {num_iter} training iterations")
  accuracy = accuracy_score(y_test, y_pred)
  precision = precision_score(y_test, y_pred, average='weighted')
  recall = recall_score(y_test, y_pred, average='weighted')
  f1 = f1_score(y_test, y_pred, average='weighted')
  print(f'Accuracy: {accuracy:.4f}')
  print(f'Precision: {precision:.4f}')
  print(f'Recall: {recall:.4f}')
  print(f'F1 Score: {f1:.4f}','\n')

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 100 training iterations
Accuracy: 0.2636
Precision: 0.2699
Recall: 0.2636
F1 Score: 0.1447 



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 150 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 

results for 200 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 250 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 300 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 350 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 400 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 450 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 500 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 550 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 600 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 650 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 700 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 750 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 800 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 850 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 900 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


results for 950 training iterations
Accuracy: 0.2715
Precision: 0.2639
Recall: 0.2715
F1 Score: 0.1634 



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
