In [1]:
import tensorflow as tf 
# Get the GPU device name.
device_name = tf.test.gpu_device_name()

# The device name should look like the following:
if device_name == '/device:GPU:0':
    print('Found GPU at: {}'.format(device_name))
else:
    raise SystemError('GPU device not found')
    
import torch

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

2022-04-16 10:00:00.247700: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-04-16 10:00:01.368478: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2022-04-16 10:00:01.370179: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-04-16 10:00:01.401296: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-16 10:00:01.401640: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:

Found GPU at: /device:GPU:0
There are 1 GPU(s) available.
We will use the GPU: NVIDIA GeForce GTX 1070


In [2]:
import pandas as pd 
import seaborn as sns
import numpy as np 
import matplotlib.pyplot as plt 

df_new = pd.read_csv('CSVFiles/new-tweets.csv')
df_new_added_old_negative = pd.read_csv('CSVFiles/new-tweets-added-old-negative.csv')
df_old = pd.read_csv('CSVFiles/old-tweets.csv')
df_old_added_neutral = pd.read_csv('CSVFiles/old-tweets-added-neutral.csv')

In [7]:
from transformers import BertTokenizer 

# load the BERT tokenizer 
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [8]:
# some regex to clean the data before passing it to BERT tokenizer
import re 
df_new = df_new.replace(to_replace='https?:\/\/\S+', value='', regex=True) # remove https urls 
df_new = df_new.replace(to_replace="www\.[a-z]?\.?(com)+|[a-z]+\.(com)", value='', regex=True) # remove www urls 
df_new = df_new.replace(to_replace="\[video\]", value='', regex=True) # remove videos that got replaced with [video]
df_new = df_new.replace(to_replace='{link}', value='', regex=True) # remove links 
df_new = df_new.replace(to_replace='&[a-z]+;', value='', regex=True) # remove HTML embedded characters
df_new = df_new.replace(to_replace='@[^ ]+', value='', regex=True) # remove @usernames 

df_new_added_old_negative = df_new_added_old_negative.replace(to_replace='https?:\/\/\S+', value='', regex=True) # remove https urls 
df_new_added_old_negative = df_new_added_old_negative.replace(to_replace="www\.[a-z]?\.?(com)+|[a-z]+\.(com)", value='', regex=True) # remove www urls 
df_new_added_old_negative = df_new_added_old_negative.replace(to_replace="\[video\]", value='', regex=True) # remove videos that got replaced with [video]
df_new_added_old_negative = df_new_added_old_negative.replace(to_replace='{link}', value='', regex=True) # remove links 
df_new_added_old_negative = df_new_added_old_negative.replace(to_replace='&[a-z]+;', value='', regex=True) # remove HTML embedded characters
df_new_added_old_negative = df_new_added_old_negative.replace(to_replace='@[^ ]+', value='', regex=True) # remove @usernames 

df_old = df_old.replace(to_replace='https?:\/\/\S+', value='', regex=True) # remove https urls 
df_old = df_old.replace(to_replace="www\.[a-z]?\.?(com)+|[a-z]+\.(com)", value='', regex=True) # remove www urls 
df_old = df_old.replace(to_replace="\[video\]", value='', regex=True) # remove videos that got replaced with [video]
df_old = df_old.replace(to_replace='{link}', value='', regex=True) # remove links 
df_old = df_old.replace(to_replace='&[a-z]+;', value='', regex=True) # remove HTML embedded characters
df_old = df_old.replace(to_replace='@[^ ]+', value='', regex=True) # remove @usernames 

df_old_added_neutral = df_old_added_neutral.replace(to_replace='https?:\/\/\S+', value='', regex=True) # remove https urls 
df_old_added_neutral = df_old_added_neutral.replace(to_replace="www\.[a-z]?\.?(com)+|[a-z]+\.(com)", value='', regex=True) # remove www urls 
df_old_added_neutral = df_old_added_neutral.replace(to_replace="\[video\]", value='', regex=True) # remove videos that got replaced with [video]
df_old_added_neutral = df_old_added_neutral.replace(to_replace='{link}', value='', regex=True) # remove links 
df_old_added_neutral = df_old_added_neutral.replace(to_replace='&[a-z]+;', value='', regex=True) # remove HTML embedded characters
df_old_added_neutral = df_old_added_neutral.replace(to_replace='@[^ ]+', value='', regex=True) # remove @usernames 

In [22]:
new_tweets = []  
new_labels = [] 

old_tweets = df_old.tweet.values
old_labels = df_old.sentiment.values 

new_added_negative_tweets = [] 
new_added_negative_labels = [] 

old_added_neutral_tweets = [] 
old_added_neutral_labels = [] 

In [23]:
for index, row in df_new.iterrows(): 
    new_tweets.append(row['tweet'])
    if row['sentiment'] == float(1.0):
        new_labels.append(1)
    elif row['sentiment'] == float(2.0):
        new_labels.append(2)
    else: 
        new_labels.append(0)

In [31]:
for index, row in df_new_added_old_negative.iterrows(): 
    new_added_negative_tweets.append(row['tweet'])
    if row['sentiment'] == float(1.0):
        new_added_negative_labels.append(1)
    elif row['sentiment'] == float(2.0):
        new_added_negative_labels.append(2)
    else: 
        new_added_negative_labels.append(0)

In [36]:
for index, row in df_old_added_neutral.iterrows(): 
    old_added_neutral_tweets.append(row['tweet'])
    if row['sentiment'] == float(1.0):
        old_added_neutral_labels.append(1)
    elif row['sentiment'] == float(2.0):
        old_added_neutral_labels.append(2)
    else: 
        old_added_neutral_labels.append(0)

In [40]:
# first we will run the new_tweets dataset 

input_ids = [] 
attention_masks = [] 

for i in range(len(new_tweets)):
    encoded_dict = tokenizer.encode_plus(
                        str(new_tweets[i]),                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'np',     # Return pytorch tensors.
                   )
    # add encoded sentence to list 
    input_ids.append(encoded_dict['input_ids'][0])
    # add attention masks to list 
    attention_masks.append(encoded_dict['attention_mask'][0])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [45]:
embeddings = []

for x in input_ids:
    embeddings.append(x)
    
# print(len(embeddings))

training_embeddings = embeddings[:45784]
testing_embeddings = embeddings[45785:]

training_labels = new_labels[:45784]
testing_labels = new_labels[45785:]

In [69]:
X = training_embeddings
Y = training_labels

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=101)
neigh.fit(X, Y)

KNeighborsClassifier(n_neighbors=101)

In [70]:
predictions = neigh.predict(testing_embeddings)

In [71]:
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score 
print('Precision: %.3f' % precision_score(testing_labels, predictions, average='micro'))
print('Recall: %.3f' % recall_score(testing_labels, predictions, average='micro'))
print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))
print('F1 Score: %.3f' % f1_score(testing_labels, predictions, average='micro'))

Precision: 0.666
Recall: 0.666
Accuracy: 0.666
F1 Score: 0.666


In [73]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=200).fit(X, Y)
clf.predict(testing_embeddings)
clf.score(testing_embeddings, testing_labels)

0.6699429919402399

In [75]:
from sklearn.ensemble import RandomForestClassifier 
for i in [100,200,300,400,500]:
    print("Number of estimators -> " + str(i))
    clf = RandomForestClassifier(max_depth = 5, random_state=0, n_estimators=i)
    clf.fit(X, Y)
    predictions = clf.predict(testing_embeddings)
    print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Number of estimators -> 100
Accuracy: 0.643
Number of estimators -> 200
Accuracy: 0.640
Number of estimators -> 300
Accuracy: 0.638
Number of estimators -> 400
Accuracy: 0.638
Number of estimators -> 500
Accuracy: 0.638


In [83]:
# now we will run old-tweets 
old_tweets = df_old.tweet.values 
old_labels = df_old.sentiment.values 

input_ids = [] 
attention_masks = [] 

for i in range(len(old_tweets)):
    encoded_dict = tokenizer.encode_plus(
                        str(old_tweets[i]),                      # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'np',     # Return pytorch tensors.
                   )
    # add encoded sentence to list 
    input_ids.append(encoded_dict['input_ids'][0])
    # add attention masks to list 
    attention_masks.append(encoded_dict['attention_mask'][0])



In [89]:
embeddings = []

for x in input_ids:
    embeddings.append(x)
    
# print(len(embeddings))

training_embeddings = embeddings[:53550]
testing_embeddings = embeddings[53551:]

training_labels = old_labels[:53550]
testing_labels = old_labels[53551:]

In [95]:
X = training_embeddings
Y = training_labels

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=101)
neigh.fit(X, Y)

predictions = neigh.predict(testing_embeddings)

print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Accuracy: 0.290


In [99]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=200).fit(X, Y)
clf.predict(testing_embeddings)
clf.score(testing_embeddings, testing_labels)

0.05648008068582955

In [101]:
from sklearn.ensemble import RandomForestClassifier 
for i in [100,200,300,400,500]:
    print("Number of estimators -> " + str(i))
    clf = RandomForestClassifier(max_depth = 5, random_state=0, n_estimators=i)
    clf.fit(X, Y)
    predictions = clf.predict(testing_embeddings)
    print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Number of estimators -> 100
Accuracy: 0.002
Number of estimators -> 200
Accuracy: 0.002
Number of estimators -> 300
Accuracy: 0.002
Number of estimators -> 400
Accuracy: 0.002
Number of estimators -> 500
Accuracy: 0.002


In [104]:
# now we will run old-tweets-added-neutral 
input_ids = [] 
attention_masks = [] 

for i in range(len(old_added_neutral_tweets)):
    encoded_dict = tokenizer.encode_plus(
                        str(old_added_neutral_tweets[i]), # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'np',     # Return pytorch tensors.
                   )
    # add encoded sentence to list 
    input_ids.append(encoded_dict['input_ids'][0])
    # add attention masks to list 
    attention_masks.append(encoded_dict['attention_mask'][0])



In [109]:
embeddings = []

for x in input_ids:
    embeddings.append(x)
    
# print(len(embeddings))

training_embeddings = embeddings[:73306]
testing_embeddings = embeddings[73307:]

training_labels = old_added_neutral_labels[:73306]
testing_labels = old_added_neutral_labels[73307:]

In [116]:
X = training_embeddings
Y = training_labels

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, Y)

predictions = neigh.predict(testing_embeddings)

print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Accuracy: 0.637


In [117]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=200).fit(X, Y)
clf.predict(testing_embeddings)
clf.score(testing_embeddings, testing_labels)

0.5082872928176796

In [119]:
from sklearn.ensemble import RandomForestClassifier 
for i in [100,200,300,400,500]:
    print("Number of estimators -> " + str(i))
    clf = RandomForestClassifier(max_depth = 5, random_state=0, n_estimators=i)
    clf.fit(X, Y)
    predictions = clf.predict(testing_embeddings)
    print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Number of estimators -> 1000
Accuracy: 0.462


In [122]:
# finally, we will now use new-tweets-added-old-negative
input_ids = [] 
attention_masks = [] 

for i in range(len(new_added_negative_tweets)):
    encoded_dict = tokenizer.encode_plus(
                        str(new_added_negative_tweets[i]), # Sentence to encode.
                        add_special_tokens = True, # Add '[CLS]' and '[SEP]'
                        max_length = 512,           # Pad & truncate all sentences.
                        pad_to_max_length = True,
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'np',     # Return pytorch tensors.
                   )
    # add encoded sentence to list 
    input_ids.append(encoded_dict['input_ids'][0])
    # add attention masks to list 
    attention_masks.append(encoded_dict['attention_mask'][0])



In [135]:
embeddings = []

for x in input_ids:
    embeddings.append(x)
    
# print(len(embeddings))

training_embeddings = embeddings[:61084]
testing_embeddings = embeddings[61085:]

training_labels = new_added_negative_labels[:61084]
testing_labels = new_added_negative_labels[61085:]

In [141]:
X = training_embeddings
Y = training_labels

from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=101)
neigh.fit(X, Y)

predictions = neigh.predict(testing_embeddings)

print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Accuracy: 0.613


In [142]:
from sklearn.neural_network import MLPClassifier
clf = MLPClassifier(random_state=1, max_iter=200).fit(X, Y)
clf.predict(testing_embeddings)
clf.score(testing_embeddings, testing_labels)

0.5899513776337115

In [143]:
from sklearn.ensemble import RandomForestClassifier 
for i in [100,200,300,400,500]:
    print("Number of estimators -> " + str(i))
    clf = RandomForestClassifier(max_depth = 5, random_state=0, n_estimators=i)
    clf.fit(X, Y)
    predictions = clf.predict(testing_embeddings)
    print('Accuracy: %.3f' % accuracy_score(testing_labels, predictions))

Number of estimators -> 100
Accuracy: 0.562
Number of estimators -> 200
Accuracy: 0.559
Number of estimators -> 300
Accuracy: 0.566
Number of estimators -> 400
Accuracy: 0.564
Number of estimators -> 500
Accuracy: 0.584
