In [45]:
#for data pre-processing and visualization
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#pata nahi
import regex as re
import string

#for managing skew datset 
from imblearn.over_sampling import RandomOverSampler

# frame-works
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras import Model

# pretrained model from transformer lib
from transformers import RobertaTokenizerFast
from transformers import TFRobertaModel, TFAutoModelForSequenceClassification

# Metrics
from sklearn import preprocessing
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [46]:
#set style for plots
sns.set_style('white')
sns.despine()
#plt.style.use('seaborn-whitegrid')
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

<Figure size 640x480 with 0 Axes>

In [47]:
#importing dataset into dataframe
df= pd.read_csv('../dataset/final.csv')
df.head()

Unnamed: 0,sentiment,tweet
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...
2,0.0,My cat only chews @apple cords. Such an #Apple...
3,0.0,I agree with @jimcramer that the #IndividualIn...
4,0.0,Nobody expects the Spanish Inquisition #AAPL


In [48]:
df.iloc[3].tweet

"I agree with @jimcramer that the #IndividualInvestor should own not trade #Apple #AAPL, it's extended so today's pullback is good to see"

### Deep Data Cleaning

In [49]:
# cleaning emoji's remaining
def strip_all_entities(text): 
    text = text.replace('\r', '').replace('\n', ' ').replace('\n', ' ').lower() #remove \n and \r and lowercase
    text = re.sub(r"(?:\@|https?\://)\S+", "", text) #remove links and mentions
    text = re.sub(r'[^\x00-\x7f]',r'', text) #remove non utf8/ascii characters such as '\x9a\x91\x97\x9a\x97'
    banned_list= string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§'
    table = str.maketrans('', '', banned_list)
    text = text.translate(table)
    return text

#clean hashtags at the end of the sentence, and keep those in the middle of the sentence by removing just the # symbol
def clean_hashtags(tweet):
    new_tweet = " ".join(word.strip() for word in re.split('#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)', tweet)) #remove last hashtags
    new_tweet2 = " ".join(word.strip() for word in re.split('#|_', new_tweet)) #remove hashtags symbol from words in the middle of the sentence
    return new_tweet2

#Filter special characters such as & and $ present in some words
def filter_chars(a):
    sent = []
    for word in a.split(' '):
        if ('$' in word) | ('&' in word):
            sent.append('')
        else:
            sent.append(word)
    return ' '.join(sent)

def remove_mult_spaces(text): # remove multiple spaces
    return re.sub("\s\s+" , " ", text)

In [50]:
texts_new = []
org_len=[]
for t in df.tweet:
    org_len.append(len(t.split()))
    texts_new.append(remove_mult_spaces(filter_chars(clean_hashtags(strip_all_entities(t)))))

In [60]:
df['org_len']=org_len
df['clean_text']=texts_new
df.head()

Unnamed: 0,sentiment,tweet,org_len,clean_text,new_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,7,aaplthe 10 best steve jobs emails ever,7
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,13,rt why aapl stock had a miniflash crash today ...,11
2,0.0,My cat only chews @apple cords. Such an #Apple...,9,my cat only chews cords such an applesnob,8
3,0.0,I agree with @jimcramer that the #IndividualIn...,22,i agree with that the individualinvestor shoul...,21
4,0.0,Nobody expects the Spanish Inquisition #AAPL,6,nobody expects the spanish inquisition aapl,6


In [61]:
new_len = []
for text in df.clean_text:
    tweet_len = len(text.split())
    new_len.append(tweet_len)

In [62]:
df['new_len']=new_len
df.head()
#df[['sentiment', 'clean_text']].to_csv('../dataset/data.csv', index=False)

Unnamed: 0,sentiment,tweet,org_len,clean_text,new_len
0,0.0,#AAPL:The 10 best Steve Jobs emails ever...htt...,7,aaplthe 10 best steve jobs emails ever,7
1,0.0,RT @JPDesloges: Why AAPL Stock Had a Mini-Flas...,13,rt why aapl stock had a miniflash crash today ...,11
2,0.0,My cat only chews @apple cords. Such an #Apple...,9,my cat only chews cords such an applesnob,8
3,0.0,I agree with @jimcramer that the #IndividualIn...,22,i agree with that the individualinvestor shoul...,21
4,0.0,Nobody expects the Spanish Inquisition #AAPL,6,nobody expects the spanish inquisition aapl,6


In [63]:
df.iloc[0].clean_text

'aaplthe 10 best steve jobs emails ever'

### Balancing the dataset

In [64]:
df['sentiment'].value_counts()

sentiment
 0.0    3676
-1.0    2235
 1.0     704
Name: count, dtype: int64

In [65]:
ros = RandomOverSampler()
train_x, train_y = ros.fit_resample(np.array(df['clean_text']).reshape(-1, 1), np.array(df['sentiment']).reshape(-1, 1))

# storing the balanced values into new dataframe
train_os = pd.DataFrame(list(zip([x[0] for x in train_x], train_y)), columns = ['clean_text', 'sentiment'])
train_os['sentiment'].value_counts()

sentiment
 0.0    3676
 1.0    3676
-1.0    3676
Name: count, dtype: int64

In [67]:
# Storing the non-empty clean_text into train_os 
train_os = train_os[train_os.clean_text != '']
#converting df to csv format
train_os.to_csv('../dataset/data.csv', index=None)

In [69]:
dataset=pd.read_csv("../dataset/data.csv")

### train-validation dataset split

In [74]:
from datasets import load_dataset

dataset = load_dataset("csv", data_files="../dataset/data.csv")
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['clean_text', 'sentiment'],
        num_rows: 11009
    })
})


### Tokenizing

In [76]:
from transformers import AutoTokenizer, AutoConfig
tokenizer= AutoTokenizer.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")
#config = AutoConfig.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

In [79]:
def tokenize(examples):
    token = tokenizer(examples['clean_text'], truncation=True, padding=True)
    return token

dataset_encoded = dataset.map(tokenize)

Map:   0%|          | 0/11009 [00:00<?, ? examples/s]

Map: 100%|██████████| 11009/11009 [00:04<00:00, 2271.40 examples/s]


In [82]:
dataset_encoded

DatasetDict({
    train: Dataset({
        features: ['clean_text', 'sentiment', 'input_ids', 'attention_mask'],
        num_rows: 11009
    })
})

### Building Model

In [84]:
from transformers import TFAutoModelForSequenceClassification
model = TFAutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment-latest")

tf_model.h5: 100%|██████████| 499M/499M [01:22<00:00, 6.02MB/s] 
2024-02-05 23:50:59.227302: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154414080 exceeds 10% of free system memory.
2024-02-05 23:51:00.511060: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154414080 exceeds 10% of free system memory.
2024-02-05 23:51:00.674503: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154414080 exceeds 10% of free system memory.
2024-02-05 23:51:04.179761: W external/local_tsl/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 154414080 exceeds 10% of free system memory.
All model checkpoint layers were used when initializing TFRobertaForSequenceClassification.

Some layers of TFRobertaForSequenceClassification were not initialized from the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task 

In [None]:
from transformers import TFTrainingArguments
training_args = TFTrainingArguments("my_model")
from transformers import TFTrainer
trainer = TFTrainer(model=model, args=training_args, train_dataset=full_train_dataset, tokenizer=tokenizer)


ImportError: cannot import name 'TFTrainer' from 'transformers' (/mnt/Data/projects/roberta/lib/python3.11/site-packages/transformers/__init__.py)

In [86]:
def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

In [90]:


inputs = preprocess("Apple is bad")
encoded_input = tokenizer(inputs, padding=True,return_tensors='tf')

print(encoded_input)
output = model(encoded_input)

{'input_ids': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[    0, 20770,    16,  1099,     2]], dtype=int32)>, 'attention_mask': <tf.Tensor: shape=(1, 5), dtype=int32, numpy=array([[1, 1, 1, 1, 1]], dtype=int32)>}


In [89]:
print(output)

TFSequenceClassifierOutput(loss=None, logits=<tf.Tensor: shape=(1, 3), dtype=float32, numpy=array([[ 0.18790671, -0.22505452, -0.12364019]], dtype=float32)>, hidden_states=None, attentions=None)


### How to fine tune

In [91]:
class BERTForClassification(tf.keras.Model):
    
    def __init__(self, bert_model, num_classes):
        super().__init__()
        self.bert = bert_model
        self.fc = tf.keras.layers.Dense(num_classes, activation='softmax')
        
    def call(self, inputs):
        x = self.bert(inputs)[1]
        return self.fc(x)

In [92]:
classifier = BERTForClassification(model, num_classes=6)

classifier.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-5),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=['accuracy']
)

In [94]:
history = classifier.fit(
    train_dataset,
    epochs=3
)

NameError: name 'train_dataset' is not defined

In [95]:
classifier.evaluate(test_dataset)

NameError: name 'test_dataset' is not defined