# Importing Libraries

In [None]:
import pandas as pd

from transformers import DistilBertTokenizer
from transformers import TFDistilBertForSequenceClassification 
from transformers import DistilBertForSequenceClassification
from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments
import tensorflow as tf
import pandas as pd 
from sklearn.model_selection import train_test_split

# Reading Data

In [None]:
# Load the cleaned dataset from a path relative to the notebook's working
# directory. Later cells read its "Text" and "Score" columns.
df = pd.read_csv("../Data/cleaned_data_10000.csv")

# Preparing Data

In [None]:
# Shuffle with a fixed seed and keep the first 1000 rows, so that
# "Restart & Run All" always selects the same subset (the train/validation
# split further down is seeded as well).
# `.copy()` detaches the subset from the shuffled frame so the column
# assignments that follow do not trigger pandas' SettingWithCopyWarning.
df = df.sample(frac=1, random_state=0).head(1000).copy()

# Map the numeric sentiment scores to readable class names.
df["Score"] = df["Score"].replace({-1: "negative", 0: "neutral", 1: "positive"})

df.head()

In [None]:
# Encode the sentiment labels as integer class ids.
# The category order is pinned explicitly so the ids are always
# negative=0, neutral=1, positive=2 -- even if one class happens to be absent
# from the sampled rows (inferred categories would silently shift the ids).
LABEL_NAMES = ["negative", "neutral", "positive"]
df["encoded_score"] = pd.Categorical(df["Score"], categories=LABEL_NAMES).codes

# Anything outside LABEL_NAMES (including NaN) is encoded as -1, which is not a
# valid class id for a 3-label classifier, so fail early with a clear message.
assert (df["encoded_score"] >= 0).all(), "unexpected values in df['Score']"

df.head()

In [None]:
# Pull the model inputs (raw text) and the targets (integer class ids) out of
# the frame as plain Python lists.
data_texts = df["Text"].tolist()
data_labels = df["encoded_score"].tolist()

# Quick check of the container type that will be handed to the split.
type(data_labels)

In [None]:
# Hold out 20% of the examples for validation; the fixed random_state keeps
# the split identical between runs for the same input lists.
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data_texts,
    data_labels,
    random_state=0,
    test_size=0.2,
)

In [None]:
# Load the tokenizer that matches the pretrained DistilBERT checkpoint.
tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

# Both splits are tokenized with the same options (truncation and padding
# enabled); each split is encoded in its own call.
tokenize_kwargs = {"truncation": True, "padding": True}
train_encodings = tokenizer(train_texts, **tokenize_kwargs)
val_encodings = tokenizer(val_texts, **tokenize_kwargs)

In [None]:
# Wrap each split in a tf.data.Dataset of (features, label) pairs, where the
# features are the tokenizer output converted to a plain dict.
train_features = dict(train_encodings)
train_dataset = tf.data.Dataset.from_tensor_slices((train_features, train_labels))

val_features = dict(val_encodings)
val_dataset = tf.data.Dataset.from_tensor_slices((val_features, val_labels))

In [None]:
# Training configuration for the TFTrainer.
NUM_TRAIN_EPOCHS = 7
TRAIN_BATCH_SIZE = 16

# A fixed 500-step warmup would be longer than the whole run: with at most
# 800 training examples (80% of the 1000 sampled rows) and 16 per batch there
# are 50 steps per epoch, i.e. 350 steps over 7 epochs on a single device, so
# the learning rate would never leave the warmup ramp. Derive the warmup from
# the actual run length instead (first 10% of the steps -- a tunable choice).
steps_per_epoch = -(-len(train_texts) // TRAIN_BATCH_SIZE)  # ceiling division
total_train_steps = steps_per_epoch * NUM_TRAIN_EPOCHS
warmup_steps = total_train_steps // 10

training_args = TFTrainingArguments(
    output_dir='./results',
    num_train_epochs=NUM_TRAIN_EPOCHS,
    per_device_train_batch_size=TRAIN_BATCH_SIZE,
    per_device_eval_batch_size=64,
    warmup_steps=warmup_steps,
    weight_decay=1e-5,
    logging_dir='./logs',
    eval_steps=100
)

In [None]:
# Instantiate the pretrained DistilBERT classifier inside the training
# arguments' distribution-strategy scope, with three output labels
# (negative / neutral / positive).
with training_args.strategy.scope():
    trainer_model = TFDistilBertForSequenceClassification.from_pretrained(
        'distilbert-base-uncased',
        num_labels=3,
    )

# Bundle the model, the training configuration and both datasets.
trainer = TFTrainer(
    model=trainer_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

In [None]:
# Fine-tune the model on the training dataset.
trainer.train()

# Evaluate on the validation dataset. The metrics are kept in a variable and
# the bare name on the last line displays them as the cell output.
eval_metrics = trainer.evaluate()
eval_metrics

In [None]:
# Save the fine-tuned model and the tokenizer into the same directory so that
# both can later be reloaded with from_pretrained(save_directory).
save_directory = "finetuned_distilled_bert"

trainer_model.save_pretrained(save_directory)

tokenizer.save_pretrained(save_directory)

In [None]:
# Same directory as in the saving cell. Presumably repeated so the loading
# cells below can be run without re-running training -- TODO confirm.
save_directory = "finetuned_distilled_bert"


In [None]:
# Loading model
# Reload the saved tokenizer and the TensorFlow model from save_directory;
# these two objects are the ones used by the prediction cell.

tokenizer_fine_tuned = DistilBertTokenizer.from_pretrained(save_directory)

model_fine_tuned = TFDistilBertForSequenceClassification.from_pretrained(save_directory)

In [None]:
# Initializing Tokenizer and Model (PyTorch variant)
# Loads the same saved directory with the PyTorch model class; from_tf = True
# tells transformers the saved weights are a TensorFlow checkpoint.
# NOTE(review): neither *_pt object is used by the cells visible here, and this
# cell presumably requires PyTorch to be installed -- confirm it is still needed.

tokenizer_fine_tuned_pt = DistilBertTokenizer.from_pretrained(save_directory)

model_fine_tuned_pt = DistilBertForSequenceClassification.from_pretrained(save_directory, from_tf = True)

In [None]:
# Predicting the sentiment of a single sentence

# Sentence to classify
test_text = "oreos are bad"

# Encode the sentence into token ids, returned as a TensorFlow tensor
predict_input = tokenizer_fine_tuned.encode(
    test_text,
    return_tensors="tf",
    padding=True,
    truncation=True,
)

# Run the fine-tuned model and keep the first element of its output
# (the per-class scores for the input batch)
model_output = model_fine_tuned(predict_input)
output = model_output[0]

# The predicted class id is the index of the highest score for the sentence
predicted_ids = tf.argmax(output, axis=1).numpy()
prediction_value = predicted_ids[0]
prediction_value

# Class ids:
#   0 -> negative
#   1 -> neutral
#   2 -> positive


In [None]:
##