In [None]:
# TensorBoard (optional): load the notebook extension and open it on the ./logs directory.
%load_ext tensorboard
#%tensorboard --logdir runs/train
%tensorboard --logdir ./logs
#%tensorboard --logdir {logs_base_dir}  --host localhost

In [None]:
################################################ Fine Tune ##################################
import joblib

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
import torch
from torch.utils.data import Dataset
import re

import matplotlib.pyplot as plt
import seaborn as sns
from transformers import TrainerCallback, IntervalStrategy


import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix
import re
import os

import scipy.special

import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch.utils.data import Dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer,
                          AdamW, get_linear_schedule_with_warmup)
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from datetime import datetime
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import re


def custom_tokenize(composition):
    """Turn a composition string into space-separated, element-sorted tokens.

    Each ``<Symbol><amount>`` pair (an uppercase letter, optional lowercase
    letters, then a run of digits and dots) is extracted from *composition*.
    The pairs are ordered alphabetically by symbol, keeping the original
    relative order of repeated symbols, and each pair is re-emitted as one
    ``Symbol+amount`` token.

    Symbols that are not directly followed by an amount do not match the
    pattern and are therefore absent from the result.
    """
    pairs = re.findall(r'([A-Z][a-z]*)([0-9.]+)', composition)
    # list.sort is stable, so duplicate symbols keep their input order.
    pairs.sort(key=lambda pair: pair[0])
    return ' '.join(symbol + amount for symbol, amount in pairs)

# Load the data
data = pd.read_csv('Final_Formated_and_cleaned_file_No_Features.csv')

# Tokenize and normalize
# Canonical element tokens for the text input, integer ids for the target.
data['tokenized_elements'] = data['composition'].apply(custom_tokenize)
label_encoder = LabelEncoder()
data['encoded_phase'] = label_encoder.fit_transform(data['Phase'])

# Every column that is not an input/target/derived column is scaled as a feature.
feature_columns = [col for col in data.columns if col not in ['composition', 'Phase', 'tokenized_elements', 'encoded_phase']]

# Decide the train/test partition *before* scaling so the scalers can be fitted
# on training rows only; fitting on the full frame leaks test-set statistics
# into the training inputs. Splitting row positions with the same test_size and
# random_state is intended to select the same rows as splitting the frame.
train_positions, test_positions = train_test_split(np.arange(len(data)), test_size=0.2, random_state=42)

for feature in feature_columns:
    scaler = StandardScaler()
    scaler.fit(data.iloc[train_positions][[feature]])
    # transform() returns an (n, 1) array; flatten it into a single column.
    data[f'normalized_{feature}'] = scaler.transform(data[[feature]]).ravel()

# Text fed to the tokenizer: element tokens followed by the scaled feature values.
normalized_columns = [f'normalized_{feature}' for feature in feature_columns]
data['combined_features'] = data['tokenized_elements'] + ' ' + data[normalized_columns].astype(str).agg(' '.join, axis=1)

# Split data
data_train = data.iloc[train_positions]
data_test = data.iloc[test_positions]

# Initialize tokenizer and model

tokenizer = AutoTokenizer.from_pretrained('./results/pretrained_BERT_1M')
# The classification head gets one output per distinct encoded phase.
model = AutoModelForSequenceClassification.from_pretrained('./results/pretrained_BERT_1M', num_labels=len(label_encoder.classes_))


# Tokenize
# Both splits are padded/truncated (max 512 tokens) and returned as PyTorch tensors.
train_encodings = tokenizer(data_train['combined_features'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")
test_encodings = tokenizer(data_test['combined_features'].tolist(), truncation=True, padding=True, max_length=512, return_tensors="pt")

class CustomDataset(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer class labels.

    Args:
        encodings: Mapping from field name (e.g. ``input_ids``) to an indexable
            container holding one entry per sample. Tensors as well as plain
            lists/arrays are accepted.
        labels: Indexable container with one integer class id per sample.

    Each item is a dict with one tensor per encoding field plus a ``labels``
    tensor of dtype ``torch.long``.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    @staticmethod
    def _as_tensor(value, dtype=None):
        """Return a fresh tensor holding *value*, optionally cast to *dtype*."""
        if isinstance(value, torch.Tensor):
            # torch.tensor(existing_tensor) emits a copy-construct UserWarning
            # on every sample; clone().detach() is the recommended equivalent.
            copied = value.clone().detach()
            return copied if dtype is None else copied.to(dtype)
        return torch.tensor(value, dtype=dtype)

    def __getitem__(self, idx):
        item = {key: self._as_tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = self._as_tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        # The dataset size is defined by the number of labels.
        return len(self.labels)

# Pair each split's token encodings with its integer-encoded phase labels.
train_labels = data_train['encoded_phase'].values
test_labels = data_test['encoded_phase'].values
train_dataset = CustomDataset(encodings=train_encodings, labels=train_labels)
test_dataset = CustomDataset(encodings=test_encodings, labels=test_labels)

def compute_metrics(p):
    """Compute accuracy for an evaluation pass.

    *p* must expose ``predictions`` (per-class scores, reduced with argmax
    along axis 1) and ``label_ids`` (the reference class ids). Returns a dict
    with the single key ``'accuracy'``.
    """
    predicted_ids = np.argmax(p.predictions, axis=1)
    accuracy = accuracy_score(p.label_ids, predicted_ids)
    return {'accuracy': accuracy}

# Initialize custom optimizer with weight decay for specific layers
# A parameter gets weight decay when its dotted name contains one of these
# indices as a path component (".10.", ".11." or ".12."); all others get none.
# NOTE(review): if the checkpoint is a 12-layer encoder (layers 0-11), ".12."
# matches nothing - confirm the intended layer range.
decay_layers = ["10", "11", "12"]
decay_param_names, no_decay_param_names = [], []
decay_params, no_decay_params = [], []
# Single pass over the parameters instead of repeated list-membership scans.
for param_name, param in model.named_parameters():
    if any(f".{layer}." in param_name for layer in decay_layers):
        decay_param_names.append(param_name)
        decay_params.append(param)
    else:
        no_decay_param_names.append(param_name)
        no_decay_params.append(param)
optimizer_grouped_parameters = [
    {"params": decay_params, "weight_decay": 0.01},
    {"params": no_decay_params, "weight_decay": 0.0}
]
optimizer = AdamW(optimizer_grouped_parameters, lr=5e-5)

# One TensorBoard run directory per launch, named after the start time.
current_time = datetime.now().strftime('%b%d_%H-%M-%S')
log_dir = './logs/' + current_time

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=12,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    logging_dir=log_dir,
    logging_steps=1,
    save_steps=500,
    evaluation_strategy="steps",
    # Without an explicit value, eval_steps falls back to logging_steps (1),
    # i.e. a full evaluation pass after every optimizer step. Evaluate on the
    # checkpoint cadence instead, which load_best_model_at_end relies on.
    eval_steps=500,
    load_best_model_at_end=True,
    report_to='tensorboard'
)

# Scheduler
# The linear decay must span the number of optimizer updates actually run:
# ceil(samples / batch size) batches per epoch, reduced by gradient
# accumulation, times the number of epochs. Deriving it from training_args
# keeps it in sync with the Trainer configuration.
batches_per_epoch = -(-len(train_dataset) // training_args.train_batch_size)  # ceiling division
updates_per_epoch = max(batches_per_epoch // training_args.gradient_accumulation_steps, 1)
num_training_steps = int(updates_per_epoch * training_args.num_train_epochs)
lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=num_training_steps)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    compute_metrics=compute_metrics,
    # Use the hand-built optimizer/scheduler pair instead of the Trainer defaults.
    optimizers=(optimizer, lr_scheduler)
)

trainer.train()

# Final metrics on the held-out split.
results = trainer.evaluate()
print(results)

predictions = trainer.predict(test_dataset)
pred_labels = np.argmax(predictions.predictions, axis=1)

# Report over every encoded class, not just those that happen to occur in the
# test split: the split is not stratified, and a missing class would otherwise
# make target_names / tick labels disagree with the number of reported classes.
class_names = [str(name) for name in label_encoder.classes_]
class_ids = np.arange(len(class_names))

class_report_str = classification_report(predictions.label_ids, pred_labels, labels=class_ids, target_names=class_names)
print("Classification Report:\n", class_report_str)

with open('./results/classification_report.txt', 'w', encoding='utf-8') as f:
    f.write(class_report_str)

# Rows are true classes, columns are predicted classes, both in class_ids order.
conf_mat = confusion_matrix(predictions.label_ids, pred_labels, labels=class_ids)
print("Confusion Matrix:", conf_mat)

with open('./results/confusion_matrix.txt', 'w', encoding='utf-8') as f:
    np.savetxt(f, conf_mat, fmt='%d')

plt.figure(figsize=(10, 8))
sns.heatmap(conf_mat, annot=True, fmt="d", cmap='Blues', xticklabels=class_names, yticklabels=class_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
# Save before show(): showing the figure first may leave nothing to save.
plt.savefig('./results/confusion_matrix.png', dpi=300)
plt.show()

# Persist the tokenizer and the fine-tuned weights side by side.
tokenizer.save_pretrained('./results')
model.save_pretrained('./results')
