In [None]:
import warnings
# Silence every warning category for the rest of the session. Note that this also
# hides deprecation notices raised by the libraries imported in the next cell.
warnings.filterwarnings('ignore')

In [None]:
import json
import numpy as np
import pandas as pd
import random
from matplotlib import pyplot as plt
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS
import missingno as msno

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

from keras.preprocessing import text
import keras
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,Dropout
from keras.callbacks import ReduceLROnPlateau

from tensorflow.keras.preprocessing.sequence import pad_sequences
import nltk
from nltk import word_tokenize
from nltk.stem import PorterStemmer

import torch
from torch.utils.data import Dataset

from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
from transformers import pipeline
from transformers import DistilBertTokenizerFast
from transformers import BertForSequenceClassification, BertTokenizerFast

from transformers import BertTokenizer, TFBertForSequenceClassification, BertConfig
from transformers import TrainingArguments, Trainer

In [None]:
# NOTE(review): the commented-out imports in the following cells take `TFTrainer`
# from `transformers`; confirm that a standalone "TFTrainer" distribution is really
# what should be installed here.
!pip install TFTrainer


In [None]:
#from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments

In [None]:
!pip install transformers[tf]

In [None]:
#from transformers import TFDistilBertForSequenceClassification, TFTrainer, TFTrainingArguments


In [None]:
!pip install transformers


In [None]:
from transformers import TFDistilBertForSequenceClassification, TrainingArguments

In [None]:
def load_json_file(filename):
    """Read a JSON document from disk and return the decoded Python object.

    Parameters
    ----------
    filename : str or path-like
        Location of the JSON file.

    Returns
    -------
    The object produced by ``json.load`` (a dict for the intents file).

    Raises
    ------
    OSError
        If the file cannot be opened.
    json.JSONDecodeError
        If the content is not valid JSON.
    """
    # JSON is UTF-8 by specification; relying on the platform default encoding
    # breaks non-ASCII patterns/responses on systems whose locale is not UTF-8.
    with open(filename, encoding="utf-8") as f:
        return json.load(f)

# The intents file is resolved relative to the notebook's working directory.
filename = 'intents.json'

# `intents` is reused later by `extract_json_info` and by `chat` (to pick responses).
intents = load_json_file(filename)

In [None]:
def create_df():
    """Return a fresh, empty DataFrame with the columns 'Pattern' and 'Tag'."""
    column_names = ('Pattern', 'Tag')
    empty_columns = {name: [] for name in column_names}
    return pd.DataFrame(empty_columns)

# Start from an empty Pattern/Tag frame; it is filled from the intents JSON in the next cell.
df = create_df()
df

In [None]:
def extract_json_info(json_file, df):
    """Flatten the intents structure into (pattern, tag) rows appended to ``df``.

    Every entry of ``json_file['intents']`` contributes one row per element of
    its ``'patterns'`` list, paired with the entry's ``'tag'``. Rows are written
    in place at label ``len(df.index)`` and the same ``df`` object is returned.
    """
    pattern_tag_pairs = (
        (pattern, intent['tag'])
        for intent in json_file['intents']
        for pattern in intent['patterns']
    )

    for pattern, tag in pattern_tag_pairs:
        next_label = len(df.index)
        df.loc[next_label] = [pattern, tag]

    return df

# Populate the frame with one row per (pattern, tag) pair found in the intents file.
# Rows are appended to the existing `df`, so re-running this cell alone adds them again.
df = extract_json_info(intents, df)
df.head()


In [None]:
# Snapshot taken BEFORE `df['Pattern']` is stemmed in a later cell: `df2` keeps the
# raw patterns, and it is `df2` that later provides the model inputs and labels.
df2 = df.copy()
df2.head()


In [None]:
def print_shape_df(df, ds_name="df"):
    """Print one line reporting the row and column counts of ``df`` under the name ``ds_name``."""
    n_rows = df.shape[0]
    n_cols = df.shape[1]
    print(f"{ds_name} dataset has {n_rows} rows and {n_cols} columns")

# Report the dimensions of the freshly built intents frame.
print_shape_df(df, "Chatbot")

In [None]:
def print_dfInfo(df, ds_name="df"):
    """Print a header followed by the ``DataFrame.info()`` summary of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.
    ds_name : str, optional
        Display name used in the header line.
    """
    print(f"The info of {ds_name} dataset\n")
    # `DataFrame.info()` writes the summary itself and returns None, so wrapping it
    # in print() only appended a stray "None" line to the output.
    df.info()

# Show column dtypes and non-null counts for the intents frame.
print_dfInfo(df, "Chatbot")

In [None]:
def num_classes(df, target_col, ds_name="df"):
    """Print how many distinct values the ``target_col`` column of ``df`` contains."""
    distinct_values = df[target_col].unique()
    class_count = len(distinct_values)
    print(f"The {ds_name} dataset has {class_count} classes")

# Count the distinct intent tags, i.e. the number of target classes.
num_classes(df, 'Tag', "Chatbot")


In [None]:
def check_null(df, ds_name='df'):
    """Print a header and then the per-column count of null entries in ``df``."""
    header = f"Null Values in each col in the {ds_name} dataset:\n"
    print(header)
    nulls_per_column = df.isnull().sum()
    print(nulls_per_column)

# Verify that neither column contains missing values.
check_null(df, "Chatbot")

In [None]:
pip install nltk

In [None]:
# Fetch the POS-tagger model data.
# NOTE(review): no POS tagging call is visible in this notebook — confirm this download is still needed.
nltk.download('averaged_perceptron_tagger')

In [None]:
# `word_tokenize` (used by `preprocess_pattern`) needs the Punkt tokenizer data.
# Recent NLTK releases look up the 'punkt_tab' resource instead of the pickled
# 'punkt' one, so request both; on an older NLTK the second call just reports
# that the resource is unknown and returns False.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
# Shared stemmer instance used by `preprocess_pattern`.
stemmer = PorterStemmer()
# Punctuation tokens that are dropped before stemming.
ignore_words=['?', '!', ',', '.']

def preprocess_pattern(pattern):
    """Lower-case, tokenize and stem ``pattern``; return the stems joined by single spaces.

    Tokens listed in the module-level ``ignore_words`` are discarded before stemming.
    """
    kept_stems = []
    for token in word_tokenize(pattern.lower()):
        if token in ignore_words:
            continue
        kept_stems.append(stemmer.stem(token))
    return " ".join(kept_stems)

# Overwrites the 'Pattern' column of `df` in place; re-running this cell applies the
# stemmer again to the already-stemmed text. `df2` (copied earlier) is not affected.
df['Pattern'] = df['Pattern'].apply(preprocess_pattern)

In [None]:
# Inspect the stemmed patterns.
df.head()

In [None]:
# Number of null entries in the 'Pattern' column after preprocessing.
print(df['Pattern'].isnull().sum())

In [None]:
# Distinct tags in order of first appearance, with surrounding whitespace removed.
labels = [raw_tag.strip() for raw_tag in df2['Tag'].unique().tolist()]
labels

In [None]:
# Bidirectional lookup between class index and tag name; the index is the tag's
# position in `labels`.
num_labels = len(labels)
id2label = dict(enumerate(labels))
label2id = {label: index for index, label in enumerate(labels)}

In [None]:
# Inspect the index -> tag mapping.
id2label

In [None]:
# Inspect the tag -> index mapping.
label2id

In [None]:
# Encode each tag as its integer class id. The tag is stripped first so it matches
# the stripped keys of `label2id`; an unknown tag raises KeyError.
df2['labels'] = df2['Tag'].map(lambda x: label2id[x.strip()])
df2.head()

In [None]:
# Model inputs: the patterns from `df2`, i.e. the copy taken before stemming.
X = list(df2['Pattern'])
X[:5]

In [None]:
# Targets: integer class ids aligned position-by-position with `X`.
y = list(df2['labels'])
y[:5]

In [None]:
# Fixed random_state makes the split reproducible. No test_size or stratify is
# passed, so the library defaults decide the split proportion and class balance.
X_train,X_test,y_train,y_test = train_test_split(X,y,random_state = 123)

In [None]:
model_name = "bert-base-uncased"
max_len = 256

# `model_max_length` is the tokenizer attribute that `truncation=True` falls back to
# when no explicit `max_length` is given at call time. The previous `max_length=`
# keyword is not a tokenizer constructor option, so the 256-token limit never
# took effect.
tokenizer = BertTokenizer.from_pretrained(model_name,
                                          model_max_length=max_len)

# Classification head sized to the number of intent tags; the label maps are stored
# in the model config so predictions can be reported by tag name.
model = BertForSequenceClassification.from_pretrained(model_name,
                                                      num_labels=num_labels,
                                                      id2label=id2label,
                                                      label2id = label2id)


In [None]:
# Tokenize each split separately; with padding=True each call pads to the longest
# sequence within that call, so the two encodings may have different widths.
train_encoding = tokenizer(X_train, truncation=True, padding=True)
test_encoding = tokenizer(X_test, truncation=True, padding=True)

In [None]:
# Tokenize the complete corpus (train and test rows together).
full_data = tokenizer(X, truncation=True, padding=True)

In [None]:
class DataLoader(Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Despite the name this is a ``Dataset`` subclass: it serves one example per
    index and does no batching itself.
    """

    def __init__(self, encodings, labels):
        """Keep references to the encodings mapping and the parallel label sequence."""
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        """Number of examples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: one per encoding field, plus 'labels'."""
        example = {}
        for field, values in self.encodings.items():
            example[field] = torch.tensor(values[idx])
        example['labels'] = torch.tensor(self.labels[idx])
        return example

In [None]:
# Wrap each split's encodings with its labels. These are Dataset objects (the local
# `DataLoader` class), later passed to the Trainer as train/eval datasets.
train_dataloader = DataLoader(train_encoding, y_train)
test_dataloader = DataLoader(test_encoding, y_test)

In [None]:
# `full_data` encodes every pattern in `X`, so it must be paired with the full label
# list `y`. Pairing it with `y_test` gave the dataset the wrong length and
# mismatched example/label pairs.
fullDataLoader = DataLoader(full_data, y)

In [None]:
def compute_metrics(pred):
    """Compute accuracy and macro-averaged precision/recall/F1 for an evaluation pass.

    ``pred`` must expose ``label_ids`` (true class ids) and ``predictions``
    (per-class scores; the arg-max over the last axis is the predicted class).
    Returns a dict with the keys 'Accuracy', 'F1', 'Precision' and 'Recall'.
    """
    true_ids = pred.label_ids
    predicted_ids = pred.predictions.argmax(-1)

    macro_scores = precision_recall_fscore_support(true_ids, predicted_ids, average='macro')
    accuracy = accuracy_score(true_ids, predicted_ids)

    # macro_scores is (precision, recall, f1, support)
    metrics = {'Accuracy': accuracy}
    metrics['F1'] = macro_scores[2]
    metrics['Precision'] = macro_scores[0]
    metrics['Recall'] = macro_scores[1]
    return metrics

In [None]:
pip install accelerate -U

In [None]:
from transformers import TrainingArguments, Trainer

In [None]:
pip install transformers[torch]

In [None]:
!pip install accelerate>=0.21.0


In [None]:
# Hyper-parameters for the fine-tuning run.
# Logging and evaluation are both step-based and happen every 50 steps.
# `save_steps` is not set, so the checkpoint interval is the library default.
# NOTE(review): `load_best_model_at_end=True` presumably needs the save interval to
# line up with `eval_steps` — confirm against the installed transformers version.
# NOTE(review): newer transformers releases appear to rename `evaluation_strategy`
# to `eval_strategy` — confirm before upgrading.
training_args = TrainingArguments(
    output_dir='./output',
    do_train=True,
    do_eval=True,
    num_train_epochs=100,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=16,
    warmup_steps=100,
    weight_decay=0.05,
    logging_strategy='steps',
    logging_dir='./multi-class-logs',
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=50,
    save_strategy="steps",
    load_best_model_at_end=True
)

In [None]:
# Wire model, arguments, datasets and metric function together. The held-out test
# split doubles as the evaluation set used during training.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataloader,
    eval_dataset=test_dataloader,
    compute_metrics= compute_metrics
)

In [None]:
# Run fine-tuning with the configuration above (100 epochs, evaluation every 50 steps).
trainer.train()

In [None]:
# Evaluate on both splits and tabulate the first five reported metrics per split.
q = [
    trainer.evaluate(eval_dataset=split_dataset)
    for split_dataset in (train_dataloader, test_dataloader)
]

pd.DataFrame(q, index=["train","test"]).iloc[:,:5]


In [None]:
def predict(text):
    """Classify ``text`` with the module-level ``model`` and ``tokenizer``.

    Parameters
    ----------
    text : str
        Sentence to classify. The label index is taken with an arg-max over the
        whole probability tensor, so this is meant for a single input.

    Returns
    -------
    tuple
        ``(probs, pred_label_idx, pred_label)``: the softmax probabilities, the
        index tensor of the most likely class, and that class's name looked up in
        ``model.config.id2label``.
    """
    # Send the inputs to whichever device the model is on, instead of hard-coding
    # "cuda" (which fails on CPU-only machines or when the model is on the CPU).
    inputs = tokenizer(text, padding=True, truncation=True, max_length=512, return_tensors="pt").to(model.device)

    # Inference only: switch off dropout and skip gradient bookkeeping.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)

    probs = outputs[0].softmax(1)
    pred_label_idx = probs.argmax()
    pred_label = model.config.id2label[pred_label_idx.item()]

    return probs, pred_label_idx, pred_label

In [None]:
# Smoke-test the classifier on one sentence.
# NOTE: this rebinds the module-level name `text`, which the import cell bound to
# `keras.preprocessing.text`.
text = "hello, how can you assist me"
predict(text)

In [None]:
# Persist the fine-tuned model and its tokenizer side by side so both can be
# reloaded from the same directory.
model_path = "chatbot"
trainer.save_model(model_path)
tokenizer.save_pretrained(model_path)

In [None]:
!pip install xformers

In [None]:
# Reload the saved artefacts and wrap them in an inference pipeline.
model_path = "chatbot"


# Rebinds `model` and `tokenizer` to the versions loaded from disk.
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer= BertTokenizerFast.from_pretrained(model_path)
# NOTE(review): "sentiment-analysis" is presumably just the text-classification task
# alias; the labels returned are the intent tags stored in the model config — confirm.
chatbot= pipeline("sentiment-analysis", model=model, tokenizer=tokenizer)

In [None]:
# Quick check that the pipeline returns a label and score for a greeting.
chatbot("Hello")


In [55]:
from transformers import AutoProcessor,SeamlessM4TForTextToText
# NOTE: this rebinds the module-level name `model` from the intent classifier to the
# translation model. `chat` relies on that (it calls `model.generate`), but `predict`,
# defined earlier, reads the same global and will no longer see the classifier.
model = SeamlessM4TForTextToText.from_pretrained("facebook/hf-seamless-m4t-medium")

In [56]:
# Processor matching the translation checkpoint; `chat` uses it to encode the English
# response and to decode the generated tokens.
processor = AutoProcessor.from_pretrained("facebook/hf-seamless-m4t-medium")

preprocessor_config.json:   0%|          | 0.00/3.36k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/39.0k [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/4.85M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/4.33k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/3.29k [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [86]:
def chat(chatbot):
    """Run an interactive console chat loop until the user types 'quit'.

    Each user line has the form ``<message>`` or ``<message>/<language>``. The
    message is classified by ``chatbot``; if the top score is at least 0.8, a
    random response for the predicted intent is translated from English into the
    currently selected language and printed. A selected language stays active
    for the following turns until it is changed again.

    Parameters
    ----------
    chatbot : callable
        Called with the message string; must return a sequence whose first item
        is a mapping with the keys ``'label'`` and ``'score'``.

    Notes
    -----
    Relies on the module-level ``label2id``, ``intents``, ``processor`` and
    ``model`` (the translation model). The predicted label's id is used as a
    position in ``intents['intents']``, which assumes the ids follow the order of
    the intents in the file.
    """
    inputs={"marathi":"mar","gujrati":"guj","tamil":"tam","hindi":"hin","english": "eng"}
    print("bol be kya chahiye :/")
    print("default language of output is english, to change type '/<language>'")
    print("supported languages are marathi, gujrati, tamil, hindi and english")
    print("Type 'quit' to exit the chat\n\n")

    def _read_turn():
        """Read one line and return (message, requested language name or None)."""
        # Strip each part so "hi / hindi" and "quit " are understood as intended.
        parts = [part.strip() for part in input("User: ").strip().lower().split('/')]
        requested = parts[1] if len(parts) > 1 else None
        return parts[0], requested

    language = "eng"

    while True:
        message, requested = _read_turn()
        if message == 'quit':
            break

        if requested is not None:
            if requested in inputs:
                language = inputs[requested]
            else:
                # Keep the previously selected language and still answer the message.
                print("language not supported")

        if not message:
            # Nothing to classify (e.g. the user only switched language).
            continue

        # Classify once and reuse the result for both the score and the label;
        # previously the classifier was run a second time, on the whole split list.
        prediction = chatbot(message)[0]

        if prediction['score'] < 0.8:
            print("Chatbot: Sorry I can't answer that\n\n")
            continue

        label = label2id[prediction['label']]
        response = random.choice(intents['intents'][label]['responses'])

        # Responses are stored in English; translate into the selected language.
        text_inputs = processor(text = response, src_lang="eng", return_tensors="pt")
        output_tokens = model.generate(**text_inputs, tgt_lang=language)
        translated_text_from_text = processor.decode(output_tokens[0].tolist(), skip_special_tokens=True)

        print(f"lang: {language}\n")
        print(f"Chatbot: {translated_text_from_text}\n\n")

In [None]:
# Start the interactive session; it blocks on user input until 'quit' is entered.
chat(chatbot)

bol be kya chahiye :/
default language of output is english, to change type '/<language>'
supported languages are marathi, gujrati, tamil, hindi and english
Type 'quit' to exit the chat


User: hi
lang: eng

Chatbot: Hi there, how can I help?


User: hi /hindi
lang: hin

Chatbot: हैलो वहाँ, मैं कैसे मदद कर सकते हैं?


User: how can you help me
lang: hin

Chatbot: मैं कॉलेज के बारे में कम-मध्यम प्रश्नों का जवाब दे सकता हूँ


