# Installing required libraries

In [None]:
# %pip install pandas==2.2.2
# %pip install numpy==1.26.4
# %pip install nltk==3.8.1
# %pip install pycontractions
# %pip install scikit-learn
# %pip install tqdm
# %pip install tensorflow==2.16.1
# %pip install numpy==1.26.4
# %pip install seaborn
# %pip install matplotlib
# %pip install langdetect
# %pip install prettytable==3.11.0
# %pip install keras==3.3.3
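# %pip install openpyxl
# %pip install requests
# %pip install torch
# %pip install transformers
# %pip install ai4bharat-transliteration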

# Downloading and loading the required files and models from Google Drive

### Downloading the models

In [None]:
import requests
import os
from tqdm import tqdm

class Download:
    """This class helps to download files from Google Drive"""

    def __init__(self):
        pass

    def download_file_from_google_drive(self, file_id, destination):
        """Download file from Google Drive.

        The payload is streamed into ``<destination>.part`` and only moved to
        ``destination`` once the transfer has finished, so an interrupted
        download never leaves a truncated file under the final name.

        Arguments:
            file_id {string} -- Unique ID of the file in Google Drive.
            destination {string} -- Destination path where the file will be saved.

        Raises:
            requests.HTTPError -- If the server answers with an HTTP error status.
        """
        def get_confirm_token(response):
            """Retrieve confirmation token from the response cookies.

            Arguments:
                response {requests.Response} -- Response object from the initial request.

            Returns:
                token {string} -- Confirmation token if present, otherwise None.
            """
            for key, value in response.cookies.items():
                if key.startswith('download_warning'):
                    return value
            return None

        def save_response_content(response, destination):
            """Save the content of the response to a file.

            Arguments:
                response {requests.Response} -- Response object containing the file content.
                destination {string} -- Destination path where the file will be saved.
            """
            CHUNK_SIZE = 32768  # Define chunk size for streaming download
            total_size = int(response.headers.get('content-length', 0))  # Get total file size from headers
            partial_path = f"{destination}.part"  # Temporary name used while the transfer is running
            try:
                with open(partial_path, "wb") as f, tqdm(
                    desc=destination,
                    total=total_size or None,  # Unknown size: let tqdm show a plain counter
                    unit='B',
                    unit_scale=True,
                    unit_divisor=1024,
                ) as bar:
                    for chunk in response.iter_content(CHUNK_SIZE):
                        if chunk:  # Filter out keep-alive new chunks
                            f.write(chunk)
                            bar.update(len(chunk))
            except BaseException:
                # Drop the incomplete file so a later run does not mistake it for a finished download
                if os.path.exists(partial_path):
                    os.remove(partial_path)
                raise
            # Publish the file under its final name only after every chunk was written
            os.replace(partial_path, destination)

        URL = "https://docs.google.com/uc?export=download"  # Google Drive download URL
        with requests.Session() as session:  # Session is closed when the block exits
            response = session.get(URL, params={'id': file_id}, stream=True)  # Initial request to get the file
            try:
                token = get_confirm_token(response)  # Check for confirmation token
                if token:
                    # If confirmation token exists, make another request with the token
                    response.close()
                    params = {'id': file_id, 'confirm': token}
                    response = session.get(URL, params=params, stream=True)

                # Fail loudly instead of saving an HTTP error page as if it were the file
                response.raise_for_status()

                # NOTE(review): the token is only looked for in a 'download_warning*' cookie; if the
                # server answers with an HTML confirmation page instead, that page is what gets
                # saved -- confirm the downloaded files are valid.
                save_response_content(response, destination)
            finally:
                response.close()

# Downloader instance shared by every download below
downloader = Download()

# Google Drive file ID of each artifact, keyed by the local filename it is saved under
file_ids = {
    'Multi_Model_Multi_Output.h5': '1L8V2by-UzjxPSL7NyZckTkarSt8AHGBw',
    'Multi_Model_Multi_Output.pkl': '1P5_Nz52s_Wokuymp26aPOp9GgOe8nuXF',
    'Mutlilabel_LSTM_Offensive_Profane.h5': '1bwOOub0XeazLwXisQ981EDMXuvsQvHhE',
    'Mutlilabel_LSTM_Offensive_Profane.pkl': '18xonXzSUxkpE1NS4dJnZR37uySvNG2B6',
    'Binomial_LSTM_Profane.h5': '1dr87w_4iWdiV2EUOGi4asHF1RNtWeS4f',
    'Binomial_LSTM_Profane.pkl': '1kJTe25qdBu6q5i6Gk5VYtDAA06Puj9Ml',
    'Binomial_LSTM_Offensive.keras': '1rDBl9_WvaA7YWNx_sZ08BQFGTTKCk5wS',
    'Binomial_LSTM_Offensive.pkl': '1UdLUoalPqotM5tYHwYwZDXffp0mbnLZc',
    "Emoji Sheets - Emoji Only.csv": "1bHK-ofASD0XC-Z1gtbKbalWj6B_71umw",
}

# Folder that receives the downloaded files; created if it is missing
download_dir = './downloaded_files/'
os.makedirs(download_dir, exist_ok=True)

# Fetch only the files that are not on disk yet
for filename, file_id in file_ids.items():
    destination_path = os.path.join(download_dir, filename)
    if os.path.exists(destination_path):
        print(f"File {filename} already downloaded")
        continue
    print(f"Downloading {filename} : https://drive.google.com/file/d/{file_id}")
    downloader.download_file_from_google_drive(file_id, destination_path)

### Loading the model

In [None]:
import tensorflow as tf
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from transformers import BertTokenizer
from nltk.util import ngrams
import torch
from transformers import BertModel
from prettytable import PrettyTable

def _load_model_and_tokenizer(model_filename, tokenizer_filename):
    """Load one Keras model and its pickled tokenizer from ``download_dir``.

    Arguments:
        model_filename {str} -- File name of the saved Keras model.
        tokenizer_filename {str} -- File name of the pickled tokenizer.

    Returns:
        tuple -- The loaded model and the unpickled tokenizer.
    """
    model = tf.keras.models.load_model(os.path.join(download_dir, model_filename))
    # NOTE: pickle.load can execute arbitrary code -- only unpickle files from a trusted source
    with open(os.path.join(download_dir, tokenizer_filename), 'rb') as tokenizer_file:
        tokenizer = pickle.load(tokenizer_file)
    return model, tokenizer


# Load every downloaded model together with its tokenizer
if __name__ == "__main__":

    # Multi-output model
    multi_model, multi_tokenizer = _load_model_and_tokenizer(
        'Multi_Model_Multi_Output.h5', 'Multi_Model_Multi_Output.pkl'
    )

    # Multilabel LSTM model (offensive / profane)
    multilabel_lstm_model, multilabel_tokenizer = _load_model_and_tokenizer(
        'Mutlilabel_LSTM_Offensive_Profane.h5', 'Mutlilabel_LSTM_Offensive_Profane.pkl'
    )

    # Binomial LSTM model for profane text
    binomial_lstm_model_profanity, binomial_tokenizer_profanity = _load_model_and_tokenizer(
        'Binomial_LSTM_Profane.h5', 'Binomial_LSTM_Profane.pkl'
    )

    # Binomial LSTM model for offensive text (stored as a .keras file, unlike the others)
    binomial_lstm_model_offensive, binomial_tokenizer_offensive = _load_model_and_tokenizer(
        'Binomial_LSTM_Offensive.keras', 'Binomial_LSTM_Offensive.pkl'
    )

    # Confirm that everything was loaded
    print("Models and tokenizers loaded successfully.")


### Loading the emoji file

In [None]:
import pandas as pd
import re

# Read the emoji sheet and pull its 'Emoji_List' column into a plain list
emoji_df = pd.read_csv(os.path.join(download_dir, 'Emoji Sheets - Emoji Only.csv'))
emoji_list = emoji_df['Emoji_List'].tolist()

# Turn every entry into a \UXXXXXXXX escape (first character dropped, remainder
# left-padded with zeros to 8 characters) and wrap all escapes in one character class
pattern = '[' + ''.join(f'\\U{cp[1:]:0>8}' for cp in emoji_list) + ']'

# Compiled matcher used by remove_emojis
emoji_pattern = re.compile(pattern, re.UNICODE)

def remove_emojis(text):
    """Strip every character matched by ``emoji_pattern`` from the text.

    Arguments:
        text {str} -- Input text that may contain emojis.

    Returns:
        str -- The text with all matches replaced by the empty string.
    """
    return re.sub(emoji_pattern, '', text)

# Preprocessing

### Vectorization and labelling

In [None]:
import tensorflow as tf
import pickle
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import load_model
from transformers import BertTokenizer
from nltk.util import ngrams
import torch
from transformers import BertModel
from prettytable import PrettyTable

# Pretrained checkpoint shared by the BERT tokenizer and model
model_name = "bert-base-multilingual-cased"

# Run on the GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load tokenizer and model, move the model to the chosen device and
# switch it to evaluation mode
tokenizer_bert = BertTokenizer.from_pretrained(model_name)
model_bert = BertModel.from_pretrained(model_name)
model_bert.to(device)
model_bert.eval()

def get_ngram_embeddings(text, n, batch_size=256):
    """Generate embeddings for n-grams of the given text using BERT.

    The text is encoded to token IDs (special tokens included), split into
    n-grams of token IDs, and each n-gram is embedded as the mean of BERT's
    last hidden states over its n tokens. All n-grams have the same length,
    so they are run through the model in batches instead of one forward pass
    per n-gram; results match the per-n-gram computation up to floating-point
    rounding.

    Arguments:
        text {str} -- Input text to be tokenized and embedded.
        n {int} -- The 'n' in n-grams.
        batch_size {int} -- Maximum number of n-grams per forward pass (default: 256).

    Returns:
        np.array -- Array with one embedding row per n-gram; shape
            (0, hidden_size) when the text yields no n-gram.
    """
    token_ids = tokenizer_bert.encode(text, add_special_tokens=True)
    text_ngrams = list(ngrams(token_ids, n))

    if not text_ngrams:
        # Fewer than n tokens: keep the embedding dimension instead of returning a shapeless array
        return np.empty((0, model_bert.config.hidden_size), dtype=np.float32)

    embeddings = []
    for start in range(0, len(text_ngrams), batch_size):
        # Equal-length n-grams stack into a (num_ngrams, n) batch without padding
        input_ids = torch.tensor(text_ngrams[start:start + batch_size]).to(device)
        with torch.no_grad():
            outputs = model_bert(input_ids)
        # Average over the token axis: one vector per n-gram
        embeddings.append(outputs.last_hidden_state.mean(dim=1).cpu().numpy())

    return np.concatenate(embeddings, axis=0)

def predict_labels(text):
    """Run the multi-output model on the bigram embeddings of a text.

    Arguments:
        text {str} -- Input text for prediction.

    Returns:
        tuple -- Gender label (first output thresholded at 0.5), profanity label
            (argmax of the second output), the raw model predictions, and the
            second output's score for the predicted profanity class (a model
            score, not an accuracy metric).
    """
    # Bigram embeddings, padded to the sequence length the model was built with
    bigram_embeddings = get_ngram_embeddings(text, 2)
    sequence_length = multi_model.input_shape[1]
    model_input = pad_sequences(
        [bigram_embeddings], maxlen=sequence_length, padding='post', dtype='float32'
    )

    predictions = multi_model.predict(model_input)
    gender_scores = predictions[0]
    profanity_scores = predictions[1]

    gender_label = (gender_scores > 0.5).astype(int).ravel()[0]
    profanity_label = np.argmax(profanity_scores, axis=1)[0]
    return gender_label, profanity_label, predictions, profanity_scores[0][profanity_label]

def preprocess_text(text, tokenizer):
    """Convert a single text into a padded sequence batch for model input.

    Arguments:
        text {str} -- Input text to be tokenized and padded.
        tokenizer {Tokenizer} -- Tokenizer providing ``texts_to_sequences``.

    Returns:
        np.array -- The text's sequence, post-padded to length 500.
    """
    return pad_sequences(tokenizer.texts_to_sequences([text]), padding='post', maxlen=500)

def predict_multilabel(text):
    """Classify a text with the multilabel LSTM model.

    Arguments:
        text {str} -- Input text for prediction.

    Returns:
        tuple -- Index of the highest-scoring class, the raw prediction array,
            and the model's score for that class (a model score, not an
            accuracy metric).
    """
    model_input = preprocess_text(text, multilabel_tokenizer)
    prediction = multilabel_lstm_model.predict(model_input)
    top_class = np.argmax(prediction, axis=1)[0]
    return top_class, prediction, prediction[0][top_class]

def predict_binomial(text, model, tokenizer):
    """Predict binary labels using a binomial model.

    Handles both output layouts a binary classifier can have:
    several units (one score per class, label = argmax) and a single unit
    (one score for class 1, label = score > 0.5). With a single unit, argmax
    would always return 0, so label 1 could never be predicted.

    Arguments:
        text {str} -- Input text for prediction.
        model {Model} -- Binomial classification model.
        tokenizer {Tokenizer} -- Tokenizer to convert text to sequences.

    Returns:
        tuple -- Predicted label and the model's score for that label
            (a model score, not an accuracy metric).
    """
    input_sequence = tokenizer.texts_to_sequences([text])
    # NOTE(review): maxlen=500 presumably matches the length the models were trained with -- confirm
    input_padded = pad_sequences(input_sequence, maxlen=500, padding='post')
    prediction = model.predict(input_padded)

    scores = np.asarray(prediction[0]).reshape(-1)
    if scores.shape[0] == 1:
        # Single output unit: treat the value as the score of class 1
        positive_score = scores[0]
        predicted_label = int(positive_score > 0.5)
        pred_accuracy = positive_score if predicted_label == 1 else 1 - positive_score
        return predicted_label, pred_accuracy

    # One score per class: pick the highest-scoring class
    predicted_label = prediction.argmax(axis=-1)[0]
    pred_accuracy = prediction[0][predicted_label]
    return predicted_label, pred_accuracy


### English to Nepali Transliteration

In [None]:
from langdetect import detect, LangDetectException
from ai4bharat.transliteration import XlitEngine

# Transliteration engine used by nepali_nlp_text_conversion:
# target language "ne", beam width 10, source script type "en"
e = XlitEngine(
    ["ne"],
    beam_width=10,
    src_script_type="en",
)

def nepali_nlp_text_conversion(text):
    """Transliterate the text to Nepali unless it is already detected as Nepali.

    Arguments:
        text {str} -- Input text to be converted.

    Returns:
        str -- The transliterated text when the detected language is not "ne";
            otherwise the input unchanged (also for non-string or blank input
            and when language detection fails).
    """
    # Nothing to convert for non-strings and blank strings
    if not isinstance(text, str) or not text.strip():
        return text

    try:
        if detect(text) != "ne":
            # Not detected as Nepali: transliterate the whole sentence
            return e.translit_sentence(text)["ne"]
    except LangDetectException:
        # Detection failed: report it and fall back to the original text
        print("Failed to detect language")

    return text

# Text Prediction

### Getting input from the user and printing the results in a table

In [None]:
# Read the text to classify from the user
raw_text = input("Enter text: ")

# Strip emojis first, then transliterate when the text is not detected as Nepali
user_text = nepali_nlp_text_conversion(remove_emojis(raw_text))

# --- Multi-output model: gender + profanity class ---
gender_label, multi_profanity_label, multi_predictions, multi_pred_accuracy = predict_labels(user_text)
if gender_label == 1:
    gender = 'Male'
else:
    gender = 'Female'
# Class index -> readable profanity class
multi_profanity_classes = {0: 'Non-Offensive', 1: 'Offensive', 2: 'Profane'}
multi_profanity = multi_profanity_classes[multi_profanity_label]

# --- Multilabel LSTM model ---
multilabel_profanity_label, multilabel_predictions, multilabel_pred_accuracy = predict_multilabel(user_text)
# Class names, indexed by the predicted class
multilabel_classes = ['Non-Offensive', 'Offensive', 'Profane']
multilabel_profanity = multilabel_classes[multilabel_profanity_label]

# --- Binomial LSTM model: profane vs. non-profane ---
profanity_flag, binomial_profanity_pred_accuracy = predict_binomial(
    user_text, binomial_lstm_model_profanity, binomial_tokenizer_profanity
)
binomial_profanity = 'Profane' if profanity_flag == 1 else 'Non-Profane'

# --- Binomial LSTM model: offensive vs. non-offensive ---
offensive_flag, binomial_offensive_pred_accuracy = predict_binomial(
    user_text, binomial_lstm_model_offensive, binomial_tokenizer_offensive
)
binomial_offensive = 'Offensive' if offensive_flag == 1 else 'Non-Offensive'

# Single-column banner table showing the text that was classified
additional_info_table = PrettyTable()
additional_info_table.field_names = ["--------Test Result Of--------"]
additional_info_table.add_row([f"                              {user_text}                            "])

# One row per model: predicted gender (multi-output model only), predicted class and its score
table = PrettyTable()
table.field_names = ["Model", "Gender", "Profanity/Offensiveness", "Pred Accuracy"]
result_rows = [
    ["Multi-Output Model", gender, multi_profanity, multi_pred_accuracy],
    ["Multilabel LSTM Model", "-", multilabel_profanity, multilabel_pred_accuracy],
    ["Binomial LSTM Profanity Model", "-", binomial_profanity, binomial_profanity_pred_accuracy],
    ["Binomial LSTM Offensive Model", "-", binomial_offensive, binomial_offensive_pred_accuracy],
]
for result_row in result_rows:
    table.add_row(result_row)

# Show both tables
print(additional_info_table)
print(table)

### Verifying the results

In [None]:
# Show the prediction tables again so they sit next to the questions
print(additional_info_table)
print(table)

print("Verify the results:")

# Models the user is asked about, in the order of the table rows
model_list = ["Multi-Output Model", "Multilabel LSTM Model", "Binomial LSTM Profanity Model", "Binomial LSTM Offensive Model"]

# Ask once per model and keep the raw answer string, keyed by model name
user_responses = {
    verified_model: input(f"Was the result correct for {verified_model}? (1 for yes/0 for no): ")
    for verified_model in model_list
}

# Echo the collected answers
print("\nUser Responses about the correctness of model:")
for verified_model, answer in user_responses.items():
    print(f"{verified_model}: {answer}")



### Saving the verified result to Excel

In [None]:
import pandas as pd
import os

# Workbook that accumulates the verified predictions
tested_file_name = 'testeddata.xlsx'

if os.path.exists(tested_file_name):
    # Continue from the results saved by earlier runs
    tested_df = pd.read_excel(tested_file_name, index_col=False)
    print("Excel file imported successfully.")
else:
    # First run: start with an empty frame that has the expected columns
    tested_df = pd.DataFrame(columns=[
        'user_text', 'gender',
        'multi_profanity', 'multi_pred_accuracy', 'multi_profanity_correctness',
        'multilabel_profanity', 'multilabel_pred_accuracy', 'multilabel_profanity_correctness',
        'binomial_profanity', 'binomial_profanity_pred_accuracy', 'binomial_profanity_correctness',
        'binomial_offensive', 'binomial_offensive_pred_accuracy', 'binomial_offensive_correctness',
    ])
    print("Excel file not found. New DataFrame created.")

# Display the DataFrame
tested_df


In [None]:
# User verdicts for the first four entries of model_list, in that order
verdicts = [user_responses[model_list[position]] for position in range(4)]

# One-row frame holding this run's text, predictions, scores and verdicts
new_row = pd.DataFrame({
    'user_text': [user_text],
    'gender': [gender],
    'multi_profanity': [multi_profanity],
    'multi_pred_accuracy': [multi_pred_accuracy],
    'multi_profanity_correctness': [verdicts[0]],
    'multilabel_profanity': [multilabel_profanity],
    'multilabel_pred_accuracy': [multilabel_pred_accuracy],
    'multilabel_profanity_correctness': [verdicts[1]],
    'binomial_profanity': [binomial_profanity],
    'binomial_profanity_pred_accuracy': [binomial_profanity_pred_accuracy],
    'binomial_profanity_correctness': [verdicts[2]],
    'binomial_offensive': [binomial_offensive],
    'binomial_offensive_pred_accuracy': [binomial_offensive_pred_accuracy],
    'binomial_offensive_correctness': [verdicts[3]],
})

# Append the values positionally as the next row, then write the workbook back to disk
tested_df.loc[len(tested_df)] = new_row.iloc[0].values
tested_df.to_excel(tested_file_name, index=False)