In [1]:
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text

import pandas as pd

# Load the labelled dataset; the code below relies on its 'label' and 'text' columns.
df = pd.read_csv("hindi_train_val_utf.csv", encoding="UTF-8")
# print(df.to_string())
print(df.head(5))
print(df.groupby('label').describe())

# Binary target: 1 where label == 1, 0 for every other value (vectorised, no per-row lambda).
df['abuse'] = (df['label'] == 1).astype(int)
print(df.head(10))

from sklearn.model_selection import train_test_split

# Fixed seed so the stratified split -- and every metric computed from it -- is
# identical on each Restart & Run All instead of changing from run to run.
SPLIT_SEED = 42
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['abuse'],
    stratify=df['abuse'],
    random_state=SPLIT_SEED,
)

print(X_train.head(4))

# TF Hub handles for the MuRIL text-preprocessing model and the MuRIL-Large encoder.
MURIL_PREPROCESS_HANDLE = "https://tfhub.dev/google/MuRIL_preprocess/1"
MURIL_ENCODER_HANDLE = "https://tfhub.dev/google/MuRIL-Large/1"

# Both models are wrapped as Keras layers so they can be called directly on
# tensors and wired into a functional model.
bert_pre_process = hub.KerasLayer(MURIL_PREPROCESS_HANDLE)
bert_en_coder = hub.KerasLayer(MURIL_ENCODER_HANDLE)

def get_sentence_embeding(sentences):
    """Return the encoder's 'pooled_output' for ``sentences``.

    ``sentences`` is passed through ``bert_pre_process`` and the result is fed
    to ``bert_en_coder``; only the 'pooled_output' entry of the encoder's
    output is returned.
    """
    encoder_inputs = bert_pre_process(sentences)
    encoder_outputs = bert_en_coder(encoder_inputs)
    return encoder_outputs['pooled_output']

# print(get_sentence_embeding([
#     "500$ discount. hurry up", 
#     "Bhavin, are you up for a volleybal game tomorrow?"]
# ))

# e = get_sentence_embeding([
#     "banana", 
#     "grapes",
#     "mango",
#     "jeff bezos",
#     "elon musk",
#     "bill gates"
# ]
# )

from sklearn.metrics.pairwise import cosine_similarity
# print(cosine_similarity([e[0]],[e[1]]))

# Bert layers
# Scalar string input: each example is a single raw sentence.
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessed_text = bert_pre_process(text_input)
outputs = bert_en_coder(preprocessed_text)

# Neural network layers
# Dropout on the pooled sentence vector, then one sigmoid unit producing a score in (0, 1).
l = tf.keras.layers.Dropout(0.2, name="dropout")(outputs['pooled_output'])
l = tf.keras.layers.Dense(1, activation='sigmoid', name="output")(l)

# Use inputs and outputs to construct a final model
model = tf.keras.Model(inputs=[text_input], outputs=[l])

# summary() prints the table itself and returns None, so wrapping it in
# print() only appended a stray "None" line to the output.
model.summary()

# Number of training examples, for reference.
print(len(X_train))

# Binary cross-entropy pairs with the model's single sigmoid output unit.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

# Train for two passes over the training split, then score the held-out split.
model.fit(X_train, y_train, epochs=2)
model.evaluate(X_test, y_test)
# Hand-picked sentences to score with the trained model.
# NOTE(review): these literals look mis-encoded (mojibake) in this export --
# confirm against the original notebook before relying on them.
reviews = [
    '‡§™‡§ø‡§Ø‡§æ ‡§™‡§ø‡§Ø‡§æ ‡§ì‡§π ‡§™‡§ø‡§Ø‡§æ',
    '‡§§‡•Ç ‡§Æ‡§æ‡§¶‡§∞‡§ö‡•ã‡§¶ ‡§π‡•á ',
    '‡§§‡•á‡§∞‡•Ä ‡§Æ‡§æ‡§Å ‡§∏‡•ã‡§§‡•Ä ‡§π‡•á ‡§Æ‡•á‡§∞‡•á ‡§∏‡§æ‡§• ',
    '‡§Ü‡§ú‡§æ ‡§Æ‡§ú‡•á ‡§ï‡§∞‡§§‡•á ‡§π‡•á',
    "‡§∏‡•ã ‡§≤‡•å‡•ú‡•á ‡§π‡•á",
]

# One sigmoid score per sentence.
review_scores = model.predict(reviews)
print(review_scores)






   label                                       text
0      0              ‡§≠‡•Ä‡§°‡§º ‡§Æ‡•á‡§Ç  ‡§¨‡§π‡•Å‡§§  ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á
1      0  ‡§∏‡§æ‡§≤‡•á ‡§¨‡•á‡§µ‡§ï‡•Ç‡§´ ‡§Ö‡§™‡§®‡•Ä ‡§Æ‡§æ‡§Ç ‡§Æ‡§ï‡•ç‡§ñ‡§ø‡§Ø‡§æ‡§Ç  ‡§§‡•ã ‡§π‡§ü‡§æ ‡§¶‡•á‡§Ç
2      0           ‡§¨‡•Å‡§∞ ‡§¶‡•á‡§¶‡•ã ‡§§‡•ã ‡§Æ‡•Å‡§π ‡§Æ‡•á‡§Ç ‡§≤‡§Ç‡§° ‡§≤‡•á ‡§≤‡•ã ‡§§‡•ã
3      0       ‡§ï‡•Å‡§§‡•ç‡§§‡§æ ‡§µ‡§π‡§æ ‡§π‡•à ‡§ö‡§ø‡§≤‡•ç‡§≤‡§æ ‡§§‡•Ç ‡§ï‡•ç‡§Ø‡•ã‡§Ç ‡§∞‡§π‡§æ ‡§π‡•à
4      1  ‡§ö‡§æ‡§Ø ‡§®‡§π‡•Ä‡§Ç ‡§™‡•Ä‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§Æ‡•à‡§Ç ‡§á‡§∏‡•Ä ‡§ï‡•ã ‡§õ‡•ã‡§°‡§º ‡§¶‡§ø‡§Ø‡§æ ok
        text                                                       
       count unique                                        top freq
label                                                              
0      10527  10527              ‡§≠‡•Ä‡§°‡§º ‡§Æ‡•á‡§Ç  ‡§¨‡§π‡•Å‡§§  ‡§π‡§¨‡•ç‡§∏‡•Ä ‡§Æ‡§ø‡§≤‡•á‡§Ç‡§ó‡•á    1
1       9656   9656  ‡§ö‡§æ‡§Ø ‡§®‡§π‡•Ä‡§Ç ‡§™‡•Ä‡§§‡§æ ‡§π‡•Ç‡§Ç ‡§Æ‡•à‡§Ç ‡§á‡§∏‡•Ä ‡§ï‡•

In [3]:
# Prints a fixed string; does not depend on any other cell.
greeting = "hellow"
print(greeting)

hellow


In [6]:
# Score a single sentence with the trained model and print the returned array.
sample = ['‡§Æ‡•Å‡§≤‡•ç‡§≤‡•ã ‡§ï‡•ã ‡§Æ‡§æ‡§∞ ‡§°‡§æ‡§≤‡•ã']
print(model.predict(sample))

[[0.20577273]]


In [7]:
# Score a single sentence with the trained model and print the returned array.
sample = ['‡§Æ‡•Å‡§≤‡•ç‡§≤‡•ã ‡§ï‡•ã ‡§Æ‡§æ‡§∞ ‡§°‡§æ‡§≤‡•ã']
print(model.predict(sample))

[[0.20577273]]


In [8]:
# Score a one-word input with the trained model and print the returned array.
sample = ['‡§°‡§æ‡§≤‡•ã']
print(model.predict(sample))

[[0.6448112]]


In [17]:
# Score a one-word input with the trained model and print the returned array.
sample = ['‡§°‡§æ‡§≤‡•ã']
print(model.predict(sample))

[[0.6448112]]
