In [0]:
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.utils import shuffle
from sklearn.svm import SVC
from sklearn import model_selection
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

from tqdm import tqdm
import numpy as np
import pandas as pd
import re
import tensorflow as tf
import tensorflow_hub as hub

# Use the following statement to install tensorflow-text otherwise the embedding won't load
# !pip install tensorflow-text==2.0.0 --user
import tensorflow_text as text

In [0]:
df_labled = pd.read_csv("/content/df_labeled.csv")

In [10]:
df_labled.content

0       The fact their might be so many marital rape a...
1       ‘If the child has been through sexual abuse, a...
2       ‘If you have any doubt & pick up any signs of ...
3       FUCCBOIS 8/10: a great film and yeah, it talks...
4       #Lockdown New Time Zone for #Sexual Couples......
                              ...                        
1267    judicial process not only has failed to reach ...
1268    @ncwindia @sharmarekha please help this lady h...
1269    my mum beat my elder sister. mama went write s...
1270    some of us are too ^scared^ to take a no from ...
1271    a husband who forces his wife to have sex with...
Name: content, Length: 1272, dtype: object

In [0]:
train, test, y_train, y_test = train_test_split(df_labled.content, 
                                                  df_labled.label.values, 
                                                  test_size=0.15, 
                                                  random_state=17, 
                                                  stratify=df_labled.label.values)

In [0]:
embedding = hub.load("https://tfhub.dev/google/universal-sentence-encoder-multilingual-large/3")

In [0]:
def get_embedding(sen):
  encoding = embedding(sen)
  proper_encoding = tf.reshape(encoding, [-1]).numpy()
  return proper_encoding

In [24]:
xtrain = []
for sentence in tqdm(train.values):
    xtrain.append(get_embedding(sentence))

xtrain = np.array(xtrain)


xtest = []
for sentence in tqdm(test.values):
    xtest.append(get_embedding(sentence))

xtest = np.array(xtest)

100%|██████████| 1081/1081 [01:37<00:00, 11.06it/s]
100%|██████████| 191/191 [00:17<00:00, 10.82it/s]


In [25]:
def svc_param_selection(X, y, nfolds):
  Cs = [1.07]
  gammas = [2.075]
  param_grid = {'C': Cs, 'gamma' : gammas}
  grid_search = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=nfolds, n_jobs=8)
  grid_search.fit(X, y)
  grid_search.best_params_
  return grid_search


model = svc_param_selection(xtrain,y_train, 5)
model.best_params_

{'C': 1.07, 'gamma': 2.075}

In [0]:
pred = model.predict(xtest)

In [27]:
confusion_matrix(y_test,pred)


array([[  0,   5,   0,   0,   0,   0],
       [  0, 131,   4,   0,   0,   0],
       [  0,  18,  24,   0,   0,   0],
       [  0,   4,   0,   0,   0,   0],
       [  0,   4,   0,   0,   0,   0],
       [  0,   1,   0,   0,   0,   0]])

In [28]:
from sklearn.metrics import f1_score
accuracy = accuracy_score(y_test,pred)
print(accuracy)

f1 = f1_score(y_test,pred,average='weighted')
print(f1)

0.8115183246073299
0.7722056291507082


In [0]:
label_dict = {0:'DV_OPINION_ADVOCATE',1:'NON_D_VIOLENCE_DIRECTED',
              2:'NO_VIOLENCE',3:'DV_OPINION_INFO_NEWS',
              4:'DV_STORY',5:'DV_OPINION_DENIER'}

In [33]:
sentence = 'The fact their might be so many marital rape and other sexual assaults within homes taking place right now with a greater rate, and no one wont be talking about it bcz CORONA.'
sample = get_embedding(sentence)
pred = model.predict([sample])
label_dict[pred[0]]

'DV_OPINION_ADVOCATE'

# Inference

In [36]:
label_dict[model.predict([get_embedding(input())])[0]]

'some of us are too ^scared^ to take a no from our wives ! sexual needs k irf gird chakar lgayi ja rahe hain rape is a cruel, ruthless act. u wont do that to ur wife, would you? mjhe rounds arhe hain. -insecurities 101'


'DV_OPINION_DENIER'