### Imports used

In [30]:
import pickle
from flair.embeddings import WordEmbeddings, FlairEmbeddings,TransformerWordEmbeddings,SentenceTransformerDocumentEmbeddings,StackedEmbeddings,DocumentPoolEmbeddings
from flair.data import Sentence
import pandas as pd
import numpy as np
import nltk
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC
from scipy.sparse import hstack
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.base import clone
from sklearn.model_selection import GridSearchCV
import pandas as pd
import xgboost as xgb
from scipy.sparse import hstack
import spacy
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier

### Used for testing the model

In [19]:
def average_confusion(confusion_matrices):
    avg = np.mean(confusion_matrices, axis=0)
    total = sum(avg)
    return avg*100/total
def run_cv(classifier, k_fold, data, labels, runs=1):
    accuracy_scores = []
    f1scores = []
    auc_scores = [0]
    confusion_matrices = []
    min_acc = 100
    labels = np.array(labels)
    skf = StratifiedKFold(k_fold)
    for run in range(0, runs):
        print(f'r{run}------------')
        cv_splits = skf.split(data, labels)
        for train, test in cv_splits:
            traindata = data[train]
            y_traindata = labels[train]
            testdata = data[test]
            y_testdata = labels[test]
            model = clone(classifier)
            model.fit(traindata, y_traindata)
            result = model.predict(testdata)
            score = accuracy_score(y_testdata, result)
            accuracy_scores.append(score)
            f1sc = f1_score(y_testdata, result, average='weighted')  
            print(f1sc)
            f1scores.append(f1sc)
            #auc = roc_auc_score(y_testdata, result, average='weighted', multi_class='ovr')
            #auc_scores.append(auc)
            if f1sc < min_acc:
                min_acc = f1sc 
                split_inidices = (train, test)
            confusion_matrices.append(np.array(confusion_matrix(y_testdata, result)))
    print ("min cv F1: ", min_acc)   
    return (accuracy_scores, f1scores, auc_scores, average_confusion(confusion_matrices), split_inidices)



### Embeddings

In [101]:
dmbzembedding = TransformerWordEmbeddings('dbmdz/bert-base-italian-uncased')
#flair_embedding_forward = FlairEmbeddings('it-forward')
#flair_embedding_backward = FlairEmbeddings('it-backward')

# document_embeddings = DocumentPoolEmbeddings([dmbzembedding,
#                                               flair_embedding_backward,
#                                               flair_embedding_forward
#                                              ])
document_embeddings =DocumentPoolEmbeddings([dmbzembedding])

In [136]:
CORPUS = 'different processed dataframes/raw.tsv' 
DATA_FRAME = pd.read_csv(CORPUS, '\t',dtype=str)
X=DATA_FRAME['text'].values
Y=DATA_FRAME['misogynous'].values


#Used the first time to get the embeddings. Embeddings.pkl contains embedded sentences using the stacked document embeddings (all 3)
# data=[]
# for i,text in enumerate(X):
#     sentence=Sentence(text)
#     try:
#         document_embeddings.embed(sentence)
#         data.append(sentence.embedding)
#         print(i,'done')
#     except:
#         print(f'error at',i)



### Embedding and tfidf both perform worse with any other data than the original

In [134]:

# CORPUS = 'different processed dataframes/noun_chuncks,processed.tsv' 
# DATA_FRAME = pd.read_csv(CORPUS, '\t',dtype=str)
# X=DATA_FRAME['clean'].values.astype(str)
# Y=DATA_FRAME['misogynous'].values

# #Used the first time to get the embeddings. Failed now but it once fully worked. Data is saved in embeddings.pkl
# data=[]
# for i,text in enumerate(X):
#     sentence=Sentence(str(text))
#     try:
#         document_embeddings.embed(sentence)
#         data.append(sentence.embedding)
#         print(i,'done')
#     except:
#         print(f'error at',i)



#### Convert tensors to np array

In [128]:
#emb_array=np.array([tensor.cpu().detach().numpy() for tensor in data])

#### For loading other pkl'ed embeddings

In [None]:
# with open('tensors bert.pkl','wb') as f:
#     pickle.dump(emb_array, f)
    
# emb_array=np.array([tensor.cpu().detach().numpy() for tensor in data])
# with open('only_bert_embeddings.pkl','wb') as f:
#     pickle.dump(emb_array, f)
    

### Some models

In [139]:
logistic = LogisticRegression(penalty='l2', dual=True, tol=0.0001, max_iter=100000,
                         C=3, fit_intercept=True, intercept_scaling=1.0, 
                         solver = 'liblinear', warm_start=False,
                         class_weight=None, random_state=None)
svm=SVC()
forest=RandomForestClassifier(n_estimators=1000,n_jobs=-1)


### Embeddings only (nu mai rulez o data dar worse oricum, cu vreo 2%)

In [142]:
emb_array=[]
with open('embeddings.pkl','rb') as f:
    emb_array=pickle.load(f)
    
print(f'array of shape{emb_array.shape}\nfirst element:{emb_array[0]}')

run_cv(forest,10,emb_array,Y)

### TFIDF only

In [143]:
vectorizer = TfidfVectorizer(min_df=3,  max_features=None, 
                             strip_accents='unicode', analyzer='word', 
                             token_pattern=r'\b[^\d\W]+\b',
                             ngram_range=(1, 2),use_idf=True)
tfidf_X=vectorizer.fit_transform(X)
print(tfidf_X.shape)

run_cv(logistic,10,tfidf_X,Y)

(5000, 6722)
r0------------
0.9238025528069581
0.9178437000052386
0.915739551708546
0.8755490909090908
0.9398441206370721
0.8933878444261625
0.8080860215053762
0.8261761732678617
0.8858691023440595
0.8391070626723972
r1------------
0.9238025528069581
0.9178437000052386
0.915739551708546
0.8755490909090908
0.9398441206370721
0.8933878444261625
0.8080860215053762
0.8261761732678617
0.8858691023440595
0.8391070626723972
r2------------
0.9238025528069581
0.9178437000052386
0.915739551708546
0.8755490909090908
0.9398441206370721
0.8933878444261625
0.8080860215053762
0.8261761732678617
0.8858691023440595
0.8391070626723972
min cv F1:  0.8080860215053762


([0.924,
  0.918,
  0.916,
  0.876,
  0.94,
  0.894,
  0.808,
  0.826,
  0.886,
  0.84,
  0.924,
  0.918,
  0.916,
  0.876,
  0.94,
  0.894,
  0.808,
  0.826,
  0.886,
  0.84,
  0.924,
  0.918,
  0.916,
  0.876,
  0.94,
  0.894,
  0.808,
  0.826,
  0.886,
  0.84],
 [0.9238025528069581,
  0.9178437000052386,
  0.915739551708546,
  0.8755490909090908,
  0.9398441206370721,
  0.8933878444261625,
  0.8080860215053762,
  0.8261761732678617,
  0.8858691023440595,
  0.8391070626723972,
  0.9238025528069581,
  0.9178437000052386,
  0.915739551708546,
  0.8755490909090908,
  0.9398441206370721,
  0.8933878444261625,
  0.8080860215053762,
  0.8261761732678617,
  0.8858691023440595,
  0.8391070626723972,
  0.9238025528069581,
  0.9178437000052386,
  0.915739551708546,
  0.8755490909090908,
  0.9398441206370721,
  0.8933878444261625,
  0.8080860215053762,
  0.8261761732678617,
  0.8858691023440595,
  0.8391070626723972],
 [0],
 array([[89.77403294, 13.35286731],
        [10.22596706, 86.64713269]]

In [10]:
#arr_emb_tfidf=hstack((emb_array,tfidf_X)).toarray()
#run_cv(logistic,10,arr_emb_tfidf,Y)

### Trying with two models, one for the sparse tfidf features, and then using the predictions as a result for the random forest classifier (o aberatie deocamdata, am bagat predicitiile trainului la train features si dupa presupun ca forestu i-a acordat mare importanta so yeah e un tfidf mai prost, dar e pus aici as a memo)

In [150]:
def run_cv_2models(classifier_sparse,classifier_dense,sparse_features,dense_features, k_fold,data, labels, runs=1):
    accuracy_scores = []
    f1scores = []
    auc_scores = [0]
    confusion_matrices = []
    min_acc = 100
    labels = np.array(labels)
    skf = StratifiedKFold(k_fold)
    for run in range(0, runs):
        print(f'---{run}---')
        cv_splits = skf.split(data, labels)
        for train, test in cv_splits:
            
            traindata_sparse,y_train = sparse_features[train],labels[train]
            testdata_sparse,y_test = sparse_features[test],labels[test]
            
            traindata_dense,testdata_dense=dense_features[train],dense_features[test]
            
            #First model
            model = clone(classifier_sparse)
            model.fit(traindata_sparse, y_train)
            result = model.predict(testdata_sparse)
            result_train=model.predict(traindata_sparse)
            
            
            #Adding features for the second model
            result=result.reshape(500,1) ###Asta pentru 10 k-folduri
            result_train=result_train.reshape(4500,1)
            testdata_dense=np.hstack((testdata_dense,result))
            traindata_dense=np.hstack((traindata_dense,result_train))

            
            model=clone(classifier_dense)
            model.fit(traindata_dense,y_train)
            result=model.predict(testdata_dense)
            score = accuracy_score(y_test, result)
            accuracy_scores.append(score)
            f1sc = f1_score(y_test, result, average='weighted')  
            print(f1sc)
            f1scores.append(f1sc)
            #auc = roc_auc_score(y_testdata, result, average='weighted', multi_class='ovr')
            #auc_scores.append(auc)
            if f1sc < min_acc:
                min_acc = f1sc 
                split_inidices = (train, test)
            confusion_matrices.append(np.array(confusion_matrix(y_test, result)))
    print ("min cv F1: ", min_acc)   
    return (accuracy_scores, f1scores, auc_scores, average_confusion(confusion_matrices), split_inidices)



### Logistic pe sparse si random forest pe dense



In [151]:
run_cv_2models(logistic,forest,tfidf_X,emb_array,10,X,Y)

---0---
0.9216487978931271
0.9299479147813425
0.9118511094281248
0.8390217162133672
0.9133822697022481
0.8916072727272727
0.7896024876040003
0.7940271921087685
0.8658954695469547
0.8475131014158499
min cv F1:  0.7896024876040003


([0.922, 0.93, 0.912, 0.84, 0.914, 0.892, 0.79, 0.794, 0.866, 0.848],
 [0.9216487978931271,
  0.9299479147813425,
  0.9118511094281248,
  0.8390217162133672,
  0.9133822697022481,
  0.8916072727272727,
  0.7896024876040003,
  0.7940271921087685,
  0.8658954695469547,
  0.8475131014158499],
 [0],
 array([[88.68431147, 14.66778103],
        [11.31568853, 85.33221897]]),
 (array([   0,    1,    2, ..., 4997, 4998, 4999]),
  array([1404, 1405, 1406, 1407, 1408, 1409, 1410, 1411, 1412, 1413, 1414,
         1415, 1416, 1417, 1418, 1419, 1420, 1421, 1422, 1423, 1424, 1425,
         1426, 1427, 1428, 1429, 1430, 1431, 1432, 1433, 1434, 1435, 1436,
         1437, 1438, 1439, 1440, 1441, 1442, 1443, 1444, 1445, 1446, 1447,
         1448, 1449, 1450, 1451, 1452, 1453, 1454, 1455, 1456, 1457, 1458,
         1459, 1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469,
         1470, 1471, 1472, 1473, 1474, 1475, 1476, 1477, 1478, 1479, 1480,
         1481, 1482, 1483, 1484, 1485, 1486, 1487, 1