In [141]:
import numpy as np
import pandas as pd
import re
import nltk

from nltk.corpus import stopwords
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score,accuracy_score,confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multioutput import ClassifierChain
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import FeatureUnion
from sklearn.svm import SVC
import spacy
from sklearn.preprocessing import StandardScaler
from scipy import sparse



In [None]:
embeddings_index = {}
with open('glove/glove.twitter.27B.200d.txt', 'r',encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

In [24]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'\'', ' ', text)

    # Remove stopwords
    text = text.split()
    stops = set(stopwords.words('italian'))
    text = [w for w in text if not w in stops]
    #text = [w if w not in training_identity else training_identity[0] for w in text]
    text = " ".join(text)

    # Tokenize each word
    text = nltk.WordPunctTokenizer().tokenize(text)
    return text

In [197]:
training_identity = []
with open('AMI2020_TrainingSet\AMI2020_training_identityterms.txt') as f:
    for word in f.read().split():
        training_identity.append(word)

raw = pd.read_csv('AMI2020_TrainingSet\AMI2020_training_raw.tsv', sep='\t')
cleaned_text = list(map(clean_text, raw.text))
raw['clean'] = [' '.join(w) for w in cleaned_text]

In [9]:
def get_mean_embeddings(texts,embeddings):
    means = []
    dim = len(list(embeddings.values())[0])
    for text in texts :
        text = nltk.WordPunctTokenizer().tokenize(text)
        means.append(np.mean([embeddings[w] if w in embeddings else np.zeros(dim) for w in text], axis=0))
    return np.array(means)

In [255]:
def run_cv(k_fold, data, label):
    accuracy_scores = f1_scores = confusion_matrices = []
    labels = np.array(label)
    skf = StratifiedKFold(k_fold)
    cv_splits = skf.split(data,labels)
    min_inidices = ([],[])
    min_acc = 100
    media=0
    for train, test in cv_splits:
        traindata,y_train,= data[train],labels[train]
        testdata,y_test=data[test],labels[test]
        train_feature_matrix,test_feature_matrix=get_mean_embeddings(data[train],embeddings_index),\
                                                 get_mean_embeddings(data[test],embeddings_index)
        '''
        0.2% mai putin
        model = LogisticRegression(penalty='l2', dual=False, tol=0.0001,
                         C=0.6, fit_intercept=True, intercept_scaling=1.0,
                         solver = 'lbfgs', warm_start=False,
                         class_weight=None, random_state=None)
        '''
        model=SVC(kernel='rbf',C=1000,gamma=1e-3)
        model.fit(train_feature_matrix,y_train)
        result = model.predict(test_feature_matrix)
        score = accuracy_score(y_test, result)
        accuracy_scores.append(score)
        if score < min_acc:
            min_acc = score
            split_inidices = (train, test)
        f1sc = f1_score(y_test, result, average='weighted')
        media=media+f1sc
        print('f1score:',f1sc)
        f1_scores.append(f1sc)
    print (f'min cv acc:{min_acc}\nmedia:{media}')
    print(np.mean(f1_scores))


In [256]:
run_cv(10,raw['clean'],raw['misogynous'])



f1score: 0.8935227253419421
f1score: 0.8934570705536998
f1score: 0.857272679420994
f1score: 0.8511942446043165
f1score: 0.8917701238390093
f1score: 0.8691599430469862
f1score: 0.7660814349316576
f1score: 0.7722555603593217
f1score: 0.8236901160464185
f1score: 0.8277687728058382
min cv acc:0.766
media:8.446172670950185
0.8448086335475093
