# Code to obtain relevant information for the keywords

In [12]:
#!pip install wordcloud

### Data Analysis

- Abuse: "1. threats, plans to harm and incitement", "dominance", "sexual_harassment", "objectification", "sexual-violence", "stereotyping-dominance", "misogyny-non-sexual-violence".
- Hate: "3. animosity", "2. derogation", "4. prejudiced discussions", "stereotype", "ideological-inequality", "misogyny-non-sexual-violence".
- Profanities: "Profanities" no aparece entre los términos listados.
- Violent: "1. threats, plans to harm and incitement", "sexual-violence".
- Sexually explicit: "sexual_harassment", "objectification", "sexual-violence".

In [20]:
import pandas as pd
import numpy as np

df = pd.read_csv('../data/transformed/cleaned_data.csv', delimiter = ';')

df['type'] = df['type'].replace( 'none' , None)
df['type'] = df['type'].replace('0', None)
df['type'] = df['type'].replace('non-sexist', None)
df['type'] = df['type'].replace('NaN', None)
df['type'] = df['type'].replace(np.nan, None)

df['type'] = df['type'].replace('dominance', 'Abuse')
df['type'] = df['type'].replace('stereotyping-dominance', 'Abuse')
df['type'] = df['type'].replace('derailing', 'Abuse')

df['type'] = df['type'].replace('2. derogation', 'Hate')
df['type'] = df['type'].replace('3. animosity', 'Hate')
df['type'] = df['type'].replace('4. prejudiced discussions', 'Hate')
df['type'] = df['type'].replace('ideological-inequality', 'Hate')

df['type'] = df['type'].replace('stereotype', 'Profanities')
df['type'] = df['type'].replace('misogyny-non-sexual-violence', 'Profanities')
df['type'] = df['type'].replace('discredit', 'Profanities')

df['type'] = df['type'].replace('misogyny-non-sexual-violence', 'Violent')
df['type'] = df['type'].replace('1. threats, plans to harm and incitement', 'Violent')
df['type'] = df['type'].replace('sexual-violence', 'Violent')

df['type'] = df['type'].replace('sexual_harassment', 'Sexually explicit')
df['type'] = df['type'].replace('objectification', 'Sexually explicit')

print(df['type'].unique())


subdf = df[df['label'] == 1]
df_es = subdf[subdf['language'] == 'es']
df_en = subdf[subdf['language'] == 'en']

print('Datos sexistas en inglés: ', len(df_en), '\nDatos sexistas en español: ', len(df_es))

[None 'Hate' 'Violent' 'Abuse' 'Sexually explicit' 'Profanities']
Datos sexistas en inglés:  13261 
Datos sexistas en español:  5276


In [36]:
#import nltk
nltk.download('punkt')
import string
import nltk
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize

def remove_punctuation_and_numbers(tokens):
    return " ".join([ token for token in tokens.split() if token not in string.punctuation and not token.isdigit()])
 
def remove_stopwords(tokens, lang):
    return " ".join([palabra for palabra in tokens.split() if palabra.lower() not in stopwords.words(lang)])

def deepclean(texts, lang):
    texts = texts.apply(lambda x: " ".join(nltk.word_tokenize(x.lower(), language=lang)))
    texts = texts.apply(remove_punctuation_and_numbers)
    texts = texts.apply(remove_stopwords, lang=lang)
    return texts

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\alvar\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [39]:
df_en.text = deepclean(df_en.text, "english")
df_es.text = deepclean(df_es.text, "spanish")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_en.text = deepclean(df_en.text, "english")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_es.text = deepclean(df_es.text, "spanish")


## Wordcloud en español general

In [None]:
#from wordcloud import WordCloud
#import matplotlib.pyplot as plt

def wordcloud(text):
    wordcloud = WordCloud(width=800, height=800, background_color='black').generate(text)
    plt.figure(figsize=(8,8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.tight_layout(pad=0)
    plt.show()

wordcloud(' '.join(df_es['clean_text']))

## Wordcloud por tipo

In [None]:
print('\nAbuse\n')
wordcloud(' '.join(df_es[df['type'] == 'Abuse']['clean_text']))
print('\nHate\n')
wordcloud(' '.join(df_es[df['type'] == 'Hate']['clean_text']))
print('\nProfanities\n')
wordcloud(' '.join(df_es[df['type'] == 'Profanities']['clean_text']))
print('\nViolent\n')
wordcloud(' '.join(df_es[df['type'] == 'Violent']['clean_text']))
print('\nSexually explicit\n')
wordcloud(' '.join(df_es[df['type'] == 'Sexually explicit']['clean_text']))

## Keyword selection by TF-IDF

In [42]:
from sklearn.feature_extraction.text import TfidfVectorizer

def word_freq_by_sexism(df, num, language):
    # Inicializar el diccionario de frecuencias
    freq_dict = {}

    # Obtener los tipos de sexismo únicos
    sexisms = ['Abuse','Hate', 'Profanities', 'Violent', 'Sexually explicit']

    # Crear un objeto TfidfVectorizer para generar la matriz TF-IDF
    vectorizer = TfidfVectorizer(stop_words = stopwords.words(language), lowercase=True)

    # Iterar por los tipos de sexismo y calcular las frecuencias de palabras correspondientes
    for sexism in sexisms:
        # Crear una submatriz TF-IDF que contenga solo los textos correspondientes a este tipo de sexismo
        sub_df = df[df['type'] == sexism]
        tfidf_matrix = vectorizer.fit_transform(sub_df['text'])

        # Obtener la lista de palabras en orden de importancia según su valor TF-IDF
        word_importance = [(word, tfidf_matrix.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items()]
        word_importance = sorted(word_importance, key=lambda x: -x[1])

        # Tomar las num palabras más importantes
        top_words = [word for word, score in word_importance[:num]]

        # Agregar las palabras más importantes al diccionario
        freq_dict[sexism] = top_words

    # Devolver el diccionario de frecuencias
    return freq_dict

In [43]:
keywords_es = word_freq_by_sexism(df_es, 15, 'spanish')
keywords_es

{'Abuse': ['perra',
  'cállate',
  'mujer',
  'puta',
  'callate',
  'mujeres',
  'si',
  'ser',
  'volante',
  'tonta',
  'sumisa',
  'rubia',
  'florero',
  'zorra',
  'así'],
 'Hate': ['mujeres',
  'feminismo',
  'si',
  'feministas',
  'mujer',
  'hombres',
  'hembrismo',
  'ser',
  'daño',
  'hombre',
  'feminazi',
  'igualdad',
  'solo',
  'todas',
  'hace'],
 'Profanities': ['puta',
  'perra',
  'zorra',
  'callate',
  'mujeres',
  'cállate',
  'mujer',
  'si',
  'mereces',
  'hija',
  'guarra',
  'ser',
  'mierda',
  'novia',
  'madre'],
 'Violent': ['follar',
  'sexual',
  'mujer',
  'mujeres',
  'años',
  'acoso',
  'violar',
  'violación',
  'si',
  'gustaría',
  'hombre',
  'ser',
  'siempre',
  'tetas',
  'acusado'],
 'Sexually explicit': ['tetas',
  'culo',
  'puta',
  'perra',
  'si',
  'mujer',
  'mujeres',
  'polla',
  'falda',
  'corta',
  'ser',
  'zorra',
  'coño',
  'siempre',
  'mereces']}

In [45]:
import json

with open('../data/extracted/keywords_es.json', 'w', encoding='utf-8') as f:
    json.dump(keywords_es, f, ensure_ascii=False, indent=4)

In [44]:
keywords_en = word_freq_by_sexism(df_en, 15, 'english')
keywords_en

{'Abuse': ['women',
  'bitch',
  'like',
  'men',
  'woman',
  'fuck',
  'shut',
  'stfu',
  'rape',
  'get',
  'dumb',
  'know',
  'man',
  'want',
  'gold'],
 'Hate': ['women',
  'men',
  'like',
  'bitch',
  'woman',
  'get',
  'would',
  'fuck',
  'female',
  'want',
  'man',
  'fucking',
  'know',
  'girls',
  'one'],
 'Profanities': ['bitch',
  'women',
  'whore',
  'like',
  'stupid',
  'hoe',
  'cunt',
  'woman',
  'ass',
  'fuck',
  'fucking',
  'girl',
  'bitches',
  'skank',
  'get'],
 'Violent': ['women',
  'like',
  'cock',
  'would',
  'woman',
  'get',
  'ass',
  'rape',
  'fuck',
  'want',
  'bitch',
  'sexual',
  'spank',
  'fucking',
  'gangbanged'],
 'Sexually explicit': ['bitch',
  'like',
  'dick',
  'look',
  'whore',
  'ass',
  'women',
  'fuck',
  'cunt',
  'pussy',
  'rape',
  'woman',
  'suck',
  'cock',
  'girl']}

In [46]:
with open('../data/extracted/keywords_en.json', 'w', encoding='utf-8') as f:
    json.dump(keywords_en, f, ensure_ascii=False, indent=4)

## Keyword Selection with Machine Learning 

To be implemented... (if required)