## Reading the packages

In [1]:
import pandas as pd
import re # remove links
import unidecode
from nltk.corpus import stopwords
from alive_progress import alive_bar
from openpyxl import load_workbook

# 1. Xenophobia classifier

## Importing the Hate Speech words

In [22]:
def list_cleaning(word_list): 
    word_list = [x for x in word_list if str(x) != 'nan'] #remove nan
    word_list = [x.lower() for x in word_list] #to lowercase
    word_list_noaccent = [unidecode.unidecode(x) for x in word_list] #remove accents
    word_list = word_list +  word_list_noaccent #add the list without accents to the original list
    word_list = list(dict.fromkeys(word_list)) #remove duplicates

    return(word_list)

col_insults = pd.read_excel('./../../../Scripts VMASC/Social Network Inform Files/insultos_colombiamagica_gender.xlsx')

men_insults = col_insults['insults_man'].tolist() + col_insults['insults_men'].tolist()
women_insults = col_insults['insults_woman'].tolist() + col_insults['insults_women'].tolist()
nogender_insults = col_insults['insults_no_gender'].tolist() + col_insults['insults_no_gender_plural'].tolist()

#Removing the terms veneco, venecos, veneca and venecas for just taking to account the insults to venezuelans
men_insults.remove('Veneco')
men_insults.remove('Venecos')
women_insults.remove('Veneca')
women_insults.remove('Venecas')

#Include the original version (Upper case) of the insults and the lowercase version
men_insults = men_insults + list_cleaning(men_insults)
women_insults = list_cleaning(women_insults)
nogender_insults = list_cleaning(nogender_insults)



## Functions pre-reading

In [23]:
# HS or not HS classify function
def xenophobia_classify_gender(data, men_comp_vect, women_comp_vect, nongender_comp_vect, colname = 'text'):
    
    total_insults_men = []
    total_insults_women = []
    total_insults_nogender = []
    total_insults = []
    HS = []
    HS_men = []
    HS_women = []
    with alive_bar(len(data), force_tty = True) as bar: 
        for text in data['text']:
            splitted_text = text.split()
            n_insults_men = 0; n_insults_women = 0; n_insults_nogender = 0
            for word in splitted_text:
                if word in men_comp_vect: n_insults_men += 1
                if word in women_comp_vect: n_insults_women += 1 
                if word in nongender_comp_vect: n_insults_nogender += 1 
            total_insults_men.append(n_insults_men)
            total_insults_women.append(n_insults_women)
            total_insults_nogender.append(n_insults_nogender)
            
            n_insults = n_insults_men+n_insults_women+n_insults_nogender
            total_insults.append(n_insults)
            
            HS_i = 0; HS_men_i = 0; HS_women_i= 0
            if n_insults>0: HS_i= 1
            if n_insults_men > n_insults_women:
                HS_men_i = 1
            elif n_insults_men < n_insults_women:
                HS_women_i = 1
            
            HS.append(HS_i)
            HS_men.append(HS_men_i)
            HS_women.append(HS_women_i)
            bar()
    data['n_insults_men'] = total_insults_men
    data['n_insults_women'] = total_insults_women
    data['n_insults_nogender'] = total_insults_nogender
    data['total_insults'] = total_insults
    data['HS'] = HS
    data['HS_men'] = HS_men
    data['HS_women'] = HS_women

    return(data)

In [24]:
# Statistic analysis function
def xenophobia_analysis_gender(dataframe, output = 'summary'):
    import statistics
    import pandas as pd
    total = sum(dataframe['total_insults'])
    men = sum(dataframe['n_insults_men'])
    prop_men = round(100*men/total,3)
    women = sum(dataframe['n_insults_women'])
    prop_women = round(100*women/total,3)
    nogender = sum(dataframe['n_insults_nogender'])
    prop_nogender = round(100*nogender/total,3)

    if output == 'summary':
        print('Total insults in tweets: ', total, sep = "")
        print('Total male insults in tweets:', men, " (",prop_men ," %)", sep = "")
        print('Total female insults in tweets:', women, " (",prop_women ," %)", sep = "")
        print('Total nogender insults in tweets:', nogender, " (",prop_nogender ," %)", sep = "")
        print('Total HS tweets:', sum(dataframe['HS']))
        print('Mean of HS tweets:', 100*statistics.mean(dataframe['HS']),'%')

    elif output == 'dataframe':
        df = [{'n_total_insults':total, 'n_men_insults':men, 'n_women_insults':women, 'n_nogender_insults':nogender,
         'prop_men_insults':prop_men, 'prop_women_insults':prop_women, 'prop_nogender_insults':prop_nogender}]
        return(pd.DataFrame.from_dict(df))
    

## Reading the cleaned tweets 

### (see Social Network Data Extraction and Combining.ipynb)

In [25]:
tweets_data0 = pd.read_csv('./../data/texts/colombian_valid_tweets.csv')
tweets_data0['text'] = tweets_data0['text'].astype(str).apply(lambda x: x.lower())

#cleaning
# 1. Links
def link_removal (string):
    return(re.sub('http[s]?://\S+', '', string))
tweets_data0['text'] = tweets_data0['text'].apply(link_removal)

# 2. mentions
def mention_removal (string):
    return(re.sub('@\S+', '', string))
tweets_data0['text'] = tweets_data0['text'].apply(mention_removal)

# 3. stop words
stop_words = set(stopwords.words('spanish'))
def remove_stopwords (string):
    words = string.strip()
    words = [w for w in words if not w.lower() in stop_words]
    return(' '.join(words))
tweets_data0['text_clean'] = tweets_data0['text'].apply(mention_removal)

## Results

In [26]:
result_data = xenophobia_classify_gender(tweets_data0, men_insults, women_insults, nogender_insults, colname = 'text_clean')

|████████████████████████████████████████| 1664903/1664903 [100%] in 3:23.5 (8181.63/s)                                  ▇▇▅ 150134/1664903 [9%] in 16s (9125.8/s, eta: 2:46)  ▁▃▅ 210350/1664903 [13%] in 24s (8687.6/s, eta: 2:48)  ▆▄▂ 581572/1664903 [35%] in 1:13 (8009.2/s, eta: 2:15)  ▂▄▆ 996355/1664903 [60%] in 2:04 (8044.8/s, eta: 1:23)  ▅▇▇ 1326247/1664903 [80%] in 2:44 (8097.2/s, eta: 42s)  ▇▇▅ 1519610/1664903 [91%] in 3:06 (8166.3/s, eta: 18s) 


In [27]:
xenophobia_analysis_gender(result_data, output='summary')

Total insults in tweets: 60636
Total male insults in tweets:28185 (46.482 %)
Total female insults in tweets:13795 (22.751 %)
Total nogender insults in tweets:18656 (30.767 %)
Total HS tweets: 53934
Mean of HS tweets: 3.2394680050429363 %


In [28]:
# Hate speech proportion by year
xenophobia_results_year = result_data.groupby(result_data['date'].map(lambda x: x[0:4])).mean() #result_data['referred_to']
# Hate speech proportion by month
xenophobia_results_month = result_data.groupby(result_data['date'].map(lambda x: x[0:7])).mean() #result_data['referred_to']

with pd.ExcelWriter('./../analysis/xenophobia_results.xlsx') as writer:
    xenophobia_results_year.to_excel(writer, sheet_name= 'year')
    xenophobia_results_month.to_excel(writer, sheet_name= 'month')

In [29]:
# Hate speech count by year
xenophobia_counts_year = result_data.groupby(result_data['date'].map(lambda x: x[0:4])).sum() #result_data['referred_to']
# Hate speech counts by month
xenophobia_counts_month = result_data.groupby(result_data['date'].map(lambda x: x[0:7])).sum() #result_data['referred_to']

path = './../analysis/xenophobia_results.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    xenophobia_counts_year.to_excel(writer, sheet_name= 'counts - year')
    xenophobia_counts_month.to_excel(writer, sheet_name= 'counts - month')

## Results for only the tweets with veneco

In [32]:
#remove the texts that do not contain 'veneco'
tweets_with_veneco = tweets_data0[tweets_data0['text'].str.contains('veneco')]

In [33]:
result_data_veneco = xenophobia_classify_gender(tweets_with_veneco, men_insults, women_insults, nogender_insults, colname = 'text_clean')

|████████████████████████████████████████| 560089/560089 [100%] in 1:06.3 (8443.54/s)                                   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_insults_men'] = total_insults_men
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_insults_women'] = total_insults_women
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_insults_nogender'] = total_insults_nogender
A value is trying to be set on a copy of a slice from a DataFrame

In [34]:
xenophobia_analysis_gender(result_data_veneco, output='summary')

Total insults in tweets: 45813
Total male insults in tweets:22325 (48.731 %)
Total female insults in tweets:8547 (18.656 %)
Total nogender insults in tweets:14941 (32.613 %)
Total HS tweets: 40450
Mean of HS tweets: 7.22206649300379 %


In [36]:
# Hate speech proportion by year
xenophobia_results_year_veneco = result_data_veneco.groupby(result_data_veneco['date'].map(lambda x: x[0:4])).mean() #result_data['referred_to']
# Hate speech proportion by month
xenophobia_results_month_veneco = result_data_veneco.groupby(result_data_veneco['date'].map(lambda x: x[0:7])).mean() #result_data['referred_to']


path = './../analysis/xenophobia_results.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    xenophobia_results_year_veneco.to_excel(writer, sheet_name= 'year - veneco')
    xenophobia_results_month_veneco.to_excel(writer, sheet_name= 'month - veneco')

In [37]:
# Hate speech count by year
xenophobia_counts_year_veneco = result_data_veneco.groupby(result_data_veneco['date'].map(lambda x: x[0:4])).sum() #result_data['referred_to']
# Hate speech counts by month
xenophobia_counts_month_veneco = result_data_veneco.groupby(result_data_veneco['date'].map(lambda x: x[0:7])).sum() #result_data['referred_to']

path = './../analysis/xenophobia_results.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    xenophobia_counts_year_veneco.to_excel(writer, sheet_name= 'year counts - veneco')
    xenophobia_counts_month_veneco.to_excel(writer, sheet_name= 'month counts - veneco')

## Results for only the tweets with veneca

In [38]:
#remove the texts that do not contain 'veneca'
tweets_with_veneca = tweets_data0[tweets_data0['text'].str.contains('veneca')]

In [39]:
result_data_veneca = xenophobia_classify_gender(tweets_with_veneca, men_insults, women_insults, nogender_insults, colname = 'text_clean')

|████████████████████████████████████████| 157057/157057 [100%] in 15.4s (10205.06/s)                                   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_insults_men'] = total_insults_men
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_insults_women'] = total_insults_women
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['n_insults_nogender'] = total_insults_nogender
A value is trying to be set on a copy of a slice from a DataFrame

In [40]:
xenophobia_analysis_gender(result_data_veneca, output='summary')

Total insults in tweets: 7448
Total male insults in tweets:1703 (22.865 %)
Total female insults in tweets:4232 (56.821 %)
Total nogender insults in tweets:1513 (20.314 %)
Total HS tweets: 6642
Mean of HS tweets: 4.229037865233641 %


In [41]:
# Hate speech proportion by year
xenophobia_results_year_veneca = result_data_veneca.groupby(result_data_veneca['date'].map(lambda x: x[0:4])).mean() #result_data['referred_to']
# Hate speech proportion by month
xenophobia_results_month_veneca = result_data_veneca.groupby(result_data_veneca['date'].map(lambda x: x[0:7])).mean() #result_data['referred_to']


path = './../analysis/xenophobia_results.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    xenophobia_results_year_veneca.to_excel(writer, sheet_name= 'year - veneca')
    xenophobia_results_month_veneca.to_excel(writer, sheet_name= 'month - veneca')

In [42]:
# Hate speech count by year
xenophobia_counts_year_veneca = result_data_veneca.groupby(result_data_veneca['date'].map(lambda x: x[0:4])).sum() #result_data['referred_to']
# Hate speech counts by month
xenophobia_counts_month_veneca = result_data_veneca.groupby(result_data_veneca['date'].map(lambda x: x[0:7])).sum() #result_data['referred_to']

path = './../analysis/xenophobia_results.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    xenophobia_counts_year_veneca.to_excel(writer, sheet_name= 'year counts - veneca')
    xenophobia_counts_month_veneca.to_excel(writer, sheet_name= 'month counts - veneca')

---

#### Hate speech counts 2020-03

In [120]:
df = result_data
df = df[(df['date'] >= '2020-03-01') & (df['date'] <= '2020-03-31')]
xenophobia_counts_2020_03 = df.groupby([df['referred_to'],df['date']]).sum()

In [121]:
from openpyxl import load_workbook
path = './../analysis/xenophobia_results_divided.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    xenophobia_counts_2020_03.to_excel(writer, sheet_name= 'counts - 2020-03')

In [123]:
path = './../analysis/xenophobia_results_divided.xlsx'
book = load_workbook(path)

with pd.ExcelWriter(path, engine = 'openpyxl') as writer:
    writer.book = book
    df.to_excel(writer, sheet_name= '2020-03')

In [131]:
#!pyjsonviewer

C:\Users\joseph\anaconda3\lib\site-packages\pyjsonviewer\pyjsonviewer.py start!!


# For the VMASC MSVSCC 2023 paper

In [1]:
import pandas as pd
import re # remove links
import unidecode
from nltk.corpus import stopwords
from alive_progress import alive_bar

# 1. Xenophobia classifier

## Importing the Hate Speech words

In [8]:
def list_cleaning(word_list): 
    word_list = [x for x in word_list if str(x) != 'nan']
    word_list = [x.lower() for x in word_list] #to lowercase
    word_list_noaccent = [unidecode.unidecode(x) for x in word_list] 
    word_list = word_list +  word_list_noaccent
    
    hs_words = []
    [hs_words.append(x) for x in col_insults if x not in hs_words]
    return(word_list)

col_insults = pd.read_csv('./../../Social Network Inform Files/insultos_colombiamagica_gender.csv', encoding='iso-8859-1', sep = ';')

men_insults = col_insults['insults_man'].tolist() + col_insults['insults_men'].tolist()
women_insults = col_insults['insults_woman'].tolist() + col_insults['insults_women'].tolist()
nogender_insults = col_insults['insults_no_gender'].tolist() + col_insults['insults_no_gender_plural'].tolist()

# men_insults = list_cleaning(men_insults)
# women_insults = list_cleaning(women_insults)
# nogender_insults = list_cleaning(nogender_insults)
men_insults = ['veneco']
women_insults = ['veneca']
nogender_insults = list_cleaning(['venecos', 'venecas'])

## Functions pre-reading

In [9]:
# HS or not HS classify function
def xenophobia_classify_gender(data, men_comp_vect, women_comp_vect, nongender_comp_vect, colname = 'text'):
    
    total_insults_men = []
    total_insults_women = []
    total_insults_nogender = []
    total_insults = []
    HS = []
    HS_men = []
    HS_women = []
    with alive_bar(len(data), force_tty = True) as bar: 
        for text in data['text']:
            splitted_text = text.split()
            n_insults_men = 0; n_insults_women = 0; n_insults_nogender = 0
            for word in splitted_text:
                if word in men_comp_vect: n_insults_men += 1
                if word in women_comp_vect: n_insults_women += 1 
                if word in nongender_comp_vect: n_insults_nogender += 1 
            total_insults_men.append(n_insults_men)
            total_insults_women.append(n_insults_women)
            total_insults_nogender.append(n_insults_nogender)
            
            n_insults = n_insults_men+n_insults_women+n_insults_nogender
            total_insults.append(n_insults)
            
            HS_i = 0; HS_men_i = 0; HS_women_i= 0
            if n_insults>0: HS_i= 1
            if n_insults_men > n_insults_women:
                HS_men_i = 1
            elif n_insults_men < n_insults_women:
                HS_women_i = 1
            
            HS.append(HS_i)
            HS_men.append(HS_men_i)
            HS_women.append(HS_women_i)
            bar()
    data['n_insults_men'] = total_insults_men
    data['n_insults_women'] = total_insults_women
    data['n_insults_nogender'] = total_insults_nogender
    data['total_insults'] = total_insults
    data['HS'] = HS
    data['HS_men'] = HS_men
    data['HS_women'] = HS_women

    return(data)

In [10]:
# Statistic analysis function
def xenophobia_analysis_gender(dataframe, output = 'summary'):
    import statistics
    import pandas as pd
    total = sum(dataframe['total_insults'])
    men = sum(dataframe['n_insults_men'])
    prop_men = round(100*men/total,3)
    women = sum(dataframe['n_insults_women'])
    prop_women = round(100*women/total,3)
    nogender = sum(dataframe['n_insults_nogender'])
    prop_nogender = round(100*nogender/total,3)

    if output == 'summary':
        print('Total insults in tweets: ', total, sep = "")
        print('Total male insults in tweets:', men, " (",prop_men ," %)", sep = "")
        print('Total female insults in tweets:', women, " (",prop_women ," %)", sep = "")
        print('Total nogender insults in tweets:', nogender, " (",prop_nogender ," %)", sep = "")
        print('Total HS tweets:', sum(dataframe['HS']))
        print('Mean of HS tweets:', 100*statistics.mean(dataframe['HS']),'%')

    elif output == 'dataframe':
        df = [{'n_total_insults':total, 'n_men_insults':men, 'n_women_insults':women, 'n_nogender_insults':nogender,
         'prop_men_insults':prop_men, 'prop_women_insults':prop_women, 'prop_nogender_insults':prop_nogender}]
        return(pd.DataFrame.from_dict(df))
    

## Reading the cleaned tweets 

### (see Social Network Data Extraction and Combining.ipynb)

In [11]:
tweets_data0 = pd.read_csv('./../data/texts/colombian_valid_tweets.csv')
tweets_data0['text'] = tweets_data0['text'].astype(str).apply(lambda x: x.lower())

#cleaning
# 1. Links
def link_removal (string):
    return(re.sub('http[s]?://\S+', '', string))
tweets_data0['text'] = tweets_data0['text'].apply(link_removal)

# 2. mentions
def mention_removal (string):
    return(re.sub('@\S+', '', string))
tweets_data0['text'] = tweets_data0['text'].apply(mention_removal)

# 3. stop words
stop_words = set(stopwords.words('spanish'))
def remove_stopwords (string):
    words = string.strip()
    words = [w for w in words if not w.lower() in stop_words]
    return(' '.join(words))
tweets_data0['text_clean'] = tweets_data0['text'].apply(mention_removal)

In [12]:
result_data = xenophobia_classify_gender(tweets_data0, men_insults, women_insults, nogender_insults, colname = 'text_clean')

|████████████████████████████████████████| 1664903/1664903 [100%] in 11.2s (148119.98/s)                                 ▁▃▅ 741708/1664903 [45%] in 5s (153026.8/s, eta: 6s) 


In [13]:
xenophobia_analysis_gender(result_data, output='summary')

Total insults in tweets: 614810
Total male insults in tweets:247465 (40.251 %)
Total female insults in tweets:104335 (16.97 %)
Total nogender insults in tweets:263010 (42.779 %)
Total HS tweets: 581930
Mean of HS tweets: 34.95278703924493 %


#### Hate speech proportion by year

In [14]:
xenophobia_results_year = result_data.groupby(result_data['date'].map(lambda x: x[0:4])).mean() #result_data['referred_to']

#### Hate speech proportion by month

In [15]:
xenophobia_results_month = result_data.groupby(result_data['date'].map(lambda x: x[0:7])).mean() #result_data['referred_to']

In [16]:
with pd.ExcelWriter('./../analysis/xenophobia_results_divided[VMASC CONFERENCE].xlsx') as writer:
    xenophobia_results_year.to_excel(writer, sheet_name= 'year')
    xenophobia_results_month.to_excel(writer, sheet_name= 'month')