### Info: This notebook is used for experiments and for prototyping the functions in utils/preprocessing.py. To prepare the full dataset, please run milestone_1/run_preprocessing.py.
# Imports

In [1]:
import os
from stanza.utils.conll import CoNLL
import pandas as pd
import stanza
import nltk
import re
from nltk.corpus import stopwords

  from .autonotebook import tqdm as notebook_tqdm


# Download 

In [2]:
# Fetch the NLTK resources first, then the Stanza models, in the same order as before.
for nltk_package in ('punkt', 'stopwords'):
    nltk.download(nltk_package)

# Both the English and the German Stanza models are downloaded here.
for stanza_language in ('en', 'de'):
    stanza.download(stanza_language)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/svengerloff/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/svengerloff/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 129MB/s]                     
2024-11-08 14:19:07 INFO: Downloaded file to /Users/svengerloff/stanza_resources/resources.json
2024-11-08 14:19:07 INFO: Downloading default packages for language: en (English) ...
2024-11-08 14:19:09 INFO: File exists: /Users/svengerloff/stanza_resources/en/default.zip
2024-11-08 14:19:11 INFO: Finished downloading models and saved to /Users/svengerloff/stanza_resources
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 101MB/s]                     
2024-11-08 14:19:11 INFO: Downloaded file to 

# Set Path

In [3]:
def _data_file(filename):
    """Return the normalized path of ``filename`` inside ``../data`` relative to the current working directory."""
    return os.path.normpath(os.path.join(os.getcwd(), '..', 'data', filename))


# Input CSV and the two output artefacts all live in the same data directory.
data_path = _data_file('edos_labelled_aggregated.csv')
output_path_conllu = _data_file('output.conllu')
output_path_df = _data_file('df.parquet')

# Load Data

In [4]:
# Read the labelled dataset from the CSV file at ``data_path``.
df = pd.read_csv(data_path)

# Display the first five rows as the cell output.
df.head(n=5)

Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,none,none,dev
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,none,none,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,none,none,train
3,sexism2022_english-13021,woman?,not sexist,none,none,train
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,none,none,dev


In [5]:
# Print a summary of the loaded frame: column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   rewire_id       20000 non-null  object
 1   text            20000 non-null  object
 2   label_sexist    20000 non-null  object
 3   label_category  20000 non-null  object
 4   label_vector    20000 non-null  object
 5   split           20000 non-null  object
dtypes: object(6)
memory usage: 937.6+ KB


# Prepare df

In [6]:
# Build the modelling frame in one expression:
#   * ``label`` is the binary target derived from ``label_sexist`` (sexist -> 1, not sexist -> 0),
#   * only the ``text``, ``label`` and ``split`` columns are kept, in that order.
# ``assign`` and column selection both return new frames, so no column of the loaded frame is
# overwritten and no in-place rename is applied to the result of a column selection.
df = (
    df.assign(label=df['label_sexist'].map({'sexist': 1, 'not sexist': 0}))
      [['text', 'label', 'split']]
)
df.head(2)

Unnamed: 0,text,label,split
0,"In Nigeria, if you rape a woman, the men rape ...",0,dev
1,"Then, she's a keeper. 😉",0,train


# Text preprocessing

In [7]:
# English stop words from NLTK, stored as a set.
stop_words = {*stopwords.words('english')}

# English Stanza pipeline with the tokenize, lemma and pos processors requested.
nlp_pipeline = stanza.Pipeline(lang='en', processors='tokenize,lemma,pos')

2024-11-08 14:19:18 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.9.0.json: 392kB [00:00, 145MB/s]                     
2024-11-08 14:19:18 INFO: Downloaded file to /Users/svengerloff/stanza_resources/resources.json
2024-11-08 14:19:18 INFO: Loading these models for language: en (English):
| Processor | Package           |
---------------------------------
| tokenize  | combined          |
| mwt       | combined          |
| pos       | combined_charlm   |
| lemma     | combined_nocharlm |

2024-11-08 14:19:18 INFO: Using device: cpu
2024-11-08 14:19:18 INFO: Loading: tokenize
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-11-08 14:19:19 INFO: Loading: mwt
  checkpoint = torch.load(filename, lambda storage, loc: storage)
2024-

In [8]:
# Two hand-written sentences kept as a tiny scratch frame for experiments.
data = {'text': ['I am learning data science.', 'Natural language processing is fun!']}
df_test = pd.DataFrame(data)

# Test with 10 rows
# ``head`` returns a slice of the parent frame; ``.copy()`` makes the sample an independent
# frame so that the ``lemma`` / ``pos`` columns assigned later in this cell are written to
# the sample itself rather than to a slice of another frame (no chained-assignment warning).
df = df.head(10).copy()
def clean_text(text):
    """Remove every literal ``[USER]`` placeholder from ``text`` and trim surrounding whitespace.

    Args:
        text: The raw input string.

    Returns:
        The string without ``[USER]`` occurrences and without leading/trailing whitespace.
        Whitespace inside the text is left as it is.
    """
    without_placeholders = re.sub(r'\[USER\]', '', text)
    return without_placeholders.strip()

def process_text(text):
    """Run the Stanza pipeline on the cleaned text and collect lemmas and universal POS tags.

    Args:
        text: The raw input string; it is passed through ``clean_text`` first.

    Returns:
        A tuple ``(doc, lemmas, pos_tags)`` where ``doc`` is the object returned by
        ``nlp_pipeline`` and the two lists hold ``word.lemma`` and ``word.upos`` for every
        word of every sentence, in document order.
    """
    doc = nlp_pipeline(clean_text(text))

    # Flatten the sentence/word nesting once, then project the two attributes.
    all_words = [word for sentence in doc.sentences for word in sentence.words]
    lemmas = [word.lemma for word in all_words]
    pos_tags = [word.upos for word in all_words]

    return doc, lemmas, pos_tags

# One (doc, lemmas, pos_tags) tuple per row; transpose them into three parallel tuples.
processed_rows = df['text'].apply(process_text)
docs, lemma_column, pos_column = zip(*processed_rows)
df['lemma'] = lemma_column
df['pos'] = pos_column

# Write every parsed document to a single CoNLL-U file, one after the other.
with open(output_path_conllu, 'w', encoding='utf-8') as f:
    for doc in docs:
        CoNLL.write_doc2conll(doc, f)

# Save df and read it

In [9]:
# Write the frame to parquet without the index, then read it back from the same path.
df.to_parquet(path=output_path_df, index=False)
df_loaded = pd.read_parquet(path=output_path_df)

# Display up to the first 30 rows of the reloaded frame as the cell output.
df_loaded.head(n=30)

Unnamed: 0,text,label,split,lemma,pos
0,"In Nigeria, if you rape a woman, the men rape ...",0,dev,"[in, Nigeria, ,, if, you, rape, a, woman, ,, t...","[ADP, PROPN, PUNCT, SCONJ, PRON, VERB, DET, NO..."
1,"Then, she's a keeper. 😉",0,train,"[then, ,, she, be, a, keeper, ., 😉]","[ADV, PUNCT, PRON, AUX, DET, NOUN, PUNCT, PUNCT]"
2,This is like the Metallica video where the poo...,0,train,"[this, be, like, the, Metallica, video, where,...","[PRON, AUX, ADP, DET, PROPN, NOUN, ADV, DET, A..."
3,woman?,0,train,"[woman, ?]","[NOUN, PUNCT]"
4,I bet she wished she had a gun,0,dev,"[I, bet, she, wish, she, have, a, gun]","[PRON, VERB, PRON, VERB, PRON, VERB, DET, NOUN]"
5,Unlicensed day care worker reportedly tells co...,0,train,"[Unlicensed, day, care, worker, reportedly, te...","[ADJ, NOUN, NOUN, NOUN, ADV, VERB, NOUN, PRON,..."
6,[USER] Leg day is easy. Hot girls who wear min...,1,train,"[leg, day, be, easy, ., hot, girl, who, wear, ...","[NOUN, NOUN, AUX, ADJ, PUNCT, ADJ, NOUN, PRON,..."
7,I don't know if you should avoid this one or e...,0,train,"[I, do, not, know, if, you, should, avoid, thi...","[PRON, AUX, PART, VERB, SCONJ, PRON, AUX, VERB..."
8,I get a new pussy every other week or whenever...,1,train,"[I, get, a, new, pussy, every, other, week, or...","[PRON, VERB, DET, ADJ, NOUN, DET, ADJ, NOUN, C..."
9,I agree with that but at the same time I know ...,1,dev,"[I, agree, with, that, but, at, the, same, tim...","[PRON, VERB, ADP, PRON, CCONJ, ADP, DET, ADJ, ..."


# Test Print 

In [10]:
# Print the whole CoNLL-U file that was written earlier.
with open(output_path_conllu, encoding='utf-8') as f:
    print(f.read())

# text = In Nigeria, if you rape a woman, the men rape you back!
# sent_id = 0
1	In	in	ADP	IN	_	0	_	_	start_char=0|end_char=2
2	Nigeria	Nigeria	PROPN	NNP	Number=Sing	1	_	_	start_char=3|end_char=10|SpaceAfter=No
3	,	,	PUNCT	,	_	2	_	_	start_char=10|end_char=11
4	if	if	SCONJ	IN	_	3	_	_	start_char=12|end_char=14
5	you	you	PRON	PRP	Case=Nom|Person=2|PronType=Prs	4	_	_	start_char=15|end_char=18
6	rape	rape	VERB	VBP	Mood=Ind|Number=Sing|Person=2|Tense=Pres|VerbForm=Fin	5	_	_	start_char=19|end_char=23
7	a	a	DET	DT	Definite=Ind|PronType=Art	6	_	_	start_char=24|end_char=25
8	woman	woman	NOUN	NN	Number=Sing	7	_	_	start_char=26|end_char=31|SpaceAfter=No
9	,	,	PUNCT	,	_	8	_	_	start_char=31|end_char=32
10	the	the	DET	DT	Definite=Def|PronType=Art	9	_	_	start_char=33|end_char=36
11	men	man	NOUN	NNS	Number=Plur	10	_	_	start_char=37|end_char=40
12	rape	rape	VERB	VBP	Mood=Ind|Number=Plur|Person=3|Tense=Pres|VerbForm=Fin	11	_	_	start_char=41|end_char=45
13	you	you	PRON	PRP	Case=Acc|Person=2|PronType=Prs	1