<a href="https://colab.research.google.com/github/MonMon120/test/blob/main/CNN_task_B.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split

In [None]:
# Read the aggregated, labelled EDOS CSV (stored under the Colab Drive path) into a DataFrame
csv_path = '/content/drive/MyDrive/datasets/edos_labelled_aggregated.csv'
df = pd.read_csv(csv_path)
# Preview the first rows
df.head()

Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,none,none,dev
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,none,none,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,none,none,train
3,sexism2022_english-13021,woman?,not sexist,none,none,train
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,none,none,dev


In [None]:
# List the distinct category strings found in the raw labels
category_column = df['label_category']
category_column.unique()

array(['none', '3. animosity', '2. derogation',
       '4. prejudiced discussions',
       '1. threats, plans to harm and incitement'], dtype=object)

In [None]:
import re
import string

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from tensorflow.keras.utils import set_random_seed, to_categorical

In [None]:
# Download NLTK resources
# Newer NLTK releases make word_tokenize load 'punkt_tab' instead of the older
# 'punkt' data, so both are requested to keep the notebook runnable on either.
for nltk_resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(nltk_resource)

# Lazily-filled cache: the stopword list and the lemmatizer are created once
# instead of once per row when clean_text is applied to a whole column.
_CLEAN_TEXT_CACHE = {}


def _clean_text_resources():
    """Return the shared (stop_words, lemmatizer) pair, building it on first use."""
    if not _CLEAN_TEXT_CACHE:
        _CLEAN_TEXT_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _CLEAN_TEXT_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _CLEAN_TEXT_CACHE['stop_words'], _CLEAN_TEXT_CACHE['lemmatizer']


def clean_text(text):
    """Normalise one piece of text for tokenisation.

    Steps: strip punctuation, lowercase, tokenize, drop English stopwords,
    lemmatize each remaining token, and join the tokens with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned, space-joined string (may be empty).

    Notes:
        * Only characters that are neither word characters nor whitespace are
          removed, so digits and underscores are kept.
        * Punctuation is stripped before stopword filtering, so a contraction
          such as "she's" becomes "shes" and is not matched by a stopword
          entry that contains an apostrophe.
    """
    stop_words, lemmatizer = _clean_text_resources()
    # Remove punctuation (anything that is not a word character or whitespace)
    text = re.sub(r'[^\w\s]', '', text)
    # Convert to lowercase and tokenize
    tokens = word_tokenize(text.lower())
    # Drop stopwords and lemmatize what is left in a single pass
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join tokens back into a string
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


In [None]:
# Translate each category string into the integer given by its leading number
# ('none' becomes 0)
label_mapping = {
    'none': 0,
    '1. threats, plans to harm and incitement': 1,
    '2. derogation': 2,
    '3. animosity': 3,
    '4. prejudiced discussions': 4,
}

category_strings = df['label_category']
df['numeric_labels'] = category_strings.map(label_mapping)

df.head()

Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split,numeric_labels
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,none,none,dev,0
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,none,none,train,0
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,none,none,train,0
3,sexism2022_english-13021,woman?,not sexist,none,none,train,0
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,none,none,dev,0


In [None]:
# Keep only the raw text and its numeric category id; the id column is
# stored under the name 'label_sexist'
df2 = df[['text', 'numeric_labels']].rename(columns={'numeric_labels': 'label_sexist'})

df2.head()

Unnamed: 0,text,label_sexist
0,"In Nigeria, if you rape a woman, the men rape ...",0
1,"Then, she's a keeper. 😉",0
2,This is like the Metallica video where the poo...,0
3,woman?,0
4,I bet she wished she had a gun,0


In [None]:
# List the distinct numeric category ids present in the trimmed frame
label_ids = df2['label_sexist']
label_ids.unique()

array([0, 3, 2, 4, 1])

In [None]:
# Run every raw post through clean_text and keep the result in a new column
df2['cleaned_text'] = df2['text'].map(clean_text)

# Show the first rows before and after cleaning, each next to its label
original_preview = df2[['text', 'label_sexist']].head()
cleaned_preview = df2[['cleaned_text', 'label_sexist']].head()
print("Original Dataset:")
print(original_preview)
print("\nCleaned Dataset:")
print(cleaned_preview)

Original Dataset:
                                                text  label_sexist
0  In Nigeria, if you rape a woman, the men rape ...             0
1                            Then, she's a keeper. 😉             0
2  This is like the Metallica video where the poo...             0
3                                             woman?             0
4                     I bet she wished she had a gun             0

Cleaned Dataset:
                                        cleaned_text  label_sexist
0  nigeria rape woman men rape back nsfw nigeria ...             0
1                                        shes keeper             0
2  like metallica video poor mutilated bastard sa...             0
3                                              woman             0
4                                     bet wished gun             0


In [None]:
df2.head()

Unnamed: 0,text,label_sexist,cleaned_text
0,"In Nigeria, if you rape a woman, the men rape ...",0,nigeria rape woman men rape back nsfw nigeria ...
1,"Then, she's a keeper. 😉",0,shes keeper
2,This is like the Metallica video where the poo...,0,like metallica video poor mutilated bastard sa...
3,woman?,0,woman
4,I bet she wished she had a gun,0,bet wished gun


In [None]:
# Preprocess the text data
texts = df2['cleaned_text'].values
# Integer category ids (0-4); kept separately so the split can be stratified
class_ids = df2['label_sexist'].values

max_words = 10000   # Tokenizer keeps only the most frequent words below this index
maxlen = 100        # Maximum sequence length
num_classes = 5     # 'none' plus the four sexism categories
random_seed = 42

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation and batch
# shuffling are repeatable between runs (hardware-level nondeterminism may remain).
set_random_seed(random_seed)

# NOTE: the vocabulary is fitted on every row, including rows that end up in
# the validation split below.
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=maxlen)

# One-hot encode the class ids for categorical cross-entropy
labels = to_categorical(class_ids, num_classes=num_classes)

# Split the data into training and validation sets; stratifying on the class id
# keeps the class proportions the same in both parts.
x_train, x_val, y_train, y_val = train_test_split(
    data, labels,
    test_size=0.2,
    random_state=random_seed,
    stratify=class_ids,
)

# Define CNN model with multi-class output
embedding_dim = 100
# Token indices produced by the tokenizer are always below this value
vocab_size = min(max_words, len(word_index) + 1)

# Define the CNN model
model = Sequential()
# `input_length` is deprecated in Keras 3 and not needed: the sequence length
# is inferred from the padded input when the model is first called.
model.add(Embedding(vocab_size, embedding_dim))
model.add(Conv1D(128, 5, activation='relu'))
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))

# Compile the model
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])


In [None]:
# Fit the CNN for three epochs, monitoring the held-out split after each one
validation_pair = (x_val, y_val)
history = model.fit(
    x_train,
    y_train,
    validation_data=validation_pair,
    batch_size=120,
    epochs=3,
)

# Score the trained model on the held-out split and report both metrics
loss, accuracy = model.evaluate(*validation_pair)
print("Validation Loss:", loss)
print("Validation Accuracy:", accuracy)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Validation Loss: 0.7164257764816284
Validation Accuracy: 0.7799999713897705
