<a href="https://colab.research.google.com/github/MonMon120/test/blob/main/CNN_task_A.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout
from sklearn.model_selection import train_test_split

In [None]:
# Read the labelled dataset from the Google Drive mount into a DataFrame.
# NOTE(review): nothing visible in this notebook mounts Drive — confirm it is mounted before this cell runs.
DATA_PATH = '/content/drive/MyDrive/datasets/edos_labelled_aggregated.csv'
df = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the columns.
df.head()

Unnamed: 0,rewire_id,text,label_sexist,label_category,label_vector,split
0,sexism2022_english-9609,"In Nigeria, if you rape a woman, the men rape ...",not sexist,none,none,dev
1,sexism2022_english-16993,"Then, she's a keeper. 😉",not sexist,none,none,train
2,sexism2022_english-13149,This is like the Metallica video where the poo...,not sexist,none,none,train
3,sexism2022_english-13021,woman?,not sexist,none,none,train
4,sexism2022_english-966,I bet she wished she had a gun,not sexist,none,none,dev


In [None]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [None]:
# Download the NLTK resources that clean_text relies on:
#   - 'punkt' / 'punkt_tab': tokenizer models for word_tokenize. Recent NLTK
#     releases (3.9+) load 'punkt_tab' instead of the pickled 'punkt' models,
#     so both are requested to keep the notebook working across versions.
#   - 'stopwords': the English stop-word list.
#   - 'wordnet': the lexical database used by WordNetLemmatizer.
# nltk.download skips packages that are already up to date.
for resource in ('punkt', 'punkt_tab', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Lazily-built NLTK resources shared by every clean_text call, so the
# stop-word list is not rebuilt and the lemmatizer not re-created per row.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def clean_text(text):
    """Normalise one raw text into a space-separated string of lemmatised tokens.

    Steps, in order: strip punctuation, lowercase, tokenize with
    ``word_tokenize``, drop English stop words, lemmatise each remaining
    token with ``WordNetLemmatizer``, and join the tokens with single spaces.

    Args:
        text: The raw text. ``None`` and float NaN (how a missing CSV cell
            shows up in a pandas column) are treated as empty text; any
            other non-string value is converted with ``str``.

    Returns:
        The cleaned text as a single string; ``''`` for missing input.
    """
    # Missing values would otherwise make re.sub raise TypeError.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    # Remove punctuation and other symbols. \w keeps letters, digits and
    # underscores, so numbers are NOT removed by this pattern.
    # NOTE: punctuation is stripped before stop-word filtering, so a
    # contraction such as "she's" becomes "shes" and survives the filter.
    text = re.sub(r'[^\w\s]', '', str(text))
    # Convert to lowercase
    text = text.lower()

    stop_words, lemmatizer = _text_resources()
    # Tokenize, drop stop words, then lemmatise what is left.
    tokens = [
        lemmatizer.lemmatize(word)
        for word in word_tokenize(text)
        if word not in stop_words
    ]
    # Join tokens back into a string
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:
# One-hot encode the string label; the 'sexist' indicator column becomes the binary target.
dummy = pd.get_dummies(df['label_sexist'])

# Keep only the raw text and the binary target: attach the indicator columns,
# drop everything that is not needed, then give the target its final name.
unused_columns = ['label_sexist', 'not sexist', 'label_vector', 'rewire_id', 'label_category', 'split']
df2 = (
    pd.concat((df, dummy), axis=1)
    .drop(columns=unused_columns)
    .rename(columns={'sexist': 'label_sexist'})
)

df2.head()

Unnamed: 0,text,label_sexist
0,"In Nigeria, if you rape a woman, the men rape ...",False
1,"Then, she's a keeper. 😉",False
2,This is like the Metallica video where the poo...,False
3,woman?,False
4,I bet she wished she had a gun,False


In [None]:
# Run the cleaning pipeline over every row and store the result next to the raw text.
df2['cleaned_text'] = df2['text'].map(clean_text)

# Show the raw and the cleaned text side by side with the label for a quick visual check.
for heading, text_column in (("Original Dataset:", 'text'), ("\nCleaned Dataset:", 'cleaned_text')):
    print(heading)
    print(df2[[text_column, 'label_sexist']].head())

Original Dataset:
                                                text  label_sexist
0  In Nigeria, if you rape a woman, the men rape ...         False
1                            Then, she's a keeper. 😉         False
2  This is like the Metallica video where the poo...         False
3                                             woman?         False
4                     I bet she wished she had a gun         False

Cleaned Dataset:
                                        cleaned_text  label_sexist
0  nigeria rape woman men rape back nsfw nigeria ...         False
1                                        shes keeper         False
2  like metallica video poor mutilated bastard sa...         False
3                                              woman         False
4                                     bet wished gun         False


In [None]:
# Preview the frame now that it also carries the cleaned_text column.
df2.head(n=5)

Unnamed: 0,text,label_sexist,cleaned_text
0,"In Nigeria, if you rape a woman, the men rape ...",False,nigeria rape woman men rape back nsfw nigeria ...
1,"Then, she's a keeper. 😉",False,shes keeper
2,This is like the Metallica video where the poo...,False,like metallica video poor mutilated bastard sa...
3,woman?,False,woman
4,I bet she wished she had a gun,False,bet wished gun


In [None]:
# Preprocess the text data

texts = df2['cleaned_text'].values
# The target is boolean after encoding; cast it explicitly to floats for the loss.
labels = df2['label_sexist'].values.astype('float32')

max_words = 10000  # vocabulary cap passed to the tokenizer
maxlen = 100       # every sequence is padded/truncated to this many tokens

# Decide train/validation membership BEFORE fitting the tokenizer, so the
# vocabulary is learned from training rows only and nothing from the
# validation texts leaks into preprocessing. Splitting row indices with the
# same test_size and random_state selects the same rows as splitting the
# padded data directly.
indices = np.arange(len(texts))
train_idx, val_idx = train_test_split(indices, test_size=0.2, random_state=42)

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts[train_idx])
# Words unseen during fitting are dropped here (no OOV token is configured).
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
data = pad_sequences(sequences, maxlen=maxlen)

# Split the data into training and validation sets
x_train, x_val = data[train_idx], data[val_idx]
y_train, y_val = labels[train_idx], labels[val_idx]

# Define the CNN model
embedding_dim = 100
# The tokenizer only emits indices below max_words, and index 0 is used for
# padding, hence the +1 on the learned vocabulary size.
vocab_size = min(max_words, len(word_index) + 1)

model = Sequential()
# Token ids -> dense vectors
model.add(Embedding(vocab_size, embedding_dim, input_length=maxlen))
# 128 filters over windows of 5 consecutive tokens
model.add(Conv1D(128, 5, activation='relu'))
# Keep the strongest activation of each filter across the sequence
model.add(GlobalMaxPooling1D())
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
# Single sigmoid unit: probability of the positive ("sexist") class
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])


In [None]:
# Fit the CNN on the training split; the validation split is scored after every epoch.
history = model.fit(
    x_train,
    y_train,
    validation_data=(x_val, y_val),
    batch_size=120,
    epochs=2,
    shuffle=True,
)

# Report the final loss and accuracy on the validation split.
loss, accuracy = model.evaluate(x_val, y_val)
for metric_label, metric_value in (("Validation Loss:", loss), ("Validation Accuracy:", accuracy)):
    print(metric_label, metric_value)


Epoch 1/2
Epoch 2/2
Validation Loss: 0.38790345191955566
Validation Accuracy: 0.8347499966621399
