In [42]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv


In [5]:
pip install emoji --upgrade

Note: you may need to restart the kernel to use updated packages.


In [None]:
!pip install transformers torch datasets

In [6]:
import re
import emoji
import string
from textblob import TextBlob

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch
from datasets import Dataset

# Dataset

In [43]:
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

In [8]:
# Set the maximum column width to a large number for all columns
pd.set_option('display.max_colwidth', None)

In [9]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [46]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             3263 non-null   int64 
 1   keyword        3263 non-null   object
 2   location       2158 non-null   object
 3   text           3263 non-null   object
 4   hashtags       3263 non-null   object
 5   combined_text  3263 non-null   object
 6   final_text     3263 non-null   object
dtypes: int64(1), object(6)
memory usage: 178.6+ KB


# Dataset preprocessing

1. Removing emojis, urls, usernames, duplicates, NaN values, hashtags
2. Lowercasing characters
3. Joining hashtags and keywords nad text together

In [45]:
# Preproceanyssing function
def preprocess_text(df):
    # Make dataset lowercase
    df["text"] = df["text"].str.lower()
    
    # Create new table to pass their hashtags
    df["hashtags"] = df["text"].apply(extract_hashtags)
    
    # Remove hashtags
    df["text"] = df["text"].apply(remove_hashtags)
    
    # Remove url links
    df["text"] = df["text"].apply(remove_url_username)
    
    # Remove emojis
    df["text"] = df["text"].apply(remove_emojis)
    
    # Remove special characters
    df['text'] = df['text'].apply(remove_special_characters)
    
    # Remove duplicates
    #df = df.drop_duplicates(subset=['text'], keep='first')
    
    df["combined_text"] = df.apply(append_hashtags_text, axis=1)
    
    # Remove NaN values from keywords
    df['keyword'] = data['keyword'].fillna('')
    
    # Concatenate combined_text and keyword
    df['final_text'] = np.where(
        df['keyword'].str.strip() != '',
        "Keyword: " + df['keyword'] + " " + df['combined_text'],
        df['combined_text']
    )
    
    return df


# Extract hashtags
def extract_hashtags(text):
    return re.findall(r"#(\w+)", text)

# Function to remove hashtags from text
def remove_hashtags(text):
    return re.sub(r"#(\w+)", "", text).strip()

def remove_url_username(text):
    url_pattern = r"http[s]?://\S+"
    return re.sub(url_pattern, "", text).strip()

# Remove emojis from text
def remove_emojis(text):
    return emoji.replace_emoji(text, "").strip()

# Remove special characters
def remove_special_characters(text):
    return text.translate(str.maketrans('', '', string.punctuation))

# Append hashtags to text
def append_hashtags_text(row):
    hashtags = ", ".join(row['hashtags'])  # Join list with spaces
    text = row['text'].strip()
    if hashtags:
        return f"Hashtags: {hashtags}. Text: {row['text']}"
    else:
        return f"Text: {text}"


data = preprocess_text(data)
test_data = preprocess_text(test_data)


In [12]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 6871 entries, 0 to 7606
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          6871 non-null   int64 
 1   keyword     6871 non-null   object
 2   location    4616 non-null   object
 3   target      6871 non-null   int64 
 4   hashtags    6871 non-null   object
 5   final_text  6871 non-null   object
dtypes: int64(2), object(4)
memory usage: 633.8+ KB


In [13]:
pd.reset_option('display.max_colwidth')

# BERT model

In [19]:
data.head()

Unnamed: 0,id,keyword,location,target,hashtags,final_text
0,1,,,1,[earthquake],Hashtags: earthquake. Text: our deeds are the ...
1,4,,,1,[],Text: forest fire near la ronge sask canada
2,5,,,1,[],Text: all residents asked to shelter in place ...
3,6,,,1,[wildfires],Hashtags: wildfires. Text: 13000 people receiv...
4,7,,,1,"[alaska, wildfires]","Hashtags: alaska, wildfires. Text: just got se..."


## Splitting data

In [20]:
data = data[['final_text', 'target']].dropna()

# Split the data
train_texts, val_texts, train_labels, val_labels = train_test_split(
    data['final_text'].tolist(),
    data['target'].tolist(),
    test_size=0.2,
    random_state=42
)

## Load the model

In [None]:
test_data = test_data[['id', 'final_text']].dropna()

In [21]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels}).map(tokenize_function, batched=True)
val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels}).map(tokenize_function, batched=True)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]

Map:   0%|          | 0/1375 [00:00<?, ? examples/s]

In [22]:
os.environ["WANDB_DISABLED"] = "true"

In [24]:
# Load the model
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch"
)

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


## Train the model

In [25]:
# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset
)

# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.4,0.462233
2,0.4518,0.451054
3,0.277,0.552684


TrainOutput(global_step=1032, training_loss=0.38063802056072293, metrics={'train_runtime': 238.0958, 'train_samples_per_second': 69.249, 'train_steps_per_second': 4.334, 'total_flos': 1084543770193920.0, 'train_loss': 0.38063802056072293, 'epoch': 3.0})

In [26]:
trainer.evaluate()

{'eval_loss': 0.5526835322380066,
 'eval_runtime': 5.4011,
 'eval_samples_per_second': 254.578,
 'eval_steps_per_second': 4.073,
 'epoch': 3.0}

In [49]:
test_data.isnull().all()

id            False
keyword       False
location      False
hashtags      False
final_text    False
dtype: bool

In [54]:
test_data.head()

Unnamed: 0,id,keyword,location,hashtags,final_text
0,0,,,[],Text: just happened a terrible car crash
1,2,,,[earthquake],Hashtags: earthquake. Text: heard about is di...
2,3,,,[],Text: there is a forest fire at spot pond gees...
3,9,,,"[spokane, wildfires]","Hashtags: spokane, wildfires. Text: apocalypse..."
4,11,,,[],Text: typhoon soudelor kills 28 in china and t...


In [55]:
test_data = test_data[['final_text']].dropna()

## Testing data

In [1]:
# We need convert ids to string to put on a table
test_data['id'] = test_data['id'].astype(str)

# Taking final_text and id
test_texts = test_data['final_text'].tolist()
test_ids = test_data['id'].tolist()

# Tokenize the test data
test_dataset = Dataset.from_dict({'id': test_ids, 'text': test_texts}).map(tokenize_function, batched=True)

# Get predictions
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

results_df = pd.DataFrame({
    'id': test_ids,
    'target': predicted_labels,
})

print(results_df.head())

results_df.to_csv('predictions_sent_first_s.csv', index=False)

NameError: name 'test_data' is not defined

In [48]:
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          3263 non-null   int64 
 1   final_text  3263 non-null   object
dtypes: int64(1), object(1)
memory usage: 51.1+ KB
