In [ ]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input tree and print the full path of every file found.
input_root = '/kaggle/input'
for dirname, _, filenames in os.walk(input_root):
    for filename in filenames:
        full_path = os.path.join(dirname, filename)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [ ]:
import re
import emoji
import string
import pandas as pd
import torch

from textblob import TextBlob
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Dataset

In [ ]:
# Load the competition's training and test splits.
data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')
# Show text columns in full when DataFrames are displayed; None disables
# truncation. (A following reset_option call would restore the default and
# cancel this setting, so it is intentionally not issued here.)
pd.set_option('display.max_colwidth', None)

# Dataset preprocessing

1. Removing emojis, urls, usernames, duplicates, NaN values, hashtags
2. Lowercasing characters
3. Joining hashtags, keywords and text together

In [ ]:
# Preprocessing function
def preprocess_text(df, *, drop_duplicates=True):
    """Clean the tweet text in *df* and build the ``final_text`` model input.

    Steps, in order:
      1. lowercase ``text``;
      2. copy the hashtags found in ``text`` into a ``hashtags`` column;
      3. strip hashtags, URLs, emojis and punctuation from ``text``;
      4. optionally drop rows whose cleaned ``text`` repeats an earlier row;
      5. build ``combined_text`` from the hashtags and the cleaned text;
      6. replace missing ``keyword`` values with ``''``;
      7. build ``final_text``: ``combined_text`` prefixed with the keyword
         when the keyword is not blank.

    Args:
        df: DataFrame with at least ``text`` and ``keyword`` columns.
        drop_duplicates: When True (the default), keep only the first row for
            each distinct cleaned ``text``. Pass False to keep every row,
            e.g. when each input row must yield a prediction.

    Returns:
        A new DataFrame with the added ``hashtags``, ``combined_text`` and
        ``final_text`` columns. The input frame is left unmodified.
    """
    # Work on a copy so the caller's DataFrame is not mutated.
    df = df.copy()

    # Make dataset lowercase
    df["text"] = df["text"].str.lower()

    # Keep the hashtags in their own column before they are stripped from the text
    df["hashtags"] = df["text"].apply(extract_hashtags)

    # Remove hashtags
    df["text"] = df["text"].apply(remove_hashtags)

    # Remove url links
    df["text"] = df["text"].apply(remove_url_username)

    # Remove emojis
    df["text"] = df["text"].apply(remove_emojis)

    # Remove special characters
    df["text"] = df["text"].apply(remove_special_characters)

    # Remove duplicates (rows whose cleaned text was already seen)
    if drop_duplicates:
        df = df.drop_duplicates(subset=["text"], keep="first")

    df["combined_text"] = df.apply(append_hashtags_text, axis=1)

    # Fill missing keywords from this frame's own column. Reading another
    # frame here would align keywords by index with unrelated rows.
    df["keyword"] = df["keyword"].fillna("")

    # Concatenate combined_text and keyword
    has_keyword = df["keyword"].str.strip() != ""
    df["final_text"] = np.where(
        has_keyword,
        "Keyword: " + df["keyword"] + " " + df["combined_text"],
        df["combined_text"],
    )

    return df


# Extract hashtags
def extract_hashtags(text):
    """Return the word following each ``#`` in *text*, in order of appearance."""
    return [match.group(1) for match in re.finditer(r"#(\w+)", text)]

# Function to remove hashtags from text
def remove_hashtags(text):
    """Delete every ``#word`` token from *text* and trim surrounding whitespace."""
    without_tags = re.sub(r"#(\w+)", "", text)
    return without_tags.strip()

def remove_url_username(text):
    """Strip http(s) URLs and ``@username`` mentions from *text*.

    Args:
        text: The string to clean.

    Returns:
        *text* with URLs and mentions deleted and outer whitespace trimmed.
        Whitespace left between the remaining words is not collapsed.
    """
    url_pattern = r"http[s]?://\S+"
    # The lookbehind only treats '@' as a mention when it does not directly
    # follow a word character, so e-mail-like tokens are left alone.
    username_pattern = r"(?<!\w)@\w+"
    # URLs go first so that an '@' inside a link is removed with the link.
    text = re.sub(url_pattern, "", text)
    text = re.sub(username_pattern, "", text)
    return text.strip()

# Remove emojis from text
def remove_emojis(text):
    """Replace every emoji in *text* with an empty string and trim whitespace."""
    without_emojis = emoji.replace_emoji(text, "")
    return without_emojis.strip()

# Remove special characters
def remove_special_characters(text):
    """Delete every ASCII punctuation character (``string.punctuation``) from *text*."""
    punctuation_table = str.maketrans('', '', string.punctuation)
    return text.translate(punctuation_table)

# Append hashtags to text
def append_hashtags_text(row):
    """Build the combined hashtag/text string for one row.

    Args:
        row: Mapping (e.g. a DataFrame row) with a ``'hashtags'`` list of
            strings and a ``'text'`` string.

    Returns:
        ``"Hashtags: a, b. Text: <text>"`` when the row has hashtags,
        otherwise ``"Text: <text>"``. ``<text>`` is whitespace-trimmed in
        both cases.
    """
    hashtags = ", ".join(row['hashtags'])  # Comma-separated hashtag list
    text = row['text'].strip()
    if hashtags:
        # Use the trimmed text here too, so both branches format identically.
        return f"Hashtags: {hashtags}. Text: {text}"
    return f"Text: {text}"


# Run the cleaning pipeline on the training frame first, then on the test
# frame, rebinding each name to the DataFrame that preprocess_text returns.
# NOTE(review): preprocess_text drops rows whose cleaned text duplicates an
# earlier row; applied to test_data this removes those ids from the eventual
# predictions — confirm that is acceptable for the submission.
data = preprocess_text(data)
test_data = preprocess_text(test_data)

## Splitting data

In [None]:
# Keep only the model input and its label, discarding incomplete rows.
data = data[['final_text', 'target']].dropna()

# Hold out 20% of the rows for validation, with a fixed seed for repeatability.
all_texts = data['final_text'].tolist()
all_labels = data['target'].tolist()
train_texts, val_texts, train_labels, val_labels = train_test_split(
    all_texts, all_labels, test_size=0.2, random_state=42
)

In [ ]:
# Keep just the row id and the model input text, dropping incomplete rows.
test_columns = ['id', 'final_text']
test_data = test_data[test_columns].dropna()

In [ ]:
# Load the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
def tokenize_function(examples):
    """Tokenize a batch of examples for BERT.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(..., batched=True)``.
            The input strings are read from its ``'text'`` column, which is
            the key the Datasets in this notebook are built with.

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.
    """
    # Must read 'text': the Datasets have no 'final_text' column, so the
    # previous lookup under that name raised KeyError inside .map().
    return tokenizer(examples['text'], padding="max_length", truncation=True, max_length=128)

# Wrap each split in a Dataset and tokenize it in batches before loading it into the model
train_dataset = Dataset.from_dict({'text': train_texts, 'label': train_labels})
train_dataset = train_dataset.map(tokenize_function, batched=True)

val_dataset = Dataset.from_dict({'text': val_texts, 'label': val_labels})
val_dataset = val_dataset.map(tokenize_function, batched=True)


In [ ]:
# Turn off Weights & Biases logging via its environment switch
os.environ.update({"WANDB_DISABLED": "true"})

In [ ]:
# Load pretrained BERT with a two-class classification head
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Collect the training hyperparameters in one place, then build the arguments.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` — confirm against the installed version.
training_config = {
    'output_dir': './results',
    'num_train_epochs': 3,
    'per_device_train_batch_size': 16,
    'per_device_eval_batch_size': 64,
    'warmup_steps': 500,
    'weight_decay': 0.01,
    'logging_dir': './logs',
    'logging_steps': 10,
    'evaluation_strategy': "epoch",
    'save_strategy': "epoch",
}
training_args = TrainingArguments(**training_config)

In [ ]:
# Wire the model, its arguments and both dataset splits into a Trainer
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)

# Fine-tune; kept as the cell's trailing expression so its result is displayed
trainer.train()

In [ ]:
# Evaluate the model on the validation set (val_dataset was passed to the
# Trainer as eval_dataset). Left as a bare trailing expression so the notebook
# cell displays the returned metrics.
# NOTE(review): no compute_metrics callback is passed to the Trainer, so
# presumably only the loss and runtime stats are reported — confirm.
trainer.evaluate()

In [ ]:
# Ids are converted to strings so they can be placed in a table
test_data['id'] = test_data['id'].astype(str)

# Pull the ids and the model input text out as plain lists
test_ids = test_data['id'].tolist()
test_texts = test_data['final_text'].tolist()

# Build and tokenize the test Dataset
test_dataset = Dataset.from_dict({'id': test_ids, 'text': test_texts})
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Predict, then take the highest-scoring class for each row
predictions = trainer.predict(test_dataset)
predicted_labels = predictions.predictions.argmax(-1)

results_df = pd.DataFrame({'id': test_ids, 'target': predicted_labels})

print(results_df.head())

# Write the predictions without the DataFrame index column
results_df.to_csv('predictions_sent_first_s.csv', index=False)