In [1]:


import pandas as pd

# Load datasets
train_data = pd.read_csv('data/nlp-getting-started/train.csv')

test_data = pd.read_csv('data/nlp-getting-started/test.csv')

# Display the first few rows of the training data
print(train_data.head())


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


In [2]:
train_data.tail()

Unnamed: 0,id,keyword,location,text,target
7608,10869,,,Two giant cranes holding a bridge collapse int...,1
7609,10870,,,@aria_ahrary @TheTawniest The out of control w...,1
7610,10871,,,M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...,1
7611,10872,,,Police investigating after an e-bike collided ...,1
7612,10873,,,The Latest: More Homes Razed by Northern Calif...,1


In [5]:
df = train_data[["id", "text", "target"]]

In [16]:
df.head(4)

Unnamed: 0,id,text,target
0,542,WWI WWII JAPANESE ARMY NAVY MILITARY JAPAN LEA...,0
1,2026,@LasVegasLocally @VitalVegas They reined it in...,0
2,1138,@realhotcullen I agree but I knew we'd be goin...,0
3,7076,http://t.co/HFqlwo1kMy E-Mini SP 500: Earnings...,0


## resampling

In [7]:
import pandas as pd
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler




# Append minority data to the original dataframe

# Check initial class distribution
print("Initial class distribution:\n", df['target'].value_counts())

# Count the number of instances for each label
class_counts = df['target'].value_counts()
print("\nClass counts:\n", class_counts)

# Determine the size of the smaller class
min_class_size = class_counts.min()
print("\nSize of the smaller class:", min_class_size)


Initial class distribution:
 target
0    4342
1    3271
Name: count, dtype: int64

Class counts:
 target
0    4342
1    3271
Name: count, dtype: int64

Size of the smaller class: 3271


In [9]:
min_class_size = df['target'].value_counts().min()

# Create a balanced subset
balanced_subset = df.groupby('target').apply(lambda x: x.sample(min_class_size)).reset_index(drop=True)

# Check the balanced subset distribution
print("\nBalanced subset distribution:\n", balanced_subset['target'].value_counts())


Balanced subset distribution:
 target
0    3271
1    3271
Name: count, dtype: int64


In [10]:
balanced_subset

Unnamed: 0,id,text,target
0,542,WWI WWII JAPANESE ARMY NAVY MILITARY JAPAN LEA...,0
1,2026,@LasVegasLocally @VitalVegas They reined it in...,0
2,1138,@realhotcullen I agree but I knew we'd be goin...,0
3,7076,http://t.co/HFqlwo1kMy E-Mini SP 500: Earnings...,0
4,4493,I'm in the shower and I went to go change the ...,0
...,...,...,...
6537,8704,4 equipment ego break upon dig your family int...,1
6538,7535,Refugio oil spill may have been costlier bigge...,1
6539,10074,RT_America: RT RT_com: Eye of Super Typhoon So...,1
6540,5389,Found this cool photo not mine 1952 Dodge Wayn...,1


In [15]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
df = balanced_subset[["id", "text", "target"]]
# Preprocessing
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])  # Use 'v2' for messages
y = df['target']  # Use 'v1' for labels now containing 0 and 1

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, random_state=42)

# Logistic Regression Model
model = SVC(probability=True, random_state=42) # Increased max_iter for convergence
model.fit(X_train, y_train)

# Predictions and Evaluation
y_pred = model.predict(X_test)

# Accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.4f}')

# Confusion Matrix
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification Report
print("Classification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.7896
Confusion Matrix:
 [[147  21]
 [ 48 112]]
Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.88      0.81       168
           1       0.84      0.70      0.76       160

    accuracy                           0.79       328
   macro avg       0.80      0.79      0.79       328
weighted avg       0.80      0.79      0.79       328



In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# Ensure that the test dataset has the same preprocessing steps applied
# Initialize the TF-IDF Vectorizer again to fit on the training set
vectorizer = TfidfVectorizer(stop_words='english')

# Preprocess the training data to fit the vectorizer
df = train_data[["id", "text", "target"]]
X_train = vectorizer.fit_transform(df['text'])  # Fit the vectorizer on training text
y_train = df['target']

# Train the model as before
model = LogisticRegression(max_iter=1000)  # Increased max_iter for convergence
model.fit(X_train, y_train)

# Preprocess the test data
X_test = vectorizer.transform(test_data['text'])  # Use the same vectorizer to transform the test text

# Make predictions on the test set
predictions = model.predict(X_test)

# Prepare the submission DataFrame
submission = pd.DataFrame({
    'id': test_data['id'],
    'target': predictions
})

# Save the submission file
submission.to_csv('submission_logreg.csv', index=False)

print("Submission file 'submission.csv' created successfully.")


Submission file 'submission.csv' created successfully.


In [12]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the dataset
df = train_data[["id", "text", "target"]]

# Preprocess the data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])  # Fit on training text
y = df['target']  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)

# List of models to evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=50, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
}

# Train each model and evaluate
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'{name} Accuracy: {accuracy:.4f}')
    print(f'Classification Report for {name}:\n', classification_report(y_val, y_pred))
    print(f'Confusion Matrix for {name}:\n', confusion_matrix(y_val, y_pred))
    print('-' * 60)


Random Forest Accuracy: 0.7718
Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.74      0.94      0.82      1748
           1       0.86      0.55      0.67      1298

    accuracy                           0.77      3046
   macro avg       0.80      0.74      0.75      3046
weighted avg       0.79      0.77      0.76      3046

Confusion Matrix for Random Forest:
 [[1635  113]
 [ 582  716]]
------------------------------------------------------------
Gradient Boosting Accuracy: 0.7489
Classification Report for Gradient Boosting:
               precision    recall  f1-score   support

           0       0.71      0.94      0.81      1748
           1       0.86      0.49      0.63      1298

    accuracy                           0.75      3046
   macro avg       0.78      0.72      0.72      3046
weighted avg       0.77      0.75      0.73      3046

Confusion Matrix for Gradient Boosting:
 [[1640  108]
 [ 657  641]]

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7738
Classification Report for XGBoost:
               precision    recall  f1-score   support

           0       0.76      0.89      0.82      1748
           1       0.81      0.61      0.70      1298

    accuracy                           0.77      3046
   macro avg       0.78      0.75      0.76      3046
weighted avg       0.78      0.77      0.77      3046

Confusion Matrix for XGBoost:
 [[1564  184]
 [ 505  793]]
------------------------------------------------------------
[LightGBM] [Info] Number of positive: 1973, number of negative: 2594
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.006027 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4565
[LightGBM] [Info] Number of data points in the train set: 4567, number of used features: 353
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.432012 -> initscore=-0.273646
[LightGBM] [Info] Start training from score -0.273646
L

In [15]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the training dataset
df = train_data[["id", "text", "target"]]

# Preprocess the data
vectorizer = TfidfVectorizer(stop_words='english')
X = vectorizer.fit_transform(df['text'])  # Fit on training text
y = df['target']  # Target variable

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# List of models to evaluate
models = {
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42),
    'Support Vector Machine': SVC(probability=True, random_state=42),
    'Multinomial Naive Bayes': MultinomialNB(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier(),
}

# Load the test dataset
X_test = vectorizer.transform(test_data['text'])  # Use the same vectorizer to transform the test text

# Train each model, evaluate, and create submission files
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    accuracy = accuracy_score(y_val, y_pred)
    print(f'{name} Accuracy: {accuracy:.4f}')
    
    # Print classification report and confusion matrix for validation set
    print(f'Classification Report for {name}:\n', classification_report(y_val, y_pred))
    print(f'Confusion Matrix for {name}:\n', confusion_matrix(y_val, y_pred))
    print('-' * 60)

    # Make predictions on the test set
    test_predictions = model.predict(X_test)

    # Prepare the submission DataFrame
    submission = pd.DataFrame({
        'id': test_data['id'],
        'target': test_predictions
    })

    # Save the submission file with the model name
    submission_file_name = f'submission_{name.replace(" ", "_").lower()}.csv'
    submission.to_csv(submission_file_name, index=False)
    print(f"Submission file '{submission_file_name}' created successfully.")


Random Forest Accuracy: 0.7735
Classification Report for Random Forest:
               precision    recall  f1-score   support

           0       0.75      0.91      0.82       874
           1       0.83      0.58      0.69       649

    accuracy                           0.77      1523
   macro avg       0.79      0.75      0.75      1523
weighted avg       0.78      0.77      0.76      1523

Confusion Matrix for Random Forest:
 [[799  75]
 [270 379]]
------------------------------------------------------------
Submission file 'submission_random_forest.csv' created successfully.
Gradient Boosting Accuracy: 0.7551
Classification Report for Gradient Boosting:
               precision    recall  f1-score   support

           0       0.72      0.93      0.81       874
           1       0.85      0.51      0.64       649

    accuracy                           0.76      1523
   macro avg       0.79      0.72      0.73      1523
weighted avg       0.78      0.76      0.74      1523

Co

Parameters: { "use_label_encoder" } are not used.



XGBoost Accuracy: 0.7649
Classification Report for XGBoost:
               precision    recall  f1-score   support

           0       0.75      0.88      0.81       874
           1       0.79      0.61      0.69       649

    accuracy                           0.76      1523
   macro avg       0.77      0.75      0.75      1523
weighted avg       0.77      0.76      0.76      1523

Confusion Matrix for XGBoost:
 [[768 106]
 [252 397]]
------------------------------------------------------------
Submission file 'submission_xgboost.csv' created successfully.
[LightGBM] [Info] Number of positive: 2622, number of negative: 3468
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.008591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 7341
[LightGBM] [Info] Number of data points in the train set: 6090, number of used features: 551
[LightGBM] [

In [None]:
%pip install pandas scikit-learn transformers torch


In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
import torch


import pandas as pd

# Load datasets
train_data = pd.read_csv('data/nlp-getting-started/train.csv')

test_data = pd.read_csv('data/nlp-getting-started/test.csv')

# Display the first few rows of the training data
print(train_data.head())

df = train_data[["id", "text", "target"]]

# Preprocess the data
df['target'] = df['target'].astype(int)  # Ensure target is of type int

# Split the dataset
X_train, X_val, y_train, y_val = train_test_split(df['text'], df['target'], test_size=0.2, random_state=42)

# Load the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the data
train_encodings = tokenizer(list(X_train), truncation=True, padding=True, max_length=128)
val_encodings = tokenizer(list(X_val), truncation=True, padding=True, max_length=128)

# Create dataset objects
class DisasterTweetsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = DisasterTweetsDataset(train_encodings, y_train.to_numpy())
val_dataset = DisasterTweetsDataset(val_encodings, y_val.to_numpy())

# Load the BERT model for sequence classification
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

# Set up the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy='epoch',
)

# Create the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)

# Train the model
trainer.train()

# Evaluate the model
predictions = trainer.predict(val_dataset)
y_pred = torch.argmax(predictions.predictions, axis=1)

# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print(f'Validation Accuracy: {accuracy:.4f}')
print('Classification Report:\n', classification_report(y_val, y_pred))

# Load and preprocess the test data
test_data = pd.read_csv('test.csv')  # Adjust path as needed
test_encodings = tokenizer(list(test_data['text']), truncation=True, padding=True, max_length=128)

# Create test dataset
test_dataset = DisasterTweetsDataset(test_encodings, [0] * len(test_data))  # Dummy labels

# Make predictions on the test set
test_predictions = trainer.predict(test_dataset)
test_pred_labels = torch.argmax(test_predictions.predictions, axis=1)

# Prepare submission file
submission = pd.DataFrame({
    'id': test_data['id'],
    'target': test_pred_labels.numpy()
})

# Save submission file
submission.to_csv('submission_bert.csv', index=False)
print("Submission file 'submission_bert.csv' created successfully.")


   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['target'] = df['target'].astype(int)  # Ensure target is of type int
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/288 [00:00<?, ?it/s]