In [1]:
!pip install pandas #if pandas is not already installed
import pandas as pd
import re # Regular Expressions
import string



In [2]:
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")
train_df.head() # displays the first five rows

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [3]:
#Defining a function for pre-processing
def preprocess_text(text):
    #to lowercase
    text = text.lower()
    #Remove links
    re.sub(r'http\S+|www.\S+', '',text )
    #Remove username
    re.sub(r'@\w+|#\w+','', text)
    #Remove non-ASCII Elements
    text.encode('ascii', 'ignore').decode('ascii')
    #Remove numbers
    re.sub(r'\d+', '', text)
    #Remove excess spaces
    re.sub(r'\s+', ' ', text).strip()
    #Removes punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

In [4]:
train_df['clean_text'] = train_df['text'].apply(preprocess_text)
test_df['clean_text'] = test_df['text'].apply(preprocess_text)
train_df[['text', 'clean_text']].head()

Unnamed: 0,text,clean_text
0,Our Deeds are the Reason of this #earthquake M...,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",13000 people receive wildfires evacuation orde...
4,Just got sent this photo from Ruby #Alaska as ...,just got sent this photo from ruby alaska as s...


In [5]:
train_df.to_csv('train_clean.csv', index = False)
test_df.to_csv('test_clean.csv', index = False)

In [6]:
!pip install scikit_learn # if scikit learn is not already installed



In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(
    max_features = 5000,
    stop_words='english')
#Term Frequency X Inverse Document Frequency

In [8]:
X_train = tfidf.fit_transform(train_df['clean_text'])
X_test = tfidf.transform(test_df['clean_text'])

In [9]:
y_train = train_df['target']

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [11]:
X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42) # train-test split

In [12]:
lr_model = LogisticRegression(max_iter = 1000) # utilised Logistic Regression 
lr_model.fit(X_tr, y_tr)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,1000


In [13]:
y_pred = lr_model.predict(X_val)

In [14]:
accuracy = accuracy_score(y_val, y_pred)
print ("Achieved ", accuracy, " Percent accuracy")

Achieved  0.7984241628365069  Percent accuracy


In [15]:
print (classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       874
           1       0.82      0.67      0.74       649

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523



In [16]:
print (confusion_matrix(y_val, y_pred))

[[779  95]
 [212 437]]


In [17]:
!pip install torch datasets transformers 



In [18]:
import torch
from datasets import Dataset
from transformers import AutoTokenizer, Trainer, TrainingArguments, AutoModelForSequenceClassification

  from .autonotebook import tqdm as notebook_tqdm


In [19]:
#to convert these into Hugging Face datasets
train_dataset = Dataset.from_pandas(train_df[['clean_text', 'target']])
test_dataset = Dataset.from_pandas(test_df[['clean_text']])

In [20]:
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [21]:
def tokenize(batch):
    return tokenizer(batch['clean_text'], padding = 'max_length', truncation = True)



In [22]:
train_dataset = train_dataset.map(tokenize, batched = True)
test_dataset = test_dataset.map(tokenize, batched=True)

Map: 100%|██████████| 7613/7613 [00:00<00:00, 8142.43 examples/s]
Map: 100%|██████████| 3263/3263 [00:00<00:00, 9709.02 examples/s] 


In [23]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
!pip install --upgrade transformers


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [25]:
!pip install transformers[torch]
!pip install 'accelerate>=0.26.0'

zsh:1: no matches found: transformers[torch]


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [26]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    logging_dir='./logs',
    logging_steps=10,
    save_strategy='epoch'  # Speichert nach jeder Epoch
)


In [28]:
train_dataset = train_dataset.rename_column('target', 'labels')


In [29]:
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask'])

In [30]:
trainer = Trainer (
    args = training_args,
    model = model,
    train_dataset = train_dataset
)

In [31]:
trainer.train()



Step,Training Loss
10,0.6889
20,0.6157
30,0.5954
40,0.5634
50,0.5812
60,0.6138
70,0.5771
80,0.5772
90,0.6115
100,0.5691


Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13GFamilyCommandBuffer: 0x306aa8030>
    label = <none> 
    device = <AGXG13GDevice: 0x16a035c00>
        name = Apple M1 
    commandQueue = <AGXG13GFamilyCommandQueue: 0x17684ec00>
        label = <none> 
        device = <AGXG13GDevice: 0x16a035c00>
            name = Apple M1 
    retainedReferences = 1
Error: command buffer exited with error status.
	The Metal Performance Shaders operations encoded on it may not have completed.
	Error: 
	(null)
	Insufficient Memory (00000008:kIOGPUCommandBufferCallbackErrorOutOfMemory)
	<AGXG13GFamilyCommandBuffer: 0x3059ab820>
    label = <none> 
    device = <AGXG13GDevice: 0x16a035c00>
        name = Apple M1 
    commandQueue = <AGXG13GFamilyCommandQueue: 0x17684ec00>
        label = <none> 
        device = <AGXG13GDev

TrainOutput(global_step=3808, training_loss=0.5192569092370006, metrics={'train_runtime': 19066.5608, 'train_samples_per_second': 0.799, 'train_steps_per_second': 0.2, 'total_flos': 4006128928911360.0, 'train_loss': 0.5192569092370006, 'epoch': 2.0})

In [32]:
test_dataset = test_dataset.map(tokenize, batched = True)

Map: 100%|██████████| 3263/3263 [00:00<00:00, 6155.01 examples/s]


In [33]:
preds = trainer.predict(test_dataset)
pred_labels = preds.predictions.argmax(-1)



In [34]:
submission = pd.DataFrame({
    "id": test_df["id"],
    "target": pred_labels
})

In [35]:
submission.to_csv("submission.csv", index = False)

In [36]:
submission.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,1
4,11,1
