# Twitter Disaster Detection Model 

In [42]:

! pip3 install -r requirements.txt
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
import random
import torch
from torch import nn, optim
import math
from IPython import display
import requests
import torch.nn.functional as F



### Pre-processing

In [2]:
###              EXAMPLE USE:                ###
## tweet_text = "This is a test tweet!"
## sentiment = sentiment_detection(tweet_text)
## sarcasm = sarcasm_detection(tweet_text) 
API_TOKEN = "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"

def query(API_URL, headers, payload):
    response = requests.post(API_URL, headers=headers, json=payload)
    return response.json()

def sentiment_detection(tweet_text):
    # Define the first API endpoint and function
    API_URL = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment-latest"
    headers = {"Authorization": f"Bearer {API_TOKEN}"}

    # Use the first function to query the sentiment of some text
    output_sentiment = query(API_URL, headers, {
        "inputs": tweet_text,
    })

    return output_sentiment


def sarcasm_detection(tweet_text):
    # Define the second API endpoint and function
    API_URL = "https://api-inference.huggingface.co/models/helinivan/english-sarcasm-detector"
    headers = {"Authorization": f"Bearer {API_TOKEN}"}
        
    output_sarcasm = query(API_URL, headers, {
        "inputs": tweet_text,
    })

    return output_sarcasm


In [3]:
#TESTING STUFF
print(sentiment_detection("I just loveeee stinky smelly stuff"))
print(sarcasm_detection("I just loveeee stinky smelly stuff"))
test_1 = sentiment_detection("I just loveeee stinky smelly stuff")
test_2 = sarcasm_detection("I just loveeee stinky smelly stuff")

[[{'label': 'neutral', 'score': 0.8819799423217773}, {'label': 'positive', 'score': 0.10076777637004852}, {'label': 'negative', 'score': 0.01725233532488346}]]
{'error': 'Model helinivan/english-sarcasm-detector is currently loading', 'estimated_time': 20.0}


In [38]:
print(test_1)
negative_score = test_1[0][0]['score']
neutral_score = test_1[0][1]['score']
positive_score = test_1[0][2]['score']


print(negative_score)
print(neutral_score)
print(positive_score)


[[{'label': 'negative', 'score': 0.6726911664009094}, {'label': 'neutral', 'score': 0.18366330862045288}, {'label': 'positive', 'score': 0.14364556968212128}]]
0.6726911664009094
0.18366330862045288
0.14364556968212128


In [41]:
print(test_2)
sarcastic = test_2[0][0]['score']
not_sarcastic = test_2[0][1]['score']


print(sarcastic)
print(not_sarcastic)

[[{'label': 'LABEL_0', 'score': 0.8405861854553223}, {'label': 'LABEL_1', 'score': 0.15941382944583893}]]
0.8405861854553223
0.15941382944583893


Add the api calls to the csv

In [44]:
import csv
import time # To add a delay between API calls

def process_tweets_1(input_file, output_file):
    # Open the input CSV file for reading and the output CSV file for writing
    with open(input_file, 'r', newline='') as file_in, open(output_file, 'w', newline='') as file_out:
        reader = csv.DictReader(file_in)

        # Define the fieldnames for the output CSV file
        fieldnames = reader.fieldnames + ['negative', 'neutral', 'positive', 'sarcastic', 'not_sarcastic']
        
        writer = csv.DictWriter(file_out, fieldnames=fieldnames)
        writer.writeheader()

        # Read and process each row in the input CSV file
        for row in reader:
            tweet_text = row['text']  # Assuming the tweet text is in the 'text' column

            # Call the sentiment detection and sarcasm detection functions
            sentiment_result = sentiment_detection(tweet_text)
            sarcasm_result = sarcasm_detection(tweet_text)


            # Add the sentiment and sarcasm probabilities to the row
            row['negative'] = sentiment_result[0][0]['score']
            row['neutral'] = sentiment_result[0][1]['score']
            row['positive'] = sentiment_result[0][2]['score']
            row['sarcastic'] = sarcasm_result[0][0]['score']
            row['not_sarcastic'] = sarcasm_result[0][1]['score']

            # Write the updated row to the output CSV file
            writer.writerow(row)
            
            # Delay for one minute before making the next API call
            time.sleep(60)
        

In [52]:
import csv
import time

MAX_RETRY_COUNT = 3
RATE_LIMIT_ERROR_MESSAGE = 'Rate limit reached. You reached free usage limit (reset hourly). Please subscribe to a plan at https://huggingface.co/pricing to use the API at this rate'

def process_tweets(input_file, output_file, log_file):
    # Open the input CSV file for reading, the output CSV file for writing, and the log CSV file for writing unsuccessful runs
    with open(input_file, 'r', newline='') as file_in, open(output_file, 'w', newline='') as file_out, open(log_file, 'w', newline='') as file_log:
        reader = csv.DictReader(file_in)

        # Define the fieldnames for the output CSV file
        fieldnames = reader.fieldnames + ['negative', 'neutral', 'positive', 'sarcastic', 'not_sarcastic']
        
        writer = csv.DictWriter(file_out, fieldnames=fieldnames)
        writer.writeheader()

        # Define the fieldnames for the log CSV file
        log_fieldnames = reader.fieldnames
        log_writer = csv.DictWriter(file_log, fieldnames=log_fieldnames)
        log_writer.writeheader()

        # Read and process each row in the input CSV file
        for row in reader:
            tweet_text = row['text']  # Assuming the tweet text is in the 'text' column
            retry_count = 0

            while retry_count < MAX_RETRY_COUNT:
                # Call the sentiment detection and sarcasm detection functions
                sentiment_result = sentiment_detection(tweet_text)
                sarcasm_result = sarcasm_detection(tweet_text)

                if sentiment_result and sarcasm_result:  # Check if results are not empty
                    if (isinstance(sentiment_result[0], dict) and 'error' in sentiment_result[0] and sentiment_result[0]['error'] == RATE_LIMIT_ERROR_MESSAGE) or (isinstance(sarcasm_result[0], dict) and 'error' in sarcasm_result[0] and sarcasm_result[0]['error'] == RATE_LIMIT_ERROR_MESSAGE):
                        # Log the row if rate limit error is encountered
                        log_writer.writerow(row)
                    else:
                        # Add the sentiment and sarcasm probabilities to the row
                        row['negative'] = sentiment_result[0][0]['score']
                        row['neutral'] = sentiment_result[0][1]['score']
                        row['positive'] = sentiment_result[0][2]['score']
                        row['sarcastic'] = sarcasm_result[0][0]['score']
                        row['not_sarcastic'] = sarcasm_result[0][1]['score']

                        # Write the updated row to the output CSV file
                        writer.writerow(row)
                    break
                else:
                    # Log the unsuccessful run
                    log_writer.writerow(row)

                    retry_count += 1
                    time.sleep(60)  # Delay for one minute before making the next retry

            if retry_count == MAX_RETRY_COUNT:
                # If maximum retry count reached, write the original row to the output CSV file
                writer.writerow(row)


In [53]:
input_csv_file = 'test.csv'
output_csv_file = 'preprocessed_test.csv'
log_csv_file = 'log.csv'

process_tweets(input_csv_file, output_csv_file, log_csv_file)

KeyError: 0

In [120]:
input_csv_file = 'train.csv'
output_csv_file = 'preprocessed_train.csv'

process_tweets(input_csv_file, output_csv_file)

NameError: name 'process_tweets' is not defined

### Device

In [137]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

### Import CSV's

In [163]:
train_df = pd.read_csv('preprocessed_train.csv', sep=',', encoding='latin-1')
test_df = pd.read_csv("test.csv")

In [164]:
data = train_df[['keyword', 'location', 'sarcastic', 'not_sarcastic', 'negative', 'neutral', 'positive']]
target = train_df['target']
#print(X_data["neutral"])
train_data, val_data, train_target, val_target = train_test_split(data, target, test_size = 0.2)
print(train_target)
# split the dataset into training and validation sets
#train_data, val_data = 
#print(train_data[:]["sarcastic"])
#train_loader = torch.utils.data.DataLoader(train_data, batch_size=batch_size, shuffle=True)
#val_loader = torch.utils.data.DataLoader(val_data, batch_size=batch_size, shuffle=True)


1731    0
2921    1
5415    0
7374    0
1958    1
       ..
5815    0
3817    0
2941    0
6775    0
6287    1
Name: target, Length: 6088, dtype: int64


### Convert categorical features in a DataFrame to one-hot encoding

In [165]:
def convert_features_to_one_hot(df, feature_name_list):
  for feature_name in feature_name_list:
    df = pd.get_dummies(df, columns=[feature_name])
  
  return df
#Define the training set/test set from the imported data... x_train, x_val, etc... needs to be predefined

data_to_convert = ['train_data', 'val_data']

feature_list = ['location', 'keyword']
for i,ix in enumerate(data_to_convert):
  exec(f'{data_to_convert[i]} = convert_features_to_one_hot({ix}, {feature_list})')


In [166]:
print(len(train_data.keys()))
input_layers = len(train_data.keys())

2959


### Model draft 

In [167]:
class netmodel(nn.Module):
  def __init__(self, input_layer=1, num_hidden=1, node_per_hidden=32, droppout=0., LSTM_layers=0, outputs=2):
    super(netmodel, self).__init__()
    self.input_layer = input_layer
    self.num_hidden = num_hidden 
    self.node_per_hidden = node_per_hidden
    self.droppout = droppout 
    self.SLTM_layers = LSTM_layers 
    self.outputs = outputs 
    self.inputfc = nn.Linear(input_layer, node_per_hidden) 
    self.hiddenfc = [] 
    for i in range(num_hidden-1):
      self.hiddenfc.append(nn.Linear(node_per_hidden, node_per_hidden))
    self.lastfc = nn.Linear(node_per_hidden, outputs)

  def forward(self, x, debug=False):
    drop = nn.Dropout(p=self.droppout)
    #x = x.view(1,1)
    x = self.inputfc(x)
    x = F.relu(x)
    x = drop(x)
    for i in range(self.num_hidden-1):
      x = self.hiddenfc[i](x)
      x = F.relu(x)
      x = drop(x)
    
    x = self.lastfc(x)
    x = F.softmax(x, dim=1)
    return x 



In [168]:
Model = netmodel(input_layer=input_layers, num_hidden=3, droppout=0.3).to(device)

#Model.forward(torch.tensor([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.]]))

In [208]:


batch_size = 32
print(len(train_data))
print(len(train_target))
#tensor_train = torch.utils.data.TensorDataset(train_data, train_target)
tensor_train = torch.tensor(train_data.values.astype(np.float), dtype=torch.float32).to(device)
tensor_train_labels = torch.tensor(train_target.values.astype(np.float), dtype=torch.float32).to(device)
tensor_train_set = torch.utils.data.TensorDataset(tensor_train, tensor_train_labels)

# Convert validation data to PyTorch tensors and move to device
#tensor_val = torch.utils.data.TensorDataset(val_data, val_target)
tensor_val = torch.tensor(val_data.values.astype(np.float), dtype=torch.float32).to(device)
tensor_val_labels = torch.tensor(val_target.values.astype(np.float), dtype=torch.float32).to(device)
tensor_val_set = torch.utils.data.TensorDataset(tensor_val, tensor_val_labels)

train_loader = torch.utils.data.DataLoader(tensor_train_set, batch_size=batch_size, shuffle=True)

val_loader = torch.utils.data.DataLoader(tensor_val_set, batch_size=batch_size, shuffle=True)

lr = 1e-3
lambda_l2 = 1e-3
loss_func = nn.CrossEntropyLoss().to(device)
optimizer = torch.optim.Adam(Model.parameters(), lr=lr, weight_decay=lambda_l2)



6088
6088


Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  tensor_train = torch.tensor(train_data.values.astype(np.float), dtype=torch.float32).to(device)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  tensor_train_labels = torch.tensor(train_target.values.astype(np.float), dtype=torch.float32).to(device)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  tensor_val = torch.tensor(val_data.values.astype(np.float), dtype=torch.float32).to(device)
Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations
  tensor_val_labels = torch.tensor(val_target.values.astype(np.float), dtype=torch.float32).to(device)


In [213]:
def train_model(epochs, model):
    model.train()
    for epoch in range(epochs):
      for batch_id, (data, label) in enumerate(train_loader):
        #print(batch_idx)


        pred = Model.forward(data)
        print(type(pred[0][0].item()))

        optimizer.zero_grad()
        loss = loss_func(pred.type(torch.LongTensor), label.type(torch.LongTensor))
        #print(loss)
        loss.backward()
        optimizer.step()
        if batch_id % 60 == 0:
          print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
            epoch, batch_id * len(data), len(train_loader.dataset),
            100. * batch_id / len(train_loader), loss.item()))

In [214]:
train_model(1, Model)

<class 'float'>


RuntimeError: "log_softmax_lastdim_kernel_impl" not implemented for 'Long'