## Import Libraries

In [1]:
!pip install emoji transformers
!pip install sentencepiece
!pip install tensorflow_addons

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from tqdm.notebook import tqdm
import emoji

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedShuffleSplit

import tensorflow as tf
import tensorflow.keras as keras
from tensorflow.keras.layers import Dense, Input, Dropout
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint
import tensorflow_addons as tfa

from collections import Counter
from functools import partial

import transformers
from tokenizers import BertWordPieceTokenizer

AUTO = tf.data.experimental.AUTOTUNE

Collecting emoji
[?25l  Downloading https://files.pythonhosted.org/packages/24/fa/b3368f41b95a286f8d300e323449ab4e86b85334c2e0b477e94422b8ed0f/emoji-1.2.0-py3-none-any.whl (131kB)
[K     |████████████████████████████████| 133kB 5.9MB/s 
[?25hCollecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/f9/54/5ca07ec9569d2f232f3166de5457b63943882f7950ddfcc887732fc7fb23/transformers-4.3.3-py3-none-any.whl (1.9MB)
[K     |████████████████████████████████| 1.9MB 29.0MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 51.2MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |██████████████████████████

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## Preprocessing Functions

In [61]:
def emoji_cleaning(text):
    
  # Change emoji to text
  text = emoji.demojize(text).replace(":", " ")
  
  # Delete repeated emoji
  tokenizer = text.split()
  repeated_list = []
  
  for word in tokenizer:
      if word not in repeated_list:
          repeated_list.append(word)
  
  text = ' '.join(text for text in repeated_list)
  text = text.replace("_", " ").replace("-", " ")
  return text

def clean_smileys(text):
    
    text = re.sub(r'(:\)|: \)|\(\:|:-\)|: -\)|: - \)|:D|: D)', ' smile ', text)
    text = re.sub(r'(:\(|: \(|\)\:|:-\(|: -\(|: - \(|:\'\()', ' dislike ', text)
    text = re.sub(r'(<3)', ' heart ', text)
    text = re.sub(r'(:/)', ' dislike ', text)
    text = re.sub(r'(;\)|; \))', ' wink ', text)
    return ' '.join([word for word in text.split()])

def clean_urls(review):
    review = review.split()
    review = ' '.join([word for word in review if not re.match('^http', word)])
    return review

def decontracted(text):
    text = re.sub(r"won\’t", "will not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"don't", "do not", text)
    text = re.sub(r"can\’t", "can not", text)
    text = re.sub(r"n\’t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"it\'s", "it is", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\’m", " am", text)

    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\’re", " are", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\’d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\’ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\’t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\’ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\’m", " am", text)
    text = re.sub(r"\“", "", text)
    text = re.sub(r"\”", "", text)
    text = re.sub(r"\…", "", text)

    return text

def clean_text(text):
    text = str(text)
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    text = re.sub(r'[^a-zA-Z ]+', ' ', text)
    text = re.sub(r'http\S+', ' ', text)
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text)
    text = re.sub(r'^RT[\s]+', '', text)
    text = re.sub(r'pic.twitter\S+', ' ', text)
    text = re.sub(r'#', '', text)
    text = text.lower()

    return text

In [None]:
data_folder = "/content/drive/MyDrive/CZ4034 - Information Retrieval/Data"

In [8]:
df = pd.read_csv(f'{data_folder}/train_tweets.csv')
df['tweet_id'] = df['tweet_id'].astype(str)
df.head()

Unnamed: 0,text,created_date_time,tweet_id,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,location,user_geo,link,toxic,subjectivity
0,rt rulestwt if uncomfortable was a picture,2/15/2021 19:00,1.36e+18,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1.18e+18,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal","(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,0,0
1,rt rwpusa solicitation of election fraud is a ...,2/15/2021 19:00,1.36e+18,Brad Bartram,bradbartram,37202990.0,"The Southern, East 5th Street, Skid Row, Downt...",a believer,False,,3967,"The Southern, East 5th Street, Skid Row, Downt...","(34.044146350000005, -118.24466336743033, 0.0)",https://twitter.com/twitter/status/13613900006...,0,0
2,rt thattimwalker i seem to recall thatginamill...,2/15/2021 19:00,1.36e+18,Sarah Craig 🇪🇺💙 #3.5% #FBPE #RejoinEU,sarahcraig52,1480613000.0,"Cheshire, England, United Kingdom",Wear a mask. Old & grey but only slightly wise...,False,,271,"Cheshire, England, United Kingdom","(53.2141028, -2.471770086071205, 0.0)",https://twitter.com/twitter/status/13613899934...,0,0
3,itzsohamx uhmm i think this tweet is not for me,2/15/2021 19:00,1.36e+18,ʂıɖɖɧı🍕,DeepveerLuv,9.57e+17,"Mumbai, Mumbai Suburban, Maharashtra, India",Madly and deeply crazy for deepveer🌍 Sadda Haq...,False,,0,"Mumbai, Mumbai Suburban, Maharashtra, India","(19.0759899, 72.8773928, 0.0)",https://twitter.com/twitter/status/13613900433...,0,0
4,open closed and lock your door at the push of ...,2/15/2021 19:00,1.36e+18,ASSA ABLOY Ent US,assaabloyentus,156391400.0,United States,Look to ASSA ABLOY Entrance Systems US for hig...,False,,0,United States,"(39.7837304, -100.4458825, 0.0)",https://twitter.com/twitter/status/13613900369...,0,0


In [62]:
df2 = pd.read_csv(f'{data_folder}/tweets_with_labels_url.csv')
df2['tweet_id'] = df2['tweet_id'].astype(str)
df2['text'] = df2['text'].astype(str)
df2.head()

Unnamed: 0,tweet_id,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,location,user_geo,neg,neu,pos,compound,toxic,severe_toxic,subjectivity,url
0,1359452827198754817,"""don't be posting music they might think you b...",2021-02-10 10:43:06,Leah,leahkwilcox,878532756153655297,"Boise, Ada County, Idaho, United States",fuck it we ballin’ 💎 40% Stubborn 60% Water 10...,False,,0,"Boise, Ada County, Idaho, United States","(43.6166163, -116.200886, 0.0)",0.417,0.521,0.062,-0.936,1,1,1,https://twitter.com/twitter/status/13594528271...
1,1361144583057416193,@MakoMutt you whore,2021-02-15 02:45:32,orion,Aluminemsiren,1703878068,"Texas, United States",30/M/gay. sometimes draws cute furries. gay fo...,False,,0,"Texas, United States","(31.8160381, -99.5120986, 0.0)",0.683,0.317,0.0,-0.6486,1,0,1,https://twitter.com/twitter/status/13611445830...
2,1361153917036470273,RT @BritneyHiatus: Justin Timberlake slut sham...,2021-02-15 03:22:37,tedfoxgains,tedfoxgains,1348302112107753474,"London, Greater London, England, United Kingdom","(NSFW, 🔞, gay couple). We make #Gainer videos,...",False,,26897,"London, Greater London, England, United Kingdom","(51.5073219, -0.1276474, 0.0)",0.402,0.598,0.0,-0.8126,0,0,0,https://twitter.com/twitter/status/13611539170...
3,1359227172569817089,RT @TheStanchion: Seriously just imagine you f...,2021-02-09 19:46:25,Pells Bells,kpelly,37878044,"Vancouver, District of North Vancouver, Britis...",----- I follow back,False,,157,"Vancouver, District of North Vancouver, Britis...","(49.2608724, -123.1139529, 0.0)",0.139,0.68,0.18,0.296,0,0,0,https://twitter.com/twitter/status/13592271725...
4,1360382365147815939,@AshIsFluffed If you could live in any fiction...,2021-02-13 00:16:45,NozieLess,NozieLess,3010674889,"Mountains, 198, Möserer Straße, Gemeinde Seefe...","🎮 Welcome to my Twitter. 💾\n\n20, Gamer, Nerd,...",False,,0,"Mountains, 198, Möserer Straße, Gemeinde Seefe...","(47.32770845, 11.180902759051147, 0.0)",0.0,1.0,0.0,0.0,0,0,0,https://twitter.com/twitter/status/13603823651...


In [None]:
toxic = df2['toxic'].values.tolist()
severe_toxic = df2['severe_toxic'].values.tolist()
toxic_class = []

for i in range(0,len(toxic)):
  if toxic[i] == 1 and severe_toxic[i] == 1:
    toxic_class.append(2)
  elif toxic[i] == 1 and severe_toxic[i] == 0:
    toxic_class.append(1)
  else:
    toxic_class.append(0)

toxic_class = np.array(toxic_class)
df2['toxic_class'] = toxic_class

In [None]:
df_sub = df2[["tweet_id","text", "subjectivity","toxic_class"]]
df_sub.head()

Unnamed: 0,tweet_id,text,subjectivity,toxic_class
0,1359452827198754817,"""don't be posting music they might think you b...",1,2
1,1361144583057416193,@MakoMutt you whore,1,1
2,1361153917036470273,RT @BritneyHiatus: Justin Timberlake slut sham...,0,0
3,1359227172569817089,RT @TheStanchion: Seriously just imagine you f...,0,0
4,1360382365147815939,@AshIsFluffed If you could live in any fiction...,0,0


In [None]:
df_sub['text'] = df_sub['text'].apply(clean_urls).apply(clean_text).apply(emoji_cleaning).apply(clean_smileys).apply(decontracted)
df_sub.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,tweet_id,text,subjectivity,toxic_class
0,1359452827198754817,don t be posting music they might think you cr...,1,2
1,1361144583057416193,makomutt you whore,1,1
2,1361153917036470273,britneyhiatus justin timberlake slut shamed br...,0,0
3,1359227172569817089,thestanchion seriously just imagine you fire a...,0,0
4,1360382365147815939,ashisfluffed if you could live in any fictiona...,0,0
5,1361159519443816448,chases pet damn right you are gt nothing but a...,1,2
6,1360086313995825153,we in the age of where fucking before even go ...,0,0
7,1359930570369871884,ronaldgooch you follow magats back the fuck of...,1,2
8,1360883701958205443,harsh scuderiaferrari carlossainz charles lecl...,0,0
9,1359754741648154625,thinking about brad in ep bc he was so gorgeou...,1,0


In [None]:
df_sub2 = df[["tweet_id","text", "subjectivity","toxic"]]
df_sub.rename({'toxic':'toxic_class'},axis=1,inplace=True)
df_sub2.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,tweet_id,text,subjectivity,toxic
0,1.36e+18,rt rulestwt if uncomfortable was a picture,0,0
1,1.36e+18,rt rwpusa solicitation of election fraud is a ...,0,0
2,1.36e+18,rt thattimwalker i seem to recall thatginamill...,0,0
3,1.36e+18,itzsohamx uhmm i think this tweet is not for me,0,0
4,1.36e+18,open closed and lock your door at the push of ...,0,0


In [None]:
df_sub2['text'] = df_sub2['text'].apply(clean_urls).apply(clean_text).apply(emoji_cleaning).apply(clean_smileys).apply(decontracted)
df_sub2.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,tweet_id,text,subjectivity,toxic
0,1.36e+18,rt rulestwt if uncomfortable was a picture,0,0
1,1.36e+18,rt rwpusa solicitation of election fraud is a ...,0,0
2,1.36e+18,rt thattimwalker i seem to recall thatginamill...,0,0
3,1.36e+18,itzsohamx uhmm i think this tweet is not for me,0,0
4,1.36e+18,open closed and lock your door at the push of ...,0,0
5,1.36e+18,rt winchesterhist four of our y students are r...,0,0
6,1.36e+18,rt rvareid when meg was shot cyhi kept tweetin...,0,0
7,1.36e+18,rt vivekagnihotri ajmal kasab was yr old yasee...,0,0
8,1.36e+18,my wife just told son boys are and girls i thi...,0,0
9,1.36e+18,lindsaylohan rariblecom i would buy tattoo sup...,0,0


In [None]:
def delete_repeated_char(text):
    text = re.sub(r'(\w)\1{2,}', r'\1', text)
    return text
def remove_punc(tweet):
    tweet =  tweet.translate(str.maketrans('', '', string.punctuation))
    tweet = ' '.join([word for word in tweet.split()])
    tweet = tweet.lower()
    return tweet

In [None]:
# df_sub = df_sub.append(df_sub2)
df_sub['text'] = df_sub['text'].astype(str)
df_sub['text'] = df_sub['text'].apply(delete_repeated_char).apply(remove_punc)
df_sub['count'] = df_sub['text'].str.split().map(len)
drop_indexes = df_sub.loc[df_sub['count']==0].index.tolist()
df_sub = df_sub.drop(drop_indexes)

df_sub = df_sub.drop_duplicates(subset=['text'])

df_sub = df_sub.sample(frac=1, random_state=42)
df_sub.head()

Unnamed: 0,id,comment_text,subjectivity,toxic_class,tweet_id,text,toxic,count
1124,,,1,,1.36e+18,asaucegxd dont flex whore,1.0,4
1029,,,1,,1.36e+18,lindseygrahamsc americans are ashamed of you h...,1.0,17
1682,,,1,,1.36e+18,rt arvlnder came up off doge and bought a whol...,1.0,25
1540,,,0,,1.36e+18,troche you gotta go to war and die smh,0.0,9
343,,,0,,1.36e+18,rt strongblacklead every legend has an origin ...,0.0,21


In [None]:
df_sub = df_sub[["tweet_id","text", "subjectivity","toxic"]]
df_sub.head()

Unnamed: 0,tweet_id,text,subjectivity,toxic
1124,1.36e+18,asaucegxd dont flex whore,1,1.0
1029,1.36e+18,lindseygrahamsc americans are ashamed of you h...,1,1.0
1682,1.36e+18,rt arvlnder came up off doge and bought a whol...,1,1.0
1540,1.36e+18,troche you gotta go to war and die smh,0,0.0
343,1.36e+18,rt strongblacklead every legend has an origin ...,0,0.0


In [None]:
df_sub.rename({'tweet_id':'id','text':'comment_text'},axis=1,inplace=True)
df_sub.head()

Unnamed: 0,id,comment_text,subjectivity,toxic
1124,1.36e+18,asaucegxd dont flex whore,1,1.0
1029,1.36e+18,lindseygrahamsc americans are ashamed of you h...,1,1.0
1682,1.36e+18,rt arvlnder came up off doge and bought a whol...,1,1.0
1540,1.36e+18,troche you gotta go to war and die smh,0,0.0
343,1.36e+18,rt strongblacklead every legend has an origin ...,0,0.0


In [None]:
df_sub.rename({'subjectivity':'neutral'},axis=1,inplace=True)
df_sub.head()

Unnamed: 0,id,comment_text,neutral,toxic
1124,1.36e+18,asaucegxd dont flex whore,1,1.0
1029,1.36e+18,lindseygrahamsc americans are ashamed of you h...,1,1.0
1682,1.36e+18,rt arvlnder came up off doge and bought a whol...,1,1.0
1540,1.36e+18,troche you gotta go to war and die smh,0,0.0
343,1.36e+18,rt strongblacklead every legend has an origin ...,0,0.0


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical


X_train, X_test, y_train, y_test = train_test_split(df_sub.comment_text.values, df_sub.id.values, test_size=0.5, random_state=42, stratify=df_sub[['toxic_class','neutral']])

In [None]:
data_df = df_sub[df_sub['id'].isin(y_train)]
data_df.head()

Unnamed: 0,id,comment_text,neutral,toxic_class
2,1361390000630157313,rwpusa solicitation of election fraud is a fel...,0,0
3,1361390047426134017,folkloredean the we needed therapy but instead...,1,0
4,1361389993437052928,thattimwalker i seem to recall thatginamiller ...,0,0
7,1361390033748525059,v wow we have lady flotus our s shine brightr ...,0,0
8,1361389986537439237,official mx jp i m first solo digital mini alb...,0,0


In [None]:
data_df = df_sub
data_df.head()

Unnamed: 0,id,comment_text,neutral,toxic
1124,1.36e+18,asaucegxd dont flex whore,1,1.0
1029,1.36e+18,lindseygrahamsc americans are ashamed of you h...,1,1.0
1682,1.36e+18,rt arvlnder came up off doge and bought a whol...,1,1.0
1540,1.36e+18,troche you gotta go to war and die smh,0,0.0
343,1.36e+18,rt strongblacklead every legend has an origin ...,0,0.0


In [None]:
test_df = df_sub[df_sub['id'].isin(y_test)]
test_df.head()

Unnamed: 0,id,comment_text,neutral,toxic_class
0,1361390039108907008,trumpwarroom rt to wish a happy president s da...,0,0
1,1361390020008038402,rules twt if uncomfortable was a picture t co ...,0,0
5,1361389991050563590,rachelelisep i love that no one in texas owns ...,1,0
6,1361390043311599618,itzsohamx uhmm i think this tweet is not for me,0,0
9,1361390036919451648,open closed and lock your door at the push of ...,0,0


In [None]:
data_df.rename({'neutral':'subjectivity','toxic_class':'toxicity'},axis=1,inplace=True)
data_df.head()

Unnamed: 0,id,comment_text,subjectivity,toxic
1124,1.36e+18,asaucegxd dont flex whore,1,1.0
1029,1.36e+18,lindseygrahamsc americans are ashamed of you h...,1,1.0
1682,1.36e+18,rt arvlnder came up off doge and bought a whol...,1,1.0
1540,1.36e+18,troche you gotta go to war and die smh,0,0.0
343,1.36e+18,rt strongblacklead every legend has an origin ...,0,0.0


In [None]:
test_df.rename({'neutral':'subjectivity','toxic_class':'toxicity'},axis=1,inplace=True)
test_df.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Unnamed: 0,id,comment_text,subjectivity,toxicity
0,1361390039108907008,trumpwarroom rt to wish a happy president s da...,0,0
1,1361390020008038402,rules twt if uncomfortable was a picture t co ...,0,0
5,1361389991050563590,rachelelisep i love that no one in texas owns ...,1,0
6,1361390043311599618,itzsohamx uhmm i think this tweet is not for me,0,0
9,1361390036919451648,open closed and lock your door at the push of ...,0,0


In [None]:
data_df.to_csv(f'{data_folder}/preprocessed_neg_train.csv',index=False)

In [None]:
test_df.to_csv(f'{data_folder}/preprocessed_neg_test.csv',index=False)

In [None]:
df = pd.read_csv(f'{data_folder}/preprocessed_neg_test.csv')
df.head()

Unnamed: 0,id,comment_text,subjectivity,toxicity
0,1361390039108907008,trumpwarroom rt to wish a happy president s da...,0,0
1,1361390020008038402,rules twt if uncomfortable was a picture t co ...,0,0
2,1361389991050563590,rachelelisep i love that no one in texas owns ...,1,0
3,1361390043311599618,itzsohamx uhmm i think this tweet is not for me,0,0
4,1361390036919451648,open closed and lock your door at the push of ...,0,0


In [None]:
df = pd.read_csv(f'{data_folder}/train_tweets.csv')
df.head()

Unnamed: 0,text,created_date_time,tweet_id,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,location,user_geo,link,toxic,subjectivity
0,rt rulestwt if uncomfortable was a picture,2/15/2021 19:00,1.36e+18,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1.18e+18,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal","(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,0,0
1,rt rwpusa solicitation of election fraud is a ...,2/15/2021 19:00,1.36e+18,Brad Bartram,bradbartram,37202990.0,"The Southern, East 5th Street, Skid Row, Downt...",a believer,False,,3967,"The Southern, East 5th Street, Skid Row, Downt...","(34.044146350000005, -118.24466336743033, 0.0)",https://twitter.com/twitter/status/13613900006...,0,0
2,rt thattimwalker i seem to recall thatginamill...,2/15/2021 19:00,1.36e+18,Sarah Craig 🇪🇺💙 #3.5% #FBPE #RejoinEU,sarahcraig52,1480613000.0,"Cheshire, England, United Kingdom",Wear a mask. Old & grey but only slightly wise...,False,,271,"Cheshire, England, United Kingdom","(53.2141028, -2.471770086071205, 0.0)",https://twitter.com/twitter/status/13613899934...,0,0
3,itzsohamx uhmm i think this tweet is not for me,2/15/2021 19:00,1.36e+18,ʂıɖɖɧı🍕,DeepveerLuv,9.57e+17,"Mumbai, Mumbai Suburban, Maharashtra, India",Madly and deeply crazy for deepveer🌍 Sadda Haq...,False,,0,"Mumbai, Mumbai Suburban, Maharashtra, India","(19.0759899, 72.8773928, 0.0)",https://twitter.com/twitter/status/13613900433...,0,0
4,open closed and lock your door at the push of ...,2/15/2021 19:00,1.36e+18,ASSA ABLOY Ent US,assaabloyentus,156391400.0,United States,Look to ASSA ABLOY Entrance Systems US for hig...,False,,0,United States,"(39.7837304, -100.4458825, 0.0)",https://twitter.com/twitter/status/13613900369...,0,0


In [None]:
df = df[["text", "subjectivity","toxic"]]
df.head()

Unnamed: 0,text,subjectivity,toxic
0,rt rulestwt if uncomfortable was a picture,0,0
1,rt rwpusa solicitation of election fraud is a ...,0,0
2,rt thattimwalker i seem to recall thatginamill...,0,0
3,itzsohamx uhmm i think this tweet is not for me,0,0
4,open closed and lock your door at the push of ...,0,0


In [None]:
df.rename({'toxic':'toxicity','text':'comment_text'},axis=1,inplace=True)
df.head()

Unnamed: 0,comment_text,subjectivity,toxicity
0,rt rulestwt if uncomfortable was a picture,0,0
1,rt rwpusa solicitation of election fraud is a ...,0,0
2,rt thattimwalker i seem to recall thatginamill...,0,0
3,itzsohamx uhmm i think this tweet is not for me,0,0
4,open closed and lock your door at the push of ...,0,0


In [None]:
df.to_csv(f'{data_folder}/preprocessed_neg_train.csv',index=False)

## Combined labeled dataset

In [None]:
df = pd.read_csv(f'{data_folder}/neutral_tweets_labeled.csv')
df['tweet_id'] = df['tweet_id'].astype(str)
df.set_index('tweet_id', inplace=True)
df.head()

Unnamed: 0_level_0,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,link,url,subjectivity,toxicity
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
1361390039108907008,RT @TrumpWarRoom___: RT to wish a Happy Presid...,2021-02-15 19:00:53,VaQuireboy,VaQuireboy,742076049618538496,"Virginia, United States","Family man, College educated, Independent, Vie...",False,,2124,"(37.1232245, -78.4927721, 0.0)",https://twitter.com/twitter/status/13613900391...,,1,0
1361390020008038402,"RT @Rules_twt: If ""uncomfortable"" was a pictur...",2021-02-15 19:00:48,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1182740439184658438,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,,1,0
1361390012466667520,RT @_ixcato: I think he's neat. :) #criticalro...,2021-02-15 19:00:47,Spectre’s Icy Touch,spectrealafete,40359764,"Toronto, Golden Horseshoe, Ontario, Canada",Mostly just flailing about changelings... and ...,False,,108,"(43.6534817, -79.3839347, 0.0)",https://twitter.com/twitter/status/13613900124...,,1,0
1361390051448545280,RT @Tomas_Verde: Who's the brave soul who is g...,2021-02-15 19:00:56,Taylor Moats,idigmoats,836340324205199361,"Whitemarsh Island, Chatham County, Georgia, 31...",,False,,26,"(32.0327121, -81.0142786, 0.0)",https://twitter.com/twitter/status/13613900514...,,1,0
1361390038056075267,@kenndold @RachelFields_ I really hope this is...,2021-02-15 19:00:53,Alejandra Felix,alexxfelixx95,1297156705,"Cambridge, Middlesex County, Massachusetts, Un...",tiny girl pretending to do chemistry by day @H...,False,,0,"(42.3750997, -71.1056157, 0.0)",https://twitter.com/twitter/status/13613900380...,,1,0


In [None]:
df2 = pd.read_csv(f'{data_folder}/negative_tweets_labeled.csv')
df2['tweet_id'] = df2['tweet_id'].astype(str)
df2.set_index('tweet_id', inplace=True)
df2.drop(['location'], axis='columns', inplace=True)
df2.head()

Unnamed: 0_level_0,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,url,subjectivity,toxicity
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1361169823707918336,It was Dio all along! Happy dumb holiday! 🖤 ht...,2021-02-15 04:25:49,𝕕𝕒𝕟𝕚𝕖𝕝𝕝𝕖 𝕤𝕝𝕒𝕦𝕥𝕖𝕣,DanielleSlauter,66803433,"Indianapolis, Marion, Indiana, United States",•✞•Level 26•INFJ•Aries•anime trash•@waifuwatch...,False,,0,"(39.7683331, -86.1583502, 0.0)",https://twitter.com/twitter/status/13611698237...,1,0
1360681569527222275,RT @AndrewStoeten: We could stop the pandemic....,2021-02-13 20:05:41,jordan meuIendyk,jordym1nine,19632321,"Toronto, Golden Horseshoe, Ontario, Canada",former child actor. chief of the BPD. human sh...,False,,166,"(43.6534817, -79.3839347, 0.0)",https://twitter.com/twitter/status/13606815695...,0,0
1359821276269993985,RT @TachaSmar: @Blessin50078066 @Chinyere2231 ...,2021-02-11 11:07:11,Renaissance🔱 ˢᵀ,renegadelia,1228398056602787841,"Universal, Veranópolis, Região Geográfica Imed...","Write your own verse!\r\n\r\nI am abundance, I...",False,,7,"(-28.9182121, -51.547104, 0.0)",https://twitter.com/twitter/status/13598212762...,1,1
1360730455146975238,not me going off on a grown man in the middle ...,2021-02-13 23:19:56,paula,uhhpaula,4540327527,"South Florida, Columbus, Cherokee County, Kans...",aries,False,,0,"(37.1677128, -94.8464256, 0.0)",https://twitter.com/twitter/status/13607304551...,0,0
1360393239107403778,RT @SenDuckworth: If you go just 10 stops down...,2021-02-13 00:59:57,Shannon Fx Halbur,UnderTallSWH,3212694291,"Raleigh, Wake County, North Carolina, United S...","Gun Violence Prevention Activist, with Moms De...",False,,276,"(35.7803977, -78.6390989, 0.0)",https://twitter.com/twitter/status/13603932391...,0,0


In [None]:
df.drop(['url'], axis='columns', inplace=True)
df.rename({'link':'url'},axis=1,inplace=True)
df.head()

Unnamed: 0_level_0,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,url,subjectivity,toxicity
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1361390039108907008,RT @TrumpWarRoom___: RT to wish a Happy Presid...,2021-02-15 19:00:53,VaQuireboy,VaQuireboy,742076049618538496,"Virginia, United States","Family man, College educated, Independent, Vie...",False,,2124,"(37.1232245, -78.4927721, 0.0)",https://twitter.com/twitter/status/13613900391...,1,0
1361390020008038402,"RT @Rules_twt: If ""uncomfortable"" was a pictur...",2021-02-15 19:00:48,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1182740439184658438,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,1,0
1361390012466667520,RT @_ixcato: I think he's neat. :) #criticalro...,2021-02-15 19:00:47,Spectre’s Icy Touch,spectrealafete,40359764,"Toronto, Golden Horseshoe, Ontario, Canada",Mostly just flailing about changelings... and ...,False,,108,"(43.6534817, -79.3839347, 0.0)",https://twitter.com/twitter/status/13613900124...,1,0
1361390051448545280,RT @Tomas_Verde: Who's the brave soul who is g...,2021-02-15 19:00:56,Taylor Moats,idigmoats,836340324205199361,"Whitemarsh Island, Chatham County, Georgia, 31...",,False,,26,"(32.0327121, -81.0142786, 0.0)",https://twitter.com/twitter/status/13613900514...,1,0
1361390038056075267,@kenndold @RachelFields_ I really hope this is...,2021-02-15 19:00:53,Alejandra Felix,alexxfelixx95,1297156705,"Cambridge, Middlesex County, Massachusetts, Un...",tiny girl pretending to do chemistry by day @H...,False,,0,"(42.3750997, -71.1056157, 0.0)",https://twitter.com/twitter/status/13613900380...,1,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10993 entries, 1361390039108907008 to 1360542664563716099
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               10993 non-null  object
 1   created_date_time  10993 non-null  object
 2   username           10992 non-null  object
 3   user_screen_name   10993 non-null  object
 4   user_id            10993 non-null  int64 
 5   user_location      10993 non-null  object
 6   user_description   10161 non-null  object
 7   verified           10993 non-null  bool  
 8   associated_place   179 non-null    object
 9   retweet_count      10993 non-null  int64 
 10  user_geo           10993 non-null  object
 11  url                5147 non-null   object
 12  subjectivity       10993 non-null  int64 
 13  toxicity           10993 non-null  int64 
dtypes: bool(1), int64(4), object(9)
memory usage: 1.5+ MB


In [None]:
df = df.append(df2)
df.head()

Unnamed: 0_level_0,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,url,subjectivity,toxicity
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1361390039108907008,RT @TrumpWarRoom___: RT to wish a Happy Presid...,2021-02-15 19:00:53,VaQuireboy,VaQuireboy,742076049618538496,"Virginia, United States","Family man, College educated, Independent, Vie...",False,,2124,"(37.1232245, -78.4927721, 0.0)",https://twitter.com/twitter/status/13613900391...,1,0
1361390020008038402,"RT @Rules_twt: If ""uncomfortable"" was a pictur...",2021-02-15 19:00:48,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1182740439184658438,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,1,0
1361390012466667520,RT @_ixcato: I think he's neat. :) #criticalro...,2021-02-15 19:00:47,Spectre’s Icy Touch,spectrealafete,40359764,"Toronto, Golden Horseshoe, Ontario, Canada",Mostly just flailing about changelings... and ...,False,,108,"(43.6534817, -79.3839347, 0.0)",https://twitter.com/twitter/status/13613900124...,1,0
1361390051448545280,RT @Tomas_Verde: Who's the brave soul who is g...,2021-02-15 19:00:56,Taylor Moats,idigmoats,836340324205199361,"Whitemarsh Island, Chatham County, Georgia, 31...",,False,,26,"(32.0327121, -81.0142786, 0.0)",https://twitter.com/twitter/status/13613900514...,1,0
1361390038056075267,@kenndold @RachelFields_ I really hope this is...,2021-02-15 19:00:53,Alejandra Felix,alexxfelixx95,1297156705,"Cambridge, Middlesex County, Massachusetts, Un...",tiny girl pretending to do chemistry by day @H...,False,,0,"(42.3750997, -71.1056157, 0.0)",https://twitter.com/twitter/status/13613900380...,1,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14839 entries, 1361390039108907008 to 1359742352680648705
Data columns (total 14 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   text               14839 non-null  object
 1   created_date_time  14839 non-null  object
 2   username           14838 non-null  object
 3   user_screen_name   14839 non-null  object
 4   user_id            14839 non-null  int64 
 5   user_location      14839 non-null  object
 6   user_description   13705 non-null  object
 7   verified           14839 non-null  bool  
 8   associated_place   239 non-null    object
 9   retweet_count      14839 non-null  int64 
 10  user_geo           14839 non-null  object
 11  url                8993 non-null   object
 12  subjectivity       14839 non-null  int64 
 13  toxicity           14839 non-null  int64 
dtypes: bool(1), int64(4), object(9)
memory usage: 1.6+ MB


In [None]:
dfl = pd.read_csv(f'{data_folder}/tweets_with_labels_url.csv')
dfl['tweet_id'] = dfl['tweet_id'].astype(str)
# dfl.drop(['location'], axis='columns', inplace=True)
dfl.head()

Unnamed: 0,tweet_id,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,link,toxic,severe_toxic,subjectivity
0,1.36e+18,"""don't be posting music they might think you b...",10/2/2021 10:43,Leah,leahkwilcox,8.79e+17,"Boise, Ada County, Idaho, United States",fuck it we ballin’ 💎 40% Stubborn 60% Water 10...,False,,0,"(43.6166163, -116.200886, 0.0)",https://twitter.com/twitter/status/13594528271...,1,1,1
1,1.36e+18,@MakoMutt you whore,15/2/2021 2:45,orion,Aluminemsiren,1703878000.0,"Texas, United States",30/M/gay. sometimes draws cute furries. gay fo...,False,,0,"(31.8160381, -99.5120986, 0.0)",https://twitter.com/twitter/status/13611445830...,1,0,1
2,1.36e+18,RT @BritneyHiatus: Justin Timberlake slut sham...,15/2/2021 3:22,tedfoxgains,tedfoxgains,1.35e+18,"London, Greater London, England, United Kingdom","(NSFW, 🔞, gay couple). We make #Gainer videos,...",False,,26897,"(51.5073219, -0.1276474, 0.0)",https://twitter.com/twitter/status/13611539170...,0,0,0
3,1.36e+18,RT @TheStanchion: Seriously just imagine you f...,9/2/2021 19:46,Pells Bells,kpelly,37878040.0,"Vancouver, District of North Vancouver, Britis...",#NAME?,False,,157,"(49.2608724, -123.1139529, 0.0)",https://twitter.com/twitter/status/13592271725...,0,0,0
4,1.36e+18,@AshIsFluffed If you could live in any fiction...,13/2/2021 0:16,NozieLess,NozieLess,3010675000.0,"Mountains, 198, Möserer Straße, Gemeinde Seefe...","🎮 Welcome to my Twitter. 💾\r\n\r\n20, Gamer, N...",False,,0,"(47.32770845, 11.180902759051147, 0.0)",https://twitter.com/twitter/status/13603823651...,0,0,0


In [None]:
toxic = dfl['toxic'].values.tolist()
severe_toxic = dfl['severe_toxic'].values.tolist()
toxic_class = []

for i in range(0,len(toxic)):
  if toxic[i] == 1 and severe_toxic[i] == 1:
    toxic_class.append(2)
  elif toxic[i] == 1 and severe_toxic[i] == 0:
    toxic_class.append(1)
  else:
    toxic_class.append(0)

toxic_class = np.array(toxic_class)
dfl['toxicity'] = toxic_class
dfl.drop(['toxic','severe_toxic'], axis='columns', inplace=True)
dfl.head()

Unnamed: 0,tweet_id,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,link,subjectivity,toxicity
0,1.36e+18,"""don't be posting music they might think you b...",10/2/2021 10:43,Leah,leahkwilcox,8.79e+17,"Boise, Ada County, Idaho, United States",fuck it we ballin’ 💎 40% Stubborn 60% Water 10...,False,,0,"(43.6166163, -116.200886, 0.0)",https://twitter.com/twitter/status/13594528271...,1,2
1,1.36e+18,@MakoMutt you whore,15/2/2021 2:45,orion,Aluminemsiren,1703878000.0,"Texas, United States",30/M/gay. sometimes draws cute furries. gay fo...,False,,0,"(31.8160381, -99.5120986, 0.0)",https://twitter.com/twitter/status/13611445830...,1,1
2,1.36e+18,RT @BritneyHiatus: Justin Timberlake slut sham...,15/2/2021 3:22,tedfoxgains,tedfoxgains,1.35e+18,"London, Greater London, England, United Kingdom","(NSFW, 🔞, gay couple). We make #Gainer videos,...",False,,26897,"(51.5073219, -0.1276474, 0.0)",https://twitter.com/twitter/status/13611539170...,0,0
3,1.36e+18,RT @TheStanchion: Seriously just imagine you f...,9/2/2021 19:46,Pells Bells,kpelly,37878040.0,"Vancouver, District of North Vancouver, Britis...",#NAME?,False,,157,"(49.2608724, -123.1139529, 0.0)",https://twitter.com/twitter/status/13592271725...,0,0
4,1.36e+18,@AshIsFluffed If you could live in any fiction...,13/2/2021 0:16,NozieLess,NozieLess,3010675000.0,"Mountains, 198, Möserer Straße, Gemeinde Seefe...","🎮 Welcome to my Twitter. 💾\r\n\r\n20, Gamer, N...",False,,0,"(47.32770845, 11.180902759051147, 0.0)",https://twitter.com/twitter/status/13603823651...,0,0


In [None]:
dfl = dfl[["tweet_id","text", "subjectivity","toxicity"]]
dfl.head()

Unnamed: 0,tweet_id,text,subjectivity,toxicity
0,1.36e+18,"""don't be posting music they might think you b...",1,2
1,1.36e+18,@MakoMutt you whore,1,1
2,1.36e+18,RT @BritneyHiatus: Justin Timberlake slut sham...,0,0
3,1.36e+18,RT @TheStanchion: Seriously just imagine you f...,0,0
4,1.36e+18,@AshIsFluffed If you could live in any fiction...,0,0


In [None]:
for i, row in dfl.iterrows():
  for j, row2 in df.iterrows():
    print(row2[0])
    if row[1] == row2[0]:
      print("\n")
      print(row[1])
      print(row[2])
      print(row[3])
      print("\n")
      df.at[j,'subjectivity'] = row[2]
      df.at[j,'toxicity'] = row[3]

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Julie: this show is for a younger audience 

*shows a man killing himself over and over again, consent doesnt matte… https://t.co/k9gic7Yt5E
@StevenBritt13 @ArizonaSage60 @JohnJHarwood Well, considering the correlated problem - that we have no requirements… https://t.co/aqZDsXzipi
RT @NigelGarbage: @BBCr4today Imagine being disabled and listening to you and David Davis opining about how many should die so that you can…
Everyone who followed kalina Callier needs to unfollow her stupid ass!
Again, Fuck Joss Whedon.

Yes, it is hard when your heroes let you down, but knowing the information we have now he… https://t.co/NMAIj2cuRf
@RepJoshHarder we are literally going to die. This needs to be fixed NOW! I have been calling since 12/28 every day… https://t.co/5iVwV3jgEz
RT @ANTINATALISTO: The possibilities are limitless... 

Except for the fact you will die, humanity will go extinct, the planet will be gobb…
RT @ZawThantKywe: 

In [None]:
df.head()

Unnamed: 0_level_0,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,url,subjectivity,toxicity
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
1361390039108907008,RT @TrumpWarRoom___: RT to wish a Happy Presid...,2021-02-15 19:00:53,VaQuireboy,VaQuireboy,742076049618538496,"Virginia, United States","Family man, College educated, Independent, Vie...",False,,2124,"(37.1232245, -78.4927721, 0.0)",https://twitter.com/twitter/status/13613900391...,1,0
1361390020008038402,"RT @Rules_twt: If ""uncomfortable"" was a pictur...",2021-02-15 19:00:48,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1182740439184658438,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,1,0
1361390012466667520,RT @_ixcato: I think he's neat. :) #criticalro...,2021-02-15 19:00:47,Spectre’s Icy Touch,spectrealafete,40359764,"Toronto, Golden Horseshoe, Ontario, Canada",Mostly just flailing about changelings... and ...,False,,108,"(43.6534817, -79.3839347, 0.0)",https://twitter.com/twitter/status/13613900124...,1,0
1361390051448545280,RT @Tomas_Verde: Who's the brave soul who is g...,2021-02-15 19:00:56,Taylor Moats,idigmoats,836340324205199361,"Whitemarsh Island, Chatham County, Georgia, 31...",,False,,26,"(32.0327121, -81.0142786, 0.0)",https://twitter.com/twitter/status/13613900514...,1,0
1361390038056075267,@kenndold @RachelFields_ I really hope this is...,2021-02-15 19:00:53,Alejandra Felix,alexxfelixx95,1297156705,"Cambridge, Middlesex County, Massachusetts, Un...",tiny girl pretending to do chemistry by day @H...,False,,0,"(42.3750997, -71.1056157, 0.0)",https://twitter.com/twitter/status/13613900380...,1,0


In [None]:
df.reset_index(level=0, inplace=True)
df['tweet_id'] = df['tweet_id'].astype(str)
df.head()

Unnamed: 0,tweet_id,text,created_date_time,username,user_screen_name,user_id,user_location,user_description,verified,associated_place,retweet_count,user_geo,url,subjectivity,toxicity
0,1361390039108907008,RT @TrumpWarRoom___: RT to wish a Happy Presid...,2021-02-15 19:00:53,VaQuireboy,VaQuireboy,742076049618538496,"Virginia, United States","Family man, College educated, Independent, Vie...",False,,2124,"(37.1232245, -78.4927721, 0.0)",https://twitter.com/twitter/status/13613900391...,1,0
1,1361390020008038402,"RT @Rules_twt: If ""uncomfortable"" was a pictur...",2021-02-15 19:00:48,𝓐𝓷𝓪 𝓒𝓻𝓲𝓼𝓽𝓲𝓷𝓪 ✨,bemAtoaAqui,1182740439184658438,"Moura, Beja, Baixo Alentejo, Alentejo, Portugal",♐️//#SimAosToiros// F 💓,False,,613,"(38.145868899999996, -7.36681873826084, 0.0)",https://twitter.com/twitter/status/13613900200...,1,0
2,1361390012466667520,RT @_ixcato: I think he's neat. :) #criticalro...,2021-02-15 19:00:47,Spectre’s Icy Touch,spectrealafete,40359764,"Toronto, Golden Horseshoe, Ontario, Canada",Mostly just flailing about changelings... and ...,False,,108,"(43.6534817, -79.3839347, 0.0)",https://twitter.com/twitter/status/13613900124...,1,0
3,1361390051448545280,RT @Tomas_Verde: Who's the brave soul who is g...,2021-02-15 19:00:56,Taylor Moats,idigmoats,836340324205199361,"Whitemarsh Island, Chatham County, Georgia, 31...",,False,,26,"(32.0327121, -81.0142786, 0.0)",https://twitter.com/twitter/status/13613900514...,1,0
4,1361390038056075267,@kenndold @RachelFields_ I really hope this is...,2021-02-15 19:00:53,Alejandra Felix,alexxfelixx95,1297156705,"Cambridge, Middlesex County, Massachusetts, Un...",tiny girl pretending to do chemistry by day @H...,False,,0,"(42.3750997, -71.1056157, 0.0)",https://twitter.com/twitter/status/13613900380...,1,0


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14839 entries, 0 to 14838
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   tweet_id           14839 non-null  object
 1   text               14839 non-null  object
 2   created_date_time  14839 non-null  object
 3   username           14838 non-null  object
 4   user_screen_name   14839 non-null  object
 5   user_id            14839 non-null  int64 
 6   user_location      14839 non-null  object
 7   user_description   13705 non-null  object
 8   verified           14839 non-null  bool  
 9   associated_place   239 non-null    object
 10  retweet_count      14839 non-null  int64 
 11  user_geo           14839 non-null  object
 12  url                8993 non-null   object
 13  subjectivity       14839 non-null  int64 
 14  toxicity           14839 non-null  int64 
dtypes: bool(1), int64(4), object(10)
memory usage: 1.6+ MB


In [None]:
df.to_csv(f'{data_folder}/labeled_tweets.csv',index=False)