In [None]:
import numpy as np
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns

#data processing
import re, string
import nltk

from sklearn import preprocessing
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split


#Naive Bayes
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

#keras
import tensorflow as tf
from tensorflow import keras


#metrics
from sklearn.metrics import accuracy_score, f1_score
from sklearn.metrics import classification_report, confusion_matrix

#set seed for reproducibility
# The seed is applied to the NumPy and TensorFlow global generators here; merely
# assigning the variable does not make anything reproducible on its own.
seed=42
np.random.seed(seed)
tf.random.set_seed(seed)

#set style for plots
sns.set_style("whitegrid")
# matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8-*" and
# deprecated the old names, so prefer the new name and fall back on older versions.
# (sns.despine() was dropped from this cell: with no figure open it only created
# an empty "Figure ... with 0 Axes" output.)
_whitegrid_style = (
    "seaborn-v0_8-whitegrid"
    if "seaborn-v0_8-whitegrid" in plt.style.available
    else "seaborn-whitegrid"
)
plt.style.use(_whitegrid_style)
plt.rc("figure", autolayout=True)
plt.rc("axes", labelweight="bold", labelsize="large", titleweight="bold", titlepad=10)

  plt.style.use("seaborn-whitegrid")


<Figure size 640x480 with 0 Axes>

In [None]:
import nltk
import numpy as np
import pandas as pd
import random
import re
import spacy

from nltk.corpus import twitter_samples
from nltk.tag import pos_tag_sents
from sklearn.cluster import DBSCAN
from sklearn.neighbors import DistanceMetric

In [None]:
from google.colab import drive

# Attach Google Drive to the Colab filesystem; the dataset below is read from it.
drive.mount(mountpoint='/content/drive')

Mounted at /content/drive


In [None]:
# Read the tweet export from the mounted Drive and preview the first rows.
df = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/Dataset/Live Tweet-Retweet/Gun Violence.csv'
)
df.head(n=5)

Unnamed: 0,Id,Tweet,Language,Created At,Topic,Query,Retweet Count,Tweet Id
0,6442dcb8ccf6c967bf6bf356,@VivekGRamaswamy And best case scenario for yo...,en,2023-04-21,Gun Violence,gun violence,0,1649487508047118336
1,6442dcbbccf6c967bf6bf35d,"Also, how the FUCK is this more important than...",en,2023-04-21,Gun Violence,gun violence,0,1649487290631110656
2,6442dcbeccf6c967bf6bf362,@wisgop *except from gun violence. Then they s...,en,2023-04-21,Gun Violence,gun violence,0,1649487205197402123
3,6442dcc0ccf6c967bf6bf365,End Gun Violence Epidemic https://t.co/z7y0siHcJw,en,2023-04-21,Gun Violence,gun violence,0,1649487166265851904
4,6442dcc1ccf6c967bf6bf368,No restrictions on who can possess guns or car...,en,2023-04-21,Gun Violence,gun violence,0,1649487062951686144


In [None]:
# Number of rows before de-duplication.
df.shape[0]

11524

In [None]:
# Keep the first occurrence of each distinct tweet text and renumber the rows
# from zero so the index is contiguous again.
df = (
    df
    .drop_duplicates(subset=['Tweet'])
    .reset_index(drop=True)
)
df.shape[0]

7983

In [None]:
# Patterns are compiled once, at definition time, instead of on every call.
_EMOJI_RE = re.compile(
    "["
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F1E0-\U0001F1FF"  # flags (iOS)
    "]+",
    flags=re.UNICODE,
)
_MENTION_OR_LINK_RE = re.compile(r"(?:\@|https?\://)\S+")
_NON_ASCII_RE = re.compile(r"[^\x00-\x7f]")
# Raw strings matter here: in a normal string literal "\b" is a backspace
# character, not a regex word boundary.
_TRAILING_HASHTAG_RE = re.compile(r"#(?!(?:hashtag)\b)[\w-]+(?=(?:\s+#[\w-]+)*\s*$)")
_HASH_OR_UNDERSCORE_RE = re.compile(r"#|_")
_MULTI_SPACE_RE = re.compile(r"\s\s+")
# The non-ASCII entries can no longer occur once _NON_ASCII_RE has run; they are
# kept so the deletion table is unchanged.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation + 'Ã'+'±'+'ã'+'¼'+'â'+'»'+'§')


def data_cleaning(line):
    """Normalise one raw tweet into lowercase ASCII text without tweet markup.

    Steps, in order: strip emoji, drop carriage returns and turn newlines into
    spaces, lowercase, remove @mentions and http(s) links, drop non-ASCII
    characters, remove the run of hashtags at the very end of the tweet, turn
    remaining '#' and '_' into word breaks, delete punctuation, and collapse
    repeated whitespace.

    The hashtag steps run *before* punctuation is deleted. Punctuation deletion
    removes '#', '_' and '$', so running it first (as this function used to)
    meant the hashtag patterns could never match.

    Parameters
    ----------
    line : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned text with no leading or trailing whitespace; may be empty.
    """
    clean_text = _EMOJI_RE.sub('', line)
    clean_text = clean_text.replace('\r', '').replace('\n', ' ').lower()
    clean_text = _MENTION_OR_LINK_RE.sub('', clean_text)
    clean_text = _NON_ASCII_RE.sub('', clean_text)
    # Drop the trailing block of hashtags, e.g. "... #guncontrol #nra".
    clean_text = " ".join(part.strip() for part in _TRAILING_HASHTAG_RE.split(clean_text))
    # Hashtags in the middle of a sentence keep their word; '#' and '_' become breaks.
    clean_text = " ".join(part.strip() for part in _HASH_OR_UNDERSCORE_RE.split(clean_text))
    clean_text = clean_text.translate(_PUNCT_TABLE)
    return _MULTI_SPACE_RE.sub(' ', clean_text).strip()

In [None]:
# Clean every tweet; the function is passed directly, no wrapper lambda needed.
df['clean_text'] = df['Tweet'].apply(data_cleaning)

In [None]:
import spacy.cli

# Download the large English model only when it is not installed yet, so that
# re-running the notebook does not repeat the download.
if not spacy.util.is_package("en_core_web_lg"):
    spacy.cli.download("en_core_web_lg")

[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')


In [None]:
import spacy

In [None]:
# Load the spaCy pipeline used below to turn each cleaned tweet into a vector.
nlp = spacy.load(name='en_core_web_lg')

In [None]:
from tqdm import tqdm
# Run every cleaned tweet through spaCy. `docs` keeps one Doc per row, while
# `sent_vecs` is keyed by the cleaned text, so rows whose cleaned text is
# identical share a single entry.
docs = []
sent_vecs = {}

for text in tqdm(df['clean_text']):
  doc = nlp(text)
  docs.append(doc)
  sent_vecs[text] = doc.vector

# Parallel lists: sentences[k] is the text whose vector is vectors[k].
sentences = list(sent_vecs)
vectors = list(sent_vecs.values())

100%|██████████| 7983/7983 [02:23<00:00, 55.76it/s]


In [None]:
# Gather the per-sentence vectors into one array, the input for DBSCAN below.
x = np.asarray(vectors)

In [None]:
# For each eps on the grid, record how many distinct labels DBSCAN produces.
# The noise label -1 is counted as a label too.
n_classes = {}

for i in tqdm(np.arange(0.001, 1, 0.002)):
  dbscan = DBSCAN(eps=i, min_samples=2, metric='cosine')
  dbscan.fit(x)
  n_classes[i] = np.unique(dbscan.labels_).size

# Final model used by the following cells.
# NOTE(review): eps=0.07 is hard-coded and is not read from n_classes here — confirm how it was chosen.
dbscan = DBSCAN(eps=0.07, min_samples=2, metric='cosine')
dbscan.fit(x)

100%|██████████| 500/500 [13:42<00:00,  1.64s/it]


In [None]:
# Cluster label assigned to each row of x by the final DBSCAN fit (-1 marks noise).
dbscan.labels_

array([ 0,  0,  0, ..., -1, -1,  0])

In [None]:
# Pair each unique cleaned sentence with its cluster label, then list the labels seen.
results = pd.DataFrame(dict(label=dbscan.labels_, sent=sentences))
pd.unique(results['label'])

array([ 0, -1,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15,
       16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
       33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
       50, 51, 52, 53, 54])

In [None]:
# Cleaned sentences that DBSCAN placed in cluster 0.
eg_result = results.loc[results['label'] == 0, 'sent'].tolist()

# Original tweets whose cleaned text is in that cluster, ordered by the
# 'Created At' column, with rows containing missing values dropped.
# NOTE(review): 'Created At' is sorted as loaded, not converted to datetime —
# the preview shows ISO dates, which would sort chronologically; confirm.
event_df = (
    df.loc[df['clean_text'].isin(eg_result), ['Created At', 'Tweet']]
    .sort_values(by='Created At')
    .dropna()
)

In [None]:
# Show the cluster-0 tweets (date and raw text), sorted by 'Created At'.
event_df

Unnamed: 0,Created At,Tweet
1058,2023-04-12,@drchethansathya @netflix @yularifilms @Northw...
947,2023-04-12,@ArtAcevedo @RealJamesWoods Does it hurt when ...
946,2023-04-12,@CalltoActivism It’s amazing of the amount of ...
1421,2023-04-12,@mgtd76 @TeslaHaxz @FenixAmmunition Thankfully...
1420,2023-04-12,@POTUS Maybe the U S government should stop st...
...,...,...
7460,2023-05-02,@NBCNews Texas Governor Caresless? What goings...
7461,2023-05-02,@Flightless223 @adorcharm @HoustonChron We hav...
7462,2023-05-02,@DawnsEcho @PleaseThink1776 @UnrealElHunter @T...
7372,2023-05-02,Baby boomer midwest democrats are interesting ...


In [None]:
# Raw tweet texts of the selected cluster as a plain Python list.
res = event_df['Tweet'].tolist()
res

["Mental illness + ANYTHING = mass murder. \n\nBombs. Automobiles. Fire. Poison. \n\nDictators. \n\nForceps... \n\nBut it's the guns. https://t.co/IJHe3zjb6m",
 "@StephenKing Mental illness + ANYTHING = mass murder. Bombs. Automobiles. Fire. Poison. Forceps... \n\nBut it's the guns."]