In [None]:
# Install and import libraries

# !pip install pyspellchecker
# !pip install scattertext
# !pip install nltk

# Import Data Preprocessing and Wrangling libraries
import re
from tqdm.notebook import tqdm
import pandas as pd 
import numpy as np
from datetime import datetime

# Import NLP Libraries
import nltk
from spellchecker import SpellChecker

# Import Visualization Libraries
import matplotlib.pyplot as plt
import seaborn as sns 
import scattertext as st
from IPython.display import IFrame
from nltk.corpus import stopwords
import random 

# Download the NLTK resources (VADER lexicon and the stop-word corpus)
for _resource in ('vader_lexicon', 'stopwords'):
    nltk.download(_resource)

# English stop words, consulted by clean_tweet when filtering tokens
stop_words = stopwords.words('english')

# Silence distracting warnings
import warnings
warnings.filterwarnings('ignore')

# Initialize our tools
sns.set_style('darkgrid')

# Spell checker used to identify misspelled words
spell = SpellChecker()


def clean_tweet(tweet, stopword_list=None):
    """Normalize a raw tweet into a lowercase, space-separated string of tokens.

    Steps, in order: lowercase; delete apostrophes (so "don't" becomes
    "dont" instead of being split into "don" and "t"); strip @mentions,
    #hashtags and links; replace square-bracketed spans and every remaining
    character outside a-z / 0-9 with a space; drop stop words.

    Parameters
    ----------
    tweet : str or any
        Raw tweet text. Any non-string value (for example the float NaN that
        pandas uses for a missing cell, or None) yields an empty string.
    stopword_list : iterable of str, optional
        Words to remove from the result. When omitted, the module-level
        ``stop_words`` is used.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" when nothing is left.
    """
    # `np.float` no longer exists in current NumPy, so test for "is this text"
    # directly instead of "is this a float".
    if not isinstance(tweet, str):
        return ""

    # A set gives O(1) membership tests instead of scanning a list per token.
    blocked = frozenset(stop_words if stopword_list is None else stopword_list)

    text = tweet.lower()

    # Delete apostrophes outright so contractions stay a single token
    text = re.sub("'", "", text)

    # Remove mentions and hashtags
    text = re.sub(r"@[A-Za-z0-9_]+", "", text)
    text = re.sub(r"#[A-Za-z0-9_]+", "", text)

    # Remove links (the dot after "www" is escaped so it only matches a dot)
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"www\.\S+", "", text)

    # Remove square-bracketed spans together with their content
    text = re.sub(r"\[.*?\]", " ", text)

    # Everything that is not a lowercase letter or digit becomes a space;
    # this also covers the punctuation characters ( ) ! ?
    text = re.sub(r"[^a-z0-9]", " ", text)

    # Tokenize on whitespace, drop stop words, and re-join
    return " ".join(word for word in text.split() if word not in blocked)

In [2]:
# Read the tweet dataset from the CSV file into `df` and preview the first rows
import pandas as pd
df = pd.read_csv('./data/prochoice_prolife.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,author_id,author_name,author_username,created_at,id,public_metrics,text,retweet_count,like_count,target
0,28588,73506221,Oregon Right to Life,OR_RTL,2022-06-23 00:00:06+00:00,1539760170900799490,"{'retweet_count': 5, 'reply_count': 0, 'like_c...",We know that unsupported pregnancies can gener...,5,13,1
1,28587,96631851,αιяgσ*мαтι¢*σρтιχ,sacraficial,2022-06-23 00:01:14+00:00,1539760456977395712,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",If you are murdered by a George W. Bush weapon...,0,0,1
2,28586,3041829701,skb,skb37027,2022-06-23 00:04:30+00:00,1539761277702213633,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",.@MarshaBlackburn so much for #ProLife. https:...,0,0,1
3,28537,177260708,Right To Life League,Right2LifeLg,2022-06-23 00:09:50+00:00,1539762620055552001,"{'retweet_count': 6, 'reply_count': 3, 'like_c...",Follow our instagram for more posts like this!...,6,19,1
4,27450,1523376591052582912,No Forced Birth,NoForcedBirth,2022-06-23 00:09:57+00:00,1539762647922421763,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Another classy tweet from the forced birth cro...,0,1,0


In [3]:
# Narrow `df` to the five columns listed below; all other columns are discarded
df = df[['created_at', 'text', 'retweet_count', 'like_count', 'target']]
df

Unnamed: 0,created_at,text,retweet_count,like_count,target
0,2022-06-23 00:00:06+00:00,We know that unsupported pregnancies can gener...,5,13,1
1,2022-06-23 00:01:14+00:00,If you are murdered by a George W. Bush weapon...,0,0,1
2,2022-06-23 00:04:30+00:00,.@MarshaBlackburn so much for #ProLife. https:...,0,0,1
3,2022-06-23 00:09:50+00:00,Follow our instagram for more posts like this!...,6,19,1
4,2022-06-23 00:09:57+00:00,Another classy tweet from the forced birth cro...,0,1,0
...,...,...,...,...,...
56035,2022-07-03 23:57:19+00:00,Next June is going to be more fun #ProLife htt...,1,3,1
56036,2022-07-03 23:57:37+00:00,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,0,1,1
56037,2022-07-03 23:57:58+00:00,At a #WomensRights and #prochoice rally https:...,0,0,0
56038,2022-07-03 23:58:20+00:00,Je refuse leur vaccin expérimental. \n#monCorp...,2,4,0


In [4]:
# Build the working frame `data` from `df`: keep an untouched copy of the tweet
# text in `original_text` and replace the `created_at` strings with a parsed
# `datetime` column.
data = df.drop(columns='created_at')  # drop() returns a new frame; `df` is untouched
data['original_text'] = df['text']

# Parse all timestamps in one vectorized call rather than splitting each string
# on '+' and calling strptime per row. `utc=True` normalizes any offset to UTC,
# and `tz_localize(None)` then drops the timezone so the column holds naive
# datetimes. For the "+00:00" values in this dataset that is the same wall-clock
# time as before; missing values become NaT instead of raising.
data['datetime'] = pd.to_datetime(df['created_at'], utc=True).dt.tz_localize(None)
data

Unnamed: 0,text,retweet_count,like_count,target,original_text,datetime
0,We know that unsupported pregnancies can gener...,5,13,1,We know that unsupported pregnancies can gener...,2022-06-23 00:00:06
1,If you are murdered by a George W. Bush weapon...,0,0,1,If you are murdered by a George W. Bush weapon...,2022-06-23 00:01:14
2,.@MarshaBlackburn so much for #ProLife. https:...,0,0,1,.@MarshaBlackburn so much for #ProLife. https:...,2022-06-23 00:04:30
3,Follow our instagram for more posts like this!...,6,19,1,Follow our instagram for more posts like this!...,2022-06-23 00:09:50
4,Another classy tweet from the forced birth cro...,0,1,0,Another classy tweet from the forced birth cro...,2022-06-23 00:09:57
...,...,...,...,...,...,...
56035,Next June is going to be more fun #ProLife htt...,1,3,1,Next June is going to be more fun #ProLife htt...,2022-07-03 23:57:19
56036,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,0,1,1,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,2022-07-03 23:57:37
56037,At a #WomensRights and #prochoice rally https:...,0,0,0,At a #WomensRights and #prochoice rally https:...,2022-07-03 23:57:58
56038,Je refuse leur vaccin expérimental. \n#monCorp...,2,4,0,Je refuse leur vaccin expérimental. \n#monCorp...,2022-07-03 23:58:20


In [None]:
# Run every raw tweet through clean_tweet and overwrite data['text'] with the
# cleaned strings (the raw text is read from `df`)
tweets_text = df['text']
results = list(map(clean_tweet, tweets_text))
data['text'] = results
data

In [None]:
# Feature Extraction

# Token-level features computed from the cleaned text
data['words'] = data['text'].str.findall(r'\w+')         # list of word tokens per tweet
data['errors'] = data['words'].apply(spell.unknown)      # tokens the spell checker reports as unknown
data['errors_count'] = data['errors'].apply(len)
data['words_count'] = data['words'].apply(len)
data['sentence_length'] = data['text'].str.len()         # length of the cleaned text in characters

# Calendar features, taken with the vectorized .dt accessor instead of a
# Python lambda per row
data['hour'] = data['datetime'].dt.hour
data['date'] = data['datetime'].dt.date
data['month'] = data['datetime'].dt.month
data