In [8]:
# Install and import libraries

# !pip install pyspellchecker
# !pip install scattertext
# !pip install nltk

# Import Data Preprocessing and Wrangling libraries
import re
from tqdm.notebook import tqdm
import pandas as pd 
import numpy as np
from datetime import datetime

# Import NLP Libraries
import nltk

# Import Visualization Libraries
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import random 

# Download the NLTK stop-word corpus (reported as already up-to-date when cached locally)
nltk.download('stopwords')
# English stop words; clean_tweet() filters tokens against this collection
stop_words = stopwords.words('english')

def clean_tweet(tweet, exclude=None):
    """Normalise a raw tweet into lowercase, space-separated word tokens.

    Processing order: lowercase -> drop apostrophes -> drop @mentions and
    #hashtags -> drop links -> drop punctuation and bracketed text -> keep
    only ``a-z`` / ``0-9`` -> remove stop words.

    Parameters
    ----------
    tweet : str or missing value
        Raw tweet text. Anything that is not a ``str`` (for example the float
        ``NaN`` pandas uses for an empty cell, or ``None``) is treated as
        missing.
    exclude : iterable of str, optional
        Words to discard after tokenising. Defaults to the module-level
        ``stop_words`` collection.

    Returns
    -------
    str
        Surviving tokens joined by single spaces; ``""`` for missing input or
        when nothing survives the cleaning.
    """
    # `np.float` was removed in NumPy 1.24, so the previous
    # `type(tweet) == np.float` guard raises AttributeError on current
    # installs. Checking for "not a string" covers NaN and None alike.
    if not isinstance(tweet, str):
        return ""

    # A set gives constant-time membership tests instead of scanning a list
    # once per token.
    blocked = set(stop_words if exclude is None else exclude)

    # lowercase all the letters
    temp = tweet.lower()

    # drop apostrophes so contractions survive as one token ("i'm" -> "im")
    # instead of being split by the non-alphanumeric filter below
    temp = re.sub("'", "", temp)

    # remove mentions and hashtags
    temp = re.sub(r"@[A-Za-z0-9_]+", "", temp)
    temp = re.sub(r"#[A-Za-z0-9_]+", "", temp)

    # remove links; the dot after "www" is escaped so that ordinary words
    # which merely contain "www" are not swallowed
    temp = re.sub(r"http\S+", "", temp)
    temp = re.sub(r"www\.\S+", "", temp)

    # remove punctuation and any bracketed text
    temp = re.sub(r"[()!?]", " ", temp)
    temp = re.sub(r"\[.*?\]", " ", temp)

    # filter non-alphanumeric characters
    # NOTE(review): this is ASCII-only, so accented letters split words
    # ("expérimental" -> "exp rimental"); confirm that is acceptable.
    temp = re.sub(r"[^a-z0-9]", " ", temp)

    # split on whitespace, drop stop words and join back into one string
    return " ".join(word for word in temp.split() if word not in blocked)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/xingyuchen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
import pandas as pd
# Load the tweet dataset (path is relative to the notebook's working directory)
df = pd.read_csv('./data/prochoice_prolife.csv')
# Preview the first rows to check which columns were read
df.head()

Unnamed: 0.1,Unnamed: 0,author_id,author_name,author_username,created_at,id,public_metrics,text,retweet_count,like_count,target
0,28588,73506221,Oregon Right to Life,OR_RTL,2022-06-23 00:00:06+00:00,1539760170900799490,"{'retweet_count': 5, 'reply_count': 0, 'like_c...",We know that unsupported pregnancies can gener...,5,13,1
1,28587,96631851,αιяgσ*мαтι¢*σρтιχ,sacraficial,2022-06-23 00:01:14+00:00,1539760456977395712,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",If you are murdered by a George W. Bush weapon...,0,0,1
2,28586,3041829701,skb,skb37027,2022-06-23 00:04:30+00:00,1539761277702213633,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",.@MarshaBlackburn so much for #ProLife. https:...,0,0,1
3,28537,177260708,Right To Life League,Right2LifeLg,2022-06-23 00:09:50+00:00,1539762620055552001,"{'retweet_count': 6, 'reply_count': 3, 'like_c...",Follow our instagram for more posts like this!...,6,19,1
4,27450,1523376591052582912,No Forced Birth,NoForcedBirth,2022-06-23 00:09:57+00:00,1539762647922421763,"{'retweet_count': 0, 'reply_count': 0, 'like_c...",Another classy tweet from the forced birth cro...,0,1,0


In [10]:
# Keep only the timestamp, tweet body, engagement counts and the `target` column.
kept_columns = ['created_at', 'text', 'retweet_count', 'like_count', 'target']
df = df[kept_columns]
df

Unnamed: 0,created_at,text,retweet_count,like_count,target
0,2022-06-23 00:00:06+00:00,We know that unsupported pregnancies can gener...,5,13,1
1,2022-06-23 00:01:14+00:00,If you are murdered by a George W. Bush weapon...,0,0,1
2,2022-06-23 00:04:30+00:00,.@MarshaBlackburn so much for #ProLife. https:...,0,0,1
3,2022-06-23 00:09:50+00:00,Follow our instagram for more posts like this!...,6,19,1
4,2022-06-23 00:09:57+00:00,Another classy tweet from the forced birth cro...,0,1,0
...,...,...,...,...,...
56035,2022-07-03 23:57:19+00:00,Next June is going to be more fun #ProLife htt...,1,3,1
56036,2022-07-03 23:57:37+00:00,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,0,1,1
56037,2022-07-03 23:57:58+00:00,At a #WomensRights and #prochoice rally https:...,0,0,0
56038,2022-07-03 23:58:20+00:00,Je refuse leur vaccin expérimental. \n#monCorp...,2,4,0


In [11]:
# Work on a copy so the frame loaded into `df` is left as it is.
data = df.copy()
# Keep the unprocessed tweet next to the column that will later hold the cleaned text.
data['original_text'] = df['text']
# created_at is rendered like "2022-06-23 00:00:06+00:00": cut everything from the
# "+" offset onwards and parse the remainder as a naive timestamp. Done with the
# vectorised string accessor and pd.to_datetime instead of a per-row strptime.
data['datetime'] = pd.to_datetime(
    df['created_at'].astype(str).str.split('+').str[0],
    format="%Y-%m-%d %H:%M:%S",
)
data = data.drop(columns='created_at')
data

Unnamed: 0,text,retweet_count,like_count,target,original_text,datetime
0,We know that unsupported pregnancies can gener...,5,13,1,We know that unsupported pregnancies can gener...,2022-06-23 00:00:06
1,If you are murdered by a George W. Bush weapon...,0,0,1,If you are murdered by a George W. Bush weapon...,2022-06-23 00:01:14
2,.@MarshaBlackburn so much for #ProLife. https:...,0,0,1,.@MarshaBlackburn so much for #ProLife. https:...,2022-06-23 00:04:30
3,Follow our instagram for more posts like this!...,6,19,1,Follow our instagram for more posts like this!...,2022-06-23 00:09:50
4,Another classy tweet from the forced birth cro...,0,1,0,Another classy tweet from the forced birth cro...,2022-06-23 00:09:57
...,...,...,...,...,...,...
56035,Next June is going to be more fun #ProLife htt...,1,3,1,Next June is going to be more fun #ProLife htt...,2022-07-03 23:57:19
56036,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,0,1,1,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,2022-07-03 23:57:37
56037,At a #WomensRights and #prochoice rally https:...,0,0,0,At a #WomensRights and #prochoice rally https:...,2022-07-03 23:57:58
56038,Je refuse leur vaccin expérimental. \n#monCorp...,2,4,0,Je refuse leur vaccin expérimental. \n#monCorp...,2022-07-03 23:58:20


In [12]:
# Run clean_tweet over every tweet in `df` and store the cleaned strings in
# `data['text']`; assignment is positional, one result per row.
tweets_text = df['text']
results = list(map(clean_tweet, tweets_text))
data['text'] = results

data

Unnamed: 0,text,retweet_count,like_count,target,original_text,datetime
0,know unsupported pregnancies generate lot fear...,5,13,1,We know that unsupported pregnancies can gener...,2022-06-23 00:00:06
1,murdered george w bush weapon war died free bo...,0,0,1,If you are murdered by a George W. Bush weapon...,2022-06-23 00:01:14
2,much,0,0,1,.@MarshaBlackburn so much for #ProLife. https:...,2022-06-23 00:04:30
3,follow instagram posts like,6,19,1,Follow our instagram for more posts like this!...,2022-06-23 00:09:50
4,another classy tweet forced birth crowd idea g...,0,1,0,Another classy tweet from the forced birth cro...,2022-06-23 00:09:57
...,...,...,...,...,...,...
56035,next june going fun,1,3,1,Next June is going to be more fun #ProLife htt...,2022-07-03 23:57:19
56036,im alarmed hear arrest made,0,1,1,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,2022-07-03 23:57:37
56037,rally,0,0,0,At a #WomensRights and #prochoice rally https:...,2022-07-03 23:57:58
56038,je refuse leur vaccin exp rimental,2,4,0,Je refuse leur vaccin expérimental. \n#monCorp...,2022-07-03 23:58:20


In [39]:
# Feature Extraction
import math
# Tokenise the cleaned text and derive length features with pandas' vectorised
# string accessor rather than a Python-level lambda per row.
data['words'] = data['text'].str.findall(r'\w+')
data['words_count'] = data['words'].str.len()        # number of tokens
data['sentence_length'] = data['text'].str.len()     # characters in the cleaned text
# Log-scaled variants are disabled. np.log turns a zero length/count into -inf,
# so np.log1p is the safer transform if these are switched back on.
# data['sentence_length_in_log'] = data.sentence_length.apply(np.log1p)
# Calendar features taken straight from the datetime accessor.
data['hour'] = data['datetime'].dt.hour
data['date'] = data['datetime'].dt.date
data['month'] = data['datetime'].dt.month
# data['retweet_count_in_log'] = data.retweet_count.apply(np.log1p)
# data['like_count_in_log'] = data.like_count.apply(np.log1p)
data

Unnamed: 0,text,retweet_count,like_count,target,original_text,datetime,words,words_count,sentence_length,hour,date,month,sentence_length_in_log,positive_words_count,retweet_count_in_log,like_count_in_log
0,know unsupported pregnancies generate lot fear...,5,13,1,We know that unsupported pregnancies can gener...,2022-06-23 00:00:06,"[know, unsupported, pregnancies, generate, lot...",20,146,0,2022-06-23,6,4.983607,0,1.609438,2.564949
1,murdered george w bush weapon war died free bo...,0,0,1,If you are murdered by a George W. Bush weapon...,2022-06-23 00:01:14,"[murdered, george, w, bush, weapon, war, died,...",15,85,0,2022-06-23,6,4.442651,0,-inf,-inf
2,much,0,0,1,.@MarshaBlackburn so much for #ProLife. https:...,2022-06-23 00:04:30,[much],1,4,0,2022-06-23,6,1.386294,0,-inf,-inf
3,follow instagram posts like,6,19,1,Follow our instagram for more posts like this!...,2022-06-23 00:09:50,"[follow, instagram, posts, like]",4,27,0,2022-06-23,6,3.295837,0,1.791759,2.944439
4,another classy tweet forced birth crowd idea g...,0,1,0,Another classy tweet from the forced birth cro...,2022-06-23 00:09:57,"[another, classy, tweet, forced, birth, crowd,...",13,75,0,2022-06-23,6,4.317488,0,-inf,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
56035,next june going fun,1,3,1,Next June is going to be more fun #ProLife htt...,2022-07-03 23:57:19,"[next, june, going, fun]",4,19,23,2022-07-03,7,2.944439,0,0.000000,1.098612
56036,im alarmed hear arrest made,0,1,1,@Hammock_Thomas @MariaLaoise I'm alarmed to he...,2022-07-03 23:57:37,"[im, alarmed, hear, arrest, made]",5,27,23,2022-07-03,7,3.295837,0,-inf,0.000000
56037,rally,0,0,0,At a #WomensRights and #prochoice rally https:...,2022-07-03 23:57:58,[rally],1,5,23,2022-07-03,7,1.609438,0,-inf,-inf
56038,je refuse leur vaccin exp rimental,2,4,0,Je refuse leur vaccin expérimental. \n#monCorp...,2022-07-03 23:58:20,"[je, refuse, leur, vaccin, exp, rimental]",6,34,23,2022-07-03,7,3.526361,0,0.693147,1.386294


In [40]:
# Columns carried forward: raw engagement counts, text-length features, posting
# hour and the `target` column.
# Disabled alternative using the log-scaled columns:
# ['retweet_count_in_log', 'like_count_in_log', 'words_count', 'sentence_length_in_log', 'hour', 'target']
feature_columns = ['retweet_count', 'like_count', 'words_count', 'sentence_length', 'hour', 'target']
preprocess_data = data[feature_columns]

preprocess_data

Unnamed: 0,retweet_count,like_count,words_count,sentence_length,hour,target
0,5,13,20,146,0,1
1,0,0,15,85,0,1
2,0,0,1,4,0,1
3,6,19,4,27,0,1
4,0,1,13,75,0,0
...,...,...,...,...,...,...
56035,1,3,4,19,23,1
56036,0,1,5,27,23,1
56037,0,0,1,5,23,0
56038,2,4,6,34,23,0


In [41]:
# Summary statistics (count, mean, std, min, quartiles, max) for each selected column
preprocess_data.describe()

Unnamed: 0,retweet_count,like_count,words_count,sentence_length,hour,target
count,56040.0,56040.0,56040.0,56040.0,56040.0,56040.0
mean,1.135046,5.561046,10.381103,67.771074,13.058547,0.510153
std,16.148907,98.445893,8.501427,55.495964,6.908186,0.499901
min,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,3.0,18.0,7.0,0.0
50%,0.0,0.0,9.0,59.0,15.0,1.0
75%,0.0,1.0,16.0,110.0,19.0,1.0
max,2009.0,17377.0,51.0,268.0,23.0,1.0


In [46]:
# Standardise each feature column: subtract its mean and divide by its standard
# deviation (z-score). The `target` column is excluded from scaling.
# NOTE(review): this rebinds `df`, which earlier cells use for the loaded tweets,
# so those cells cannot be re-run after this one without reloading the CSV.
df = preprocess_data.drop(columns='target')
# Disabled min-max alternative:
# normalized_df = (df - df.min()) / (df.max() - df.min())
feature_mean = df.mean()
feature_std = df.std()
normalized_df = df.sub(feature_mean, axis='columns').div(feature_std, axis='columns')
normalized_df

Unnamed: 0,retweet_count,like_count,words_count,sentence_length,hour
0,0.239332,0.075564,1.131445,1.409633,-1.890301
1,-0.070286,-0.056488,0.543308,0.310454,-1.890301
2,-0.070286,-0.056488,-1.103474,-1.149112,-1.890301
3,0.301256,0.136511,-0.750592,-0.734667,-1.890301
4,-0.070286,-0.046330,0.308054,0.130260,-1.890301
...,...,...,...,...,...
56035,-0.008363,-0.026015,-0.750592,-0.878822,1.439083
56036,-0.070286,-0.046330,-0.632965,-0.734667,1.439083
56037,-0.070286,-0.056488,-1.103474,-1.131093,1.439083
56038,0.053561,-0.015857,-0.515337,-0.608532,1.439083


In [47]:
# Put the unscaled `target` column in front of the standardised features,
# aligning the two pieces on the row index.
final_result = pd.concat(
    objs=[preprocess_data['target'], normalized_df],
    axis='columns',
)
final_result

Unnamed: 0,target,retweet_count,like_count,words_count,sentence_length,hour
0,1,0.239332,0.075564,1.131445,1.409633,-1.890301
1,1,-0.070286,-0.056488,0.543308,0.310454,-1.890301
2,1,-0.070286,-0.056488,-1.103474,-1.149112,-1.890301
3,1,0.301256,0.136511,-0.750592,-0.734667,-1.890301
4,0,-0.070286,-0.046330,0.308054,0.130260,-1.890301
...,...,...,...,...,...,...
56035,1,-0.008363,-0.026015,-0.750592,-0.878822,1.439083
56036,1,-0.070286,-0.046330,-0.632965,-0.734667,1.439083
56037,0,-0.070286,-0.056488,-1.103474,-1.131093,1.439083
56038,0,0.053561,-0.015857,-0.515337,-0.608532,1.439083


In [48]:
# Write `target` plus the standardised features to disk; index=False leaves the row index out of the file
final_result.to_csv('clean.csv', index=False)