In [1]:
import json
import time
import pandas as pd
from TwitterFactCheck import TwitterFactCheck, DomainChecker
from PeakDetect import PeakDetector
from multiprocessing.dummy import Pool, Process

# 1. Initialization

In [2]:
with open("TwitterAPI.json", "r") as f:
    api_config = json.loads(f.read())["backup"]
    api = TwitterFactCheck(api_config)
df_cand = pd.read_csv("Data/Candidates/Candidates.csv", sep="\t").dropna(subset=["Position"])
df_sus_users = pd.read_csv("Data/Network/NetworkUsers.csv", sep="\t")
df_count = pd.read_csv("Data/Candidates/CandTweetsCount.csv", sep="\t").drop_duplicates(["Name", "Date"]).reset_index(drop=True)
checker = DomainChecker("Data/WebsiteCredibility.csv", "Data/UnrelatedWebsites.json")

# 2. Fetch new tweets

In [11]:
# time.sleep(6000)
fetch_status = dict()
start_time, end_time = "20220722", "20220727"
thread_fetch = Process(target=api.search_cand_tweets, kwargs={"status": fetch_status, "start_time": start_time, "end_time": end_time, "df_cand": df_cand})
thread_fetch.start()

In [10]:
fetch_status

{}

In [22]:
df_tweets = pd.concat(fetch_status["res"])
df_tweets.set_index(["id"]).drop(["withheld"], axis=1).to_csv("Data/Candidates/NewTweets.csv", sep="\t", index_label="id")

In [21]:
thread_fetch.is_alive()

False

In [20]:
fetch_status["i"]

710

# 3. Clean and save the tweets 

In [25]:
%%time
df_tweets = pd.read_csv("Data/Candidates/NewTweets.csv", sep="\t")
df_tweets = api.clean_tweets(df_tweets, df_cand, checker)
if "Id" in df_tweets.columns:
    df_tweets = df_tweets.set_index(["Id"])

  series = series.str.replace(r"(@[\w|\d]+|\#[\w|\d]+|https\S+)", " ")
  series = series.str.replace(s, "")
  return series.str.replace(r"\s+", " ")


CPU times: user 52min 28s, sys: 2.54 s, total: 52min 30s
Wall time: 52min 32s


In [60]:
%%time
# Save the cleaned tweets
df_tweets = df_tweets[df_tweets["Content"]!=""]
df_tweets.sort_values(["Date", "Name", "Id"]).to_csv("Data/Candidates/NewTweets.csv", sep="\t", index_label="Id")

In [27]:
# Update the collected tweets from suspicious users
df_sus_user_tweets = pd.read_csv("Data/Candidates/SusUserTweets.csv", sep="\t")
df_sus_user_tweets = pd.concat([df_sus_user_tweets, df_tweets[df_tweets["Author_id"].isin(df_sus_users["User_id"])]])
df_sus_user_tweets = df_sus_user_tweets.drop_duplicates(["Id"]).sort_values(["Date", "Name", "Id"]).drop(["Credibility"], axis=1)
df_sus_user_tweets.set_index(["Id"]).to_csv("Data/Candidates/SusUserTweets.csv", sep="\t", index_label="Id")

In [88]:
# Update the collected tweets from suspicious domain
df_sus_domain_tweets = pd.read_csv("Data/Candidates/SusDomainTweets.csv", sep="\t")
df_sus_domain_tweets = pd.concat([df_sus_domain_tweets, df_tweets[df_tweets["Credibility"]==0]])
df_sus_domain_tweets = df_sus_domain_tweets.drop_duplicates(["Id"]).sort_values(["Date", "Name", "Id"]).drop(["Credibility"], axis=1)
df_sus_domain_tweets.set_index(["Id"]).to_csv("Data/Candidates/SusDomainTweets.csv", sep="\t", index_label="Id")

# 4. Find the peaks with the newest data

In [91]:
%%time
df_tweets = pd.read_csv("Data/Candidates/NewTweets.csv", sep="\t")

CPU times: user 6.66 s, sys: 631 ms, total: 7.29 s
Wall time: 9.56 s


In [92]:
%%time
detector = PeakDetector(df_cand, df_sus_users, df_tweets, df_count)
df_count = detector()

CPU times: user 59.8 s, sys: 291 ms, total: 1min
Wall time: 60 s


In [95]:
df_count[df_count["Date"]==20220726]

Unnamed: 0,Name,Month,Date,TweetCount,SusUserCount,SusDomainCount,MonthTweetCount,TweetPeakIQR,SusUserPeakIQR,SusDomainPeakIQR
206,Aaron Del Mar,202207,20220726,0,0,0,0,0.0,0.0,0.0
413,Aaron Lieberman,202207,20220726,2,0,0,4,0.0,0.0,0.0
620,Aaron Sims,202207,20220726,5,0,0,17,0.0,0.0,0.0
827,Abigail Spanberger,202207,20220726,109,0,0,966,0.0,0.0,0.0
1034,Abraham Hamadeh,202207,20220726,782,0,1,7505,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
146348,Yuh-Line Niou,202207,20220726,981,0,1,5310,0.0,0.0,0.0
146555,Zach Conine,202207,20220726,3,0,0,19,0.0,0.0,0.0
146762,Zachary Varon,202207,20220726,0,0,0,1,0.0,0.0,0.0
146969,Zellnor Myrie,202207,20220726,0,0,0,0,0.0,0.0,0.0


In [9]:
df_count.set_index(["Name"]).to_csv("Data/Candidates/CandTweetsCount.csv", sep="\t",index_label="Name")

# 5. Analyze the relation with peaks and politicfacts

In [11]:
df_count = pd.read_csv("Data/Candidates/CandTweetsCount.csv", sep="\t")
df_pf = pd.read_csv("Data/PoliticFact.csv", sep="\t")

In [57]:
df_counts = df_count[df_count["Name"]=="Greg Abbott"]
plot_peak(df_counts, "SusUserCount")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_counts["Counts"] = df_counts[field] / df_counts[field].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_counts["Date"] = pd.to_datetime(df_counts["Date"].astype(str))
