In [1]:
from itertools import product
import json
import time
import pandas as pd
from TwitterFactCheck import TwitterFactCheck, DomainChecker
from PeakDetect import PeakDetector
from multiprocessing.dummy import Pool, Process

# 1. Initialization

In [2]:
with open("TwitterAPI.json", "r") as f:
    api_config = json.loads(f.read())["backup"]
    api = TwitterFactCheck(api_config)
df_cand = pd.read_csv("Data/Candidates/Candidates.csv", sep="\t").dropna(subset=["Position"])
df_sus_users = pd.read_csv("Data/Network/NetworkUsers.csv", sep="\t")
df_count = pd.read_csv("Data/Candidates/CandTweetsCount.csv", sep="\t").drop_duplicates(["Name", "Date"]).reset_index(drop=True)
checker = DomainChecker("Data/WebsiteCredibility.csv", "Data/UnrelatedWebsites.json")

# 2. Fetch new tweets

In [4]:
fetch_status = dict()
start_time, end_time = "20220701", "20220715"
thread_fetch = Process(target=api.search_cand_tweets, kwargs={"status": fetch_status, "start_time": start_time, "end_time": end_time, "df_cand": df_cand})
thread_fetch.start()

In [5]:
thread_fetch.is_alive()

True

In [None]:
fetch_status["i"]

In [None]:
df_tweets = pd.concat(fetch_status["res"])
df_tweets.set_index(["id"]).drop(["withheld"], axis=1).to_csv("Data/Candidates/NewTweets.csv", sep="\t", index_label="id")

# 3. Clean and save the tweets 

In [25]:
%%time
df_tweets = pd.read_csv("Data/Candidates/NewTweets.csv", sep="\t")
df_tweets = api.clean_tweets(df_tweets, df_cand, checker)
if "Id" in df_tweets.columns:
    df_tweets = df_tweets.set_index(["Id"])

  series = series.str.replace(r"(@[\w|\d]+|\#[\w|\d]+|https\S+)", " ")
  series = series.str.replace(s, "")
  return series.str.replace(r"\s+", " ")


CPU times: user 52min 28s, sys: 2.54 s, total: 52min 30s
Wall time: 52min 32s


In [60]:
%%time
# Save the cleaned tweets
df_tweets = df_tweets[df_tweets["Content"]!=""]
df_tweets.sort_values(["Date", "Name", "Id"]).to_csv("Data/Candidates/NewTweets.csv", sep="\t", index_label="Id")

In [31]:
df_sus_user_tweets = pd.read_csv("Data/Candidates/SusUserTweets.csv", sep="\t")

In [32]:
df_sus_user_tweets.shape

(225096, 10)

In [27]:
# Update the collected tweets from suspicious users
df_sus_user_tweets = pd.read_csv("Data/Candidates/SusUserTweets.csv", sep="\t")
df_sus_user_tweets = pd.concat([df_sus_user_tweets, df_tweets[df_tweets["Author_id"].isin(df_sus_users["User_id"])]])
df_sus_user_tweets = df_sus_user_tweets.drop_duplicates(["Id"]).sort_values(["Date", "Name", "Id"]).drop(["Credibility"], axis=1)
df_sus_user_tweets.set_index(["Id"]).to_csv("Data/Candidates/SusUserTweets.csv", sep="\t", index_label="Id")

In [33]:
df_sus_domain_tweets = pd.read_csv("Data/Candidates/SusDomainTweets.csv", sep="\t")

In [34]:
df_sus_domain_tweets.shape

(610426, 10)

In [88]:
# Update the collected tweets from suspicious domain
df_sus_domain_tweets = pd.read_csv("Data/Candidates/SusDomainTweets.csv", sep="\t")
df_sus_domain_tweets = pd.concat([df_sus_domain_tweets, df_tweets[df_tweets["Credibility"]==0]])
df_sus_domain_tweets = df_sus_domain_tweets.drop_duplicates(["Id"]).sort_values(["Date", "Name", "Id"]).drop(["Credibility"], axis=1)
df_sus_domain_tweets.set_index(["Id"]).to_csv("Data/Candidates/SusDomainTweets.csv", sep="\t", index_label="Id")

# 4. Find the peaks with the newest data

In [91]:
%%time
df_tweets = pd.read_csv("Data/Candidates/NewTweets.csv", sep="\t")

CPU times: user 6.66 s, sys: 631 ms, total: 7.29 s
Wall time: 9.56 s


In [92]:
%%time
detector = PeakDetector(df_cand, df_sus_users, df_tweets, df_count)
df_count = detector()

CPU times: user 59.8 s, sys: 291 ms, total: 1min
Wall time: 60 s


In [101]:
df_count.set_index(["Name"]).to_csv("Data/Candidates/CandTweetsCount.csv", sep="\t",index_label="Name")

In [12]:
df_count

Unnamed: 0,Name,Month,Date,TweetCount,SusUserCount,SusDomainCount,MonthTweetCount,TweetPeakIQR,SusUserPeakIQR,SusDomainPeakIQR
0,Aaron Del Mar,202207,20220101,0,0,0,1,0.0,0.0,0.0
1,Aaron Del Mar,202207,20220102,0,0,0,1,0.0,0.0,0.0
2,Aaron Del Mar,202207,20220103,0,0,0,1,0.0,0.0,0.0
3,Aaron Del Mar,202207,20220104,0,0,0,1,0.0,0.0,0.0
4,Aaron Del Mar,202207,20220105,0,0,0,1,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
147172,Zephyr Teachout,202207,20220722,2,0,0,11,0.0,0.0,0.0
147173,Zephyr Teachout,202207,20220723,0,0,0,11,0.0,0.0,0.0
147174,Zephyr Teachout,202207,20220724,2,0,0,11,0.0,0.0,0.0
147175,Zephyr Teachout,202207,20220725,3,0,0,11,0.0,0.0,0.0


# 5. Analyze the relation with peaks and politicfacts

In [1]:
from itertools import product
import json
import time
import pandas as pd
from TwitterFactCheck import TwitterFactCheck, DomainChecker
from PeakDetect import PeakDetector
from multiprocessing.dummy import Pool, Process
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

In [4]:
df_count = pd.read_csv("Data/Candidates/CandTweetsCount.csv", sep="\t")
df_pf = pd.read_csv("Data/PoliticFact.csv", sep="\t")

In [21]:
df_cand[df_cand["Name"]=="Joe Biden"]

Unnamed: 0,Name,Party,Twitter,State,Position
383,Joe Biden,Democratic,https://www.twitter.com/JoeBiden,Maine,Secretary of State


In [9]:
df_metrics = PeakDetector.get_metrics(df_count, df_pf)
df_metrics

Unnamed: 0,Unnamed: 1,f1,precision,recall,accuracy
Tweet,1.5,0.000848,0.000428,0.04717,0.919971
Tweet,3.0,0.000304,0.000154,0.009434,0.955267
Tweet,4.0,0.000388,0.000198,0.009434,0.965003
SusUser,1.5,0.008627,0.004669,0.056604,0.990631
SusUser,3.0,0.009852,0.005964,0.028302,0.995903
SusUser,4.0,0.005305,0.00369,0.009434,0.997452
SusDomain,1.5,0.015139,0.00831,0.084906,0.992044
SusDomain,3.0,0.014634,0.008403,0.056604,0.99451
SusDomain,4.0,0.014144,0.008319,0.04717,0.995264


In [29]:
df_pf[df_pf["Name"]!="Joe Biden"].dropna(subset="Name")

Unnamed: 0,CheckURL,Poster,Setting,Statement,CheckTime,Tags,Rate,Sources,iteration,Date,Name
1,https://www.politifact.com/factchecks/2021/nov...,Steve Kirsch,"stated on November 9, 2021 in an article:",“Gavin Newsom is out of sight likely because h...,"November 10, 2021",Facebook Fact-checks;California;Coronavirus;St...,False,"(ABC7 News, ""Where was Gov. Gavin Newsom? Here...",0,20211110,Gavin Newsom
5,https://www.politifact.com/factchecks/2021/nov...,Bloggers,"stated on November 7, 2021 in a blog post:",The military arrested “bedridden Gavin Newsom.”,"November 11, 2021",Fake news;Facebook Fact-checks;California;Coro...,Pants on Fire!,"(The New York Times, Where’s Gov. Gavin Newsom...",0,20211111,Gavin Newsom
10,https://www.politifact.com/factchecks/2021/nov...,Cindy Axne,"stated on November 5, 2021 in a tweet:",“Iowa ranks 45th in the U.S. for internet conn...,"November 12, 2021",Infrastructure;Technology;Iowa;Cindy Axne,True,(Email exchange and phone interview between Po...,0,20211112,Cindy Axne
16,https://www.politifact.com/factchecks/2021/nov...,Dan Patrick,"stated on October 18, 2021 in a TV interview:",Over a million people apprehended at the borde...,"November 15, 2021",Immigration;Texas;Dan Patrick,Half-True,"(Dan Patrick, Fox News America’s Newsroom inte...",0,20211115,Dan Patrick
52,https://www.politifact.com/factchecks/2021/nov...,Dave Loebsack,"stated on November 6, 2021 in a tweet:",Ashley Hinson’s and Mariannette Miller-Meeks’ ...,"November 22, 2021",Corrections and Updates;Technology;Iowa;Dave L...,Half-True,(Tweet by Dave Loebsack for Congress Twitter a...,0,20211122,Ashley Hinson
...,...,...,...,...,...,...,...,...,...,...,...
1002,https://www.politifact.com/factchecks/2022/jul...,Marjorie Taylor Greene,"stated on July 5, 2022 in a tweet:",Says an image shows the Highland Park shooting...,"July 5, 2022",Georgia;National;Crime;Facebook Fact-checks;Il...,False,"(Rep. Marjorie Taylor Greene on Twitter, July ...",2,20220705,Marjorie Taylor Greene
1003,https://www.politifact.com/factchecks/2022/jul...,Nikki Fried,"stated on June 28, 2022 in a campaign ad:",“Charlie Crist is pro-life.”,"July 5, 2022",Abortion;Florida;Nikki Fried,Mostly False,"(Nikki Fried, tweet, June 28, 2022)[https://mo...",2,20220705,Nikki Fried
1005,https://www.politifact.com/factchecks/2022/jul...,Viral image,"stated on July 4, 2022 in a Facebook post:",Says Marjorie Taylor Greene said “246 years ag...,"July 5, 2022",Facebook Fact-checks;Viral image,Pants on Fire!,"(Facebook post, July 4, 2022)[https://www.face...",2,20220705,Marjorie Taylor Greene
1010,https://www.politifact.com/factchecks/2022/jul...,Stephen King,"stated on July 6, 2022 in a tweet:",Florida Gov. Ron DeSantis “signs bill requirin...,"July 7, 2022",Corrections and Updates;Education;Florida;Priv...,False,"(Stephen King, Tweet, July 6, 2022 )[https://a...",2,20220707,Ron DeSantis


In [27]:
df_tmp = df_pf[df_pf["Name"]!="Joe Biden"].dropna(subset="Name")
df_tmp = df_tmp[df_tmp["Name"]=="Gavin Newsom"]
df_tmp

Unnamed: 0,CheckURL,Poster,Setting,Statement,CheckTime,Tags,Rate,Sources,iteration,Date,Name
1,https://www.politifact.com/factchecks/2021/nov...,Steve Kirsch,"stated on November 9, 2021 in an article:",“Gavin Newsom is out of sight likely because h...,"November 10, 2021",Facebook Fact-checks;California;Coronavirus;St...,False,"(ABC7 News, ""Where was Gov. Gavin Newsom? Here...",0,20211110,Gavin Newsom
5,https://www.politifact.com/factchecks/2021/nov...,Bloggers,"stated on November 7, 2021 in a blog post:",The military arrested “bedridden Gavin Newsom.”,"November 11, 2021",Fake news;Facebook Fact-checks;California;Coro...,Pants on Fire!,"(The New York Times, Where’s Gov. Gavin Newsom...",0,20211111,Gavin Newsom
259,https://www.politifact.com/factchecks/2022/jan...,Bloggers,"stated on January 3, 2022 in a blog post:",“Gavin Newsom Gets Death Penalty.”,"January 19, 2022",Fake news;Facebook Fact-checks;Bloggers,Pants on Fire!,"(Real Raw News, ""Gavin Newsom Gets Death Penal...",0,20220119,Gavin Newsom
934,https://www.politifact.com/factchecks/2022/jun...,Instagram posts,"stated on June 7, 2022 in an Instagram post:","""Gavin Newsom reportedly intervened at the req...","June 10, 2022",Congress;Facebook Fact-checks;California;Insta...,False,"(Instagram post, June 7, 2022)[https://www.ins...",1,20220610,Gavin Newsom


In [11]:
df_metrics = PeakDetector.get_metrics(df_count, df_pf[df_pf["Name"]!="Joe Biden"])
df_metrics

Unnamed: 0,Unnamed: 1,f1,precision,recall,accuracy
Tweet,1.5,0.000341,0.000171,0.032258,0.920233
Tweet,3.0,0.000306,0.000154,0.016129,0.955564
Tweet,4.0,0.000391,0.000198,0.016129,0.965301
SusUser,1.5,0.002972,0.001558,0.032258,0.990882
SusUser,3.0,0.00708,0.003976,0.032258,0.996188
SusUser,4.0,0.006006,0.00369,0.016129,0.997751
SusDomain,1.5,0.012227,0.006464,0.112903,0.992315
SusDomain,3.0,0.012887,0.007003,0.080645,0.994795
SusDomain,4.0,0.015083,0.008319,0.080645,0.995563


In [25]:
df_counts = df_count[df_count["Name"]=="Gavin Newsom"]
PeakDetector.plot_peak(df_counts, "SusDomainCount")
# Susdomain is better

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_count["Counts"] = df_count[field] / df_count[field].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_count["Date"] = pd.to_datetime(df_count["Date"].astype(str))
