In [1]:
import json
import time
import pandas as pd
from TwitterFactCheck import TwitterFactCheck, DomainChecker
from PeakDetect import PeakDetector
from multiprocessing.dummy import Pool, Process

# 1. Initialization

In [2]:
with open("TwitterAPI.json", "r") as f:
    api_config = json.loads(f.read())["backup"]
    api = TwitterFactCheck(api_config)
df_cand = pd.read_csv("Data/Candidates/Candidates.csv", sep="\t").dropna(subset=["Position"])
df_sus_users = pd.read_csv("Data/Network/NetworkUsers.csv", sep="\t")
df_count = pd.read_csv("Data/Candidates/CandTweetsCount.csv", sep="\t").drop_duplicates(["Name", "Date"]).reset_index(drop=True)
checker = DomainChecker("Data/WebsiteCredibility.csv", "Data/UnrelatedWebsites.json")

# 2. Fetch new tweets

In [3]:
fetch_status = dict()
start_time, end_time = "20220720", "20220722"
thread_fetch = Process(target=api.search_cand_tweets, kwargs={"status": fetch_status, "start_time": start_time, "end_time": end_time, "df_cand": df_cand})
thread_fetch.start()

In [15]:
df_tweets.set_index(["id"]).to_csv("Data/Candidates/NewTweets.csv", sep="\t", index_label="id")

In [6]:
thread_fetch.is_alive()

True

In [15]:
df_tweets = pd.concat(fetch_status["res"]).drop(["withheld"], axis=1)
df_tweets.to_csv("Data/Candidates/NewTweets.csv", sep="\t")

# 3. Clean and save the tweets 

In [16]:
%%time
df_tweets = pd.read_csv("Data/Candidates/NewTweets.csv", sep="\t")
df_tweets = api.clean_tweets(df_tweets,df_cand, checker)

  series = series.str.replace(r"(@[\w|\d]+|\#[\w|\d]+|https\S+)", " ")
  series = series.str.replace(s, "")
  return series.str.replace(r"\s+", " ")


CPU times: user 22min 3s, sys: 950 ms, total: 22min 4s
Wall time: 22min 4s


In [17]:
# Save the cleaned tweets
df_tweets.sort_values(["Date", "Name", "Id"]).to_csv("Data/Candidates/NewTweets.csv", sep="\t", index_label="Id")

In [18]:
# Update the collected tweets from suspicious users
df_sus_user_tweets = pd.read_csv("Data/Candidates/SusUserTweets.csv", sep="\t")
df_sus_user_tweets = pd.concat([df_sus_user_tweets, df_tweets[df_tweets["Author_id"].isin(df_sus_users["User_id"])]])
df_sus_user_tweets = df_sus_user_tweets.drop_duplicates(["Id"]).sort_values(["Date", "Name", "Id"]).drop(["Credibility"], axis=1)
df_sus_user_tweets.set_index(["Id"]).to_csv("Data/Candidates/SusUserTweets.csv", sep="\t", index_label="Id")

In [19]:
# Update the collected tweets from suspicious domain
df_sus_domain_tweets = pd.read_csv("Data/Candidates/SusDomainTweets.csv", sep="\t")
df_sus_domain_tweets = pd.concat([df_sus_domain_tweets, df_tweets[df_tweets["Credibility"]==0]])
df_sus_domain_tweets = df_sus_domain_tweets.drop_duplicates(["Id"]).sort_values(["Date", "Name", "Id"]).drop(["Credibility"], axis=1)
df_sus_domain_tweets.set_index(["Id"]).to_csv("Data/Candidates/SusDomainTweets.csv", sep="\t", index_label="Id")

# 4. Find the peaks with the newest data

In [20]:
%%time
df_tweets = pd.read_csv("Data/Candidates/NewTweets.csv", sep="\t")

CPU times: user 2.9 s, sys: 360 ms, total: 3.26 s
Wall time: 3.84 s


In [21]:
%%time
detector = PeakDetector(df_cand, df_sus_users, df_tweets, df_count)
df_count = detector()

CPU times: user 15.9 s, sys: 20 ms, total: 15.9 s
Wall time: 15.9 s


In [23]:
df_count.set_index(["Name"]).to_csv("Data/Candidates/CandTweetsCount.csv", sep="\t",index_label="Name")

# 5. Analyze the relation with peaks and politicfacts

In [11]:
df_count = pd.read_csv("Data/Candidates/CandTweetsCount.csv", sep="\t")
df_pf = pd.read_csv("Data/PoliticFact.csv", sep="\t")

In [57]:
df_counts = df_count[df_count["Name"]=="Greg Abbott"]
plot_peak(df_counts, "SusUserCount")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_counts["Counts"] = df_counts[field] / df_counts[field].max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_counts["Date"] = pd.to_datetime(df_counts["Date"].astype(str))
