In [49]:
import TwitterFactCheck
import TwitterAPI
import importlib
import pandas as pd
import re
import os
import numpy as np
import json
import spacy
import logging
from collections import Counter
import networkx as nx
from networkx.algorithms import community
import altair as alt
from vega_datasets import data
import matplotlib.pyplot as plt
# Re-import the project modules so edits made to them on disk are picked up
# without restarting the kernel. TwitterAPI is reloaded before TwitterFactCheck.
for _module in (TwitterAPI, TwitterFactCheck):
    importlib.reload(_module)

# Emit INFO-level (and above) log records from the root logger.
logging.basicConfig(level=logging.INFO)

# 1. Load and filter the data

In [50]:
def find_loc(location, geo_info=None):
    """Resolve a free-text location string to an entry of the geo lookup tables.

    The tables are searched in order: ``"Cities"``, then ``"Counties"``, then
    ``"States"``; the first match wins.

    * City and county names are matched as case-insensitive substrings of
      ``location``; the value stored under the matching name is returned.
    * For ``"States"``, a key is matched as a case-sensitive substring
      (presumably keys are abbreviations such as "CA" -- confirm against the
      data file) and a value is matched as a case-insensitive substring; the
      value is returned.

    Parameters
    ----------
    location : str
        Free-text location to classify.
    geo_info : dict, optional
        Mapping with ``"Cities"``, ``"Counties"`` and ``"States"`` sub-mappings.
        When omitted, the notebook-level ``geo`` dictionary is used, so existing
        single-argument calls (e.g. ``Series.apply(find_loc)``) keep working.

    Returns
    -------
    The value stored for the first match, or ``np.nan`` when nothing matches
    or ``location`` is not a string.
    """
    if geo_info is None:
        # Looked up at call time: `geo` only needs to exist before the first call.
        geo_info = geo
    if not isinstance(location, str):
        return np.nan

    lowered = location.lower()  # computed once instead of once per candidate

    for city, mapped in geo_info["Cities"].items():
        if city.lower() in lowered:
            return mapped
    for county, mapped in geo_info["Counties"].items():
        if county.lower() in lowered:
            return mapped
    for key, val in geo_info["States"].items():
        # The key comparison is deliberately case-sensitive (unchanged behaviour).
        if key in location or val.lower() in lowered:
            return val
    # `np.NaN` was removed in NumPy 2.0; `np.nan` works on every version.
    return np.nan

In [51]:
# Users table: tab-separated, indexed by user_id.
df_users = pd.read_csv("Data/SusUsers.csv", sep="\t", index_col="user_id")

# Geographic lookup tables used by find_loc and for the FIPS ids below.
with open("Data/GeoInfo.json", "r") as f:
    geo = json.load(f)

# Classify every profile location (cast to str first, so missing values are
# passed to find_loc as text rather than as floats).
df_users["state"] = df_users["location"].astype(str).apply(find_loc)

# Per-state counts of users with a resolved state; after count() every
# remaining column holds the number of non-null entries for that state.
df_loc = (
    df_users.dropna(subset="state")
    .groupby(["state"])
    .count()
    .reset_index()
)

# Translate each state into its integer id via the "FIPS" table.
df_loc["id"] = df_loc["state"].apply(geo["FIPS"].get).astype(int)

In [52]:
# US state outlines from the vega_datasets TopoJSON file.
states = alt.topo_feature(data.us_10m.url, 'states')

# Join df_loc onto the shapes by 'id' and colour each state by df_loc's
# "username" column (a per-state count when df_loc comes from groupby().count()).
user_lookup = alt.LookupData(df_loc, 'id', ["username", "id"])
(
    alt.Chart(states)
    .mark_geoshape()
    .transform_lookup(lookup='id', from_=user_lookup)
    .encode(color="username:Q")
    .project(type='albersUsa')
)

# 3. Build the network and detect the communities

In [14]:
%%time
authors = set()
edges = list()
df_conn = pd.read_csv("Data/Network/NetworkTweets.csv", sep="\t", index_col="id")
df_conn["author1"] = df_conn["u1"].apply(lambda x: df_users.loc[x].username)
df_conn["author2"] = df_conn["u2"].apply(lambda x: df_users.loc[x].username)
for i, row in df_conn.iterrows():
    a1, a2 = row.author1, row.author2
    authors = authors | {a1, a2}
    edges.append([a1, a2])
author_dict = {author: i for i, author in enumerate(sorted(authors))}
edges = [(author_dict[a1], author_dict[a2]) for a1, a2 in edges]
edges = dict(Counter(edges))

CPU times: user 13.4 s, sys: 250 ms, total: 13.7 s
Wall time: 13.8 s


In [24]:
# Detect communities on the unweighted graph: every distinct author pair
# becomes a single edge, regardless of how often it occurs.
G = nx.Graph()
G.add_edges_from(edges)  # iterating the dict yields its (node, node) keys
comm_uw = community.greedy_modularity_communities(G)

# Optional inline preview (slow on large graphs):
#   plt.figure(figsize=(12,12)); nx.draw(G, with_labels=True, node_size=500)

# Export for external graph tools.
nx.write_gexf(G, "Graph/unweighted.gexf")

In [25]:
# Detect communities on the weighted graph: each edge carries the number of
# connections between its two authors in the "weight" attribute.
G = nx.Graph()
for (u, v), weight in edges.items():
    # `edges` is keyed by ordered pairs, so both (u, v) and (v, u) may occur.
    # In an undirected graph they are the same edge, so their counts are summed
    # instead of letting whichever is added last overwrite the other.
    if G.has_edge(u, v):
        G[u][v]["weight"] += weight
    else:
        G.add_edge(u, v, weight=weight)

# The attribute is called "weight". Naming an attribute that no edge has
# (previously "weights") makes networkx fall back to weight 1 for every edge,
# i.e. the result would be the same as for the unweighted graph.
comm = community.greedy_modularity_communities(G, weight="weight")

# Optional inline preview (slow on large graphs):
#   plt.figure(figsize=(15,15)); nx.draw(G, with_labels=True, node_size=500)

# Export for external graph tools.
nx.write_gexf(G, "Graph/weighted.gexf")

# 4. Find the most frequently connected users

In [45]:
# Tally how many connection rows each user id appears in, on either side.
ct = Counter()
for side in ("u1", "u2"):
    ct.update(df_conn[side])

# One row per user id, most frequent first.
appearances = pd.Series(ct).sort_values(ascending=False)
df_hot = pd.DataFrame(appearances, columns=["count"])

# Attach usernames; the assignment aligns on the index (user ids).
df_hot["username"] = df_users["username"]

In [48]:
# Show the ten users with the highest counts.
df_hot.head(10)

Unnamed: 0,count,username
20545835,42430,newsmax
2836421,38556,MSNBC
8953122,11376,PolitiFact
21925564,9022,PogueMoran
980526530,7666,hvnacuba76
1691491086,7552,Craig1454
520817923,6769,DimensioT
70394965,6749,citizentvkenya
632859279,6604,RadioCitizenFM
312149882,6028,Chris_1791
