In [1]:
"""
Data analysis of porn bots that appeared on 
Swedish Twitter in mid January.
Written by https://github.com/christopherkullenberg/
Free to use without any permission, but linking back is nice.
The data loaded is not included in this repository because
it may contain personal data. 
"""

import pandas as pd
import matplotlib.pyplot as plt
import networkx as nx
import re
import sqlite3
from collections import Counter

def messagesperday(dataframe):
    messagesperday = dataframe.resample('D').count()
    fig = plt.figure(figsize=(100,50))
    ax = fig.add_subplot(111)
    plt.title('Meddelanden per dag', size=16)
    messagesperday['id'].plot()
    plt.xlabel('Datum')
    plt.ylabel('Meddelanden')
    #novelfilename = str(time.time())
    #plt.savefig("../results/" + str(novelfilename) + "perday.png")
    #return(str(novelfilename))
    plt.show()

def messagesperhour(dataframe):
    hours = dataframe.index.hour
    hours.mean()
    fig = plt.figure(figsize=(30,15))
    ax = fig.add_subplot(111)
    plt.title('Meddelanden per timme', size=16)
    plt.plot(hours, 'ro') #Plot each individual message per day
    plt.xlabel('Meddelande')
    plt.ylabel('Klockslag')
    #novelfilename = str(time.time())
    plt.show()
    #plt.savefig("../results/" + str(novelfilename) + "perhour.png")
    #return(str(novelfilename))

def makementionsnetwork(dataframe):
    G = nx.DiGraph()
    for tweet in dataframe.iterrows():
        match = re.findall("(?<=@).*?(?=[\s|\:])", tweet[1][4], re.IGNORECASE)
        if match:
            for m in match:
                #print("Source: " + tweet[1][2])
                #print("Target: " + m)
                G.add_edge(tweet[1][2], m)
    nx.write_gexf(G, "porrbotmentionsnetwork.gexf")
    
def hashtaganalyser(dataframe):
    hashtaglist = []
    for tweet in dataframe.iterrows():
        match = re.findall("(?<=#).*?(?=[\s|\:])", tweet[1][4], re.IGNORECASE)
        if match:
            for m in match:
                hashtaglist.append("#" + m)
    return(hashtaglist)

def urlanalyser(dataframe):
    urllist = []
    for tweet in dataframe.iterrows():
        match = re.findall("(?<=http).*?(?=[\s])", tweet[1][4], re.IGNORECASE)
        if match:
            for m in match:
                urllist.append(m)
    return(urllist)
    
        
def search(df, regexp):
    results = []
    for d in df.iterrows():
        searchexp = re.findall(regexp, d[1][5], re.IGNORECASE)
        if searchexp:
            for s in searchexp:
                results.append(("https://twitter.com/user/status/" + str(d[1][1]), d[1][5]))
    return results


In [4]:
conn = sqlite3.connect("porrtweets.sqlite3")
sql = "SELECT * FROM users;"
df = pd.read_sql_query(sql, conn)
pd.created_at = pd.to_datetime(df.timestamp, format='%Y-%m-%d %H:%M:%S')
df.index = pd.created_at
print("Number of tweets in dataframe: " + str(len(df)))

Number of tweets in dataframe: 256397


In [5]:
df.head()

Unnamed: 0_level_0,id,tweetid,user,timestamp,text
timestamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2018-01-07 08:43:06,1,949924340186931200,VeritasPharma,2018-01-07 08:43:06,@personperson923 what do you think is going to...
2018-01-13 03:12:04,2,952015363205681152,SmthnglikeKites,2018-01-13 03:12:04,@FoxSeeOne @Ness_Divad18 @personperson923 @Sub...
2018-01-13 05:56:23,3,952056711963140097,itstimwhite,2018-01-13 05:56:23,@personperson923 thanksss for following \nwoul...
2014-05-06 20:08:11,4,463772234331353088,jkell150,2014-05-06 20:08:11,Pretty dam good week so far and gonna be a goo...
2014-06-09 13:26:44,5,475992390298525698,jkell150,2014-06-09 13:26:44,Pretty damn good weekend with @TrishaKellogg @...


In [16]:
# messagesperday(df)

In [17]:
# messagesperhour(df)

In [8]:
makementionsnetwork(df)

In [9]:
len(hashtaganalyser(df))

66785

In [10]:
hashfreq = Counter(hashtaganalyser(df)).most_common(30)

In [11]:
for f in hashfreq:
    print(f)

('#FF', 498)
('#', 253)
('#androidgames,', 171)
('#NowPlaying', 149)
('#fashion', 143)
('#1', 129)
('#iHeartAwards', 127)
('#travel', 127)
('#love', 123)
('#ff', 119)
('#android,', 116)
('##golfing', 116)
('#BestFanArmy', 115)
('#tl', 113)
('#factsdaily', 105)
('#TuitUtilhttp', 103)
('#np', 102)
('#WhatShouldPlayNext', 100)
('#Periscope', 94)
('#VELVETloves', 92)
('#Giveaway', 88)
('#Android', 88)
('#iphonegames,', 88)
('#SupplyCrate', 82)
('#CSRClassics', 80)
('#フルボッコ', 80)
('#factsoftheday', 80)
('#garrisonpark', 78)
('#SoundCloud', 76)
('#sumobun', 76)


In [12]:
urlfreq = Counter(urlanalyser(df)).most_common(30)

In [15]:
for url in urlfreq:
    #print("http" + url[0])
    print("http" + url[0], url[1])

http://fllwrs.com 572
http://bnent.jp/optc-den2e/ 380
http://uapp.ly 293
https://t.lnk.to/spTw 279
http://gigam.es/Castlez_tw 200
http://bit.ly/yk3b9m 154
http://www.tuitutil.net 126
http://bit.ly/xHSyh0 113
http://slfy.co/Jih9 103
http://twcm.me/HXQ38 101
http://bit.ly/1mH9KJj 100
http://bit.ly/whBNNw 98
http://bit.ly/zzEL3G 98
http://ow.ly/7wAx30h5FET 97
http://bnent.jp/optc-kr/ 97
http://www.crowdfireapp.com/?r=tw 96
http://goo.gl/fP9AGJ 96
http://gigam.es/imtw_Tribez 93
http://youtu.be/Q57ZGIJRF4o 92
https://twitch.tv/ 87
http://unfollowspy.com 84
http://bit.ly/NtcNTs 76
http://bit.ly/AzhyKP 76
http://nmgam.es/cct_gp 73
http://goo.gl/ePvvZt 73
http://goo.gl/LLAA8j 73
http://rekkacopy.com 70
http://apple.co/1FQ0RtQ 70
http://amzn.to/1Cg48yF 70
http://www45.atwiki.jp/dendlightning/ 69


In [14]:
len(urlanalyser(df))

63025