In [20]:
%matplotlib inline

import re
import numpy as np
import statistics as stat
import datetime as dt
import zipfile
import os

#This file reads data from honeypot dataset, process and write the data into bots.txt and humans.txt

num_decimal = 4

In [21]:
#Reading basic info for each user into container
#account id ----
def read_info(filename, container,container2):
    with open(filename, 'r') as f:
        for line in f:
            tokens = re.split("[\t\n]", line)
            
            #Parse strings to numbers, take out the dates
            parsed_tokens = [tokens[0]] + [int(r) for r in tokens[3:8]]
            #Remove length of screen name from data, which is found to have negative effect on classification accuracy
            parsed_tokens = parsed_tokens[:4] + parsed_tokens[6:]
            #Add # followings / # followers
            parsed_tokens.append(round((parsed_tokens[1]/parsed_tokens[2] if parsed_tokens[2] else 0), num_decimal))
            container.append(parsed_tokens)
            container2.append([tokens[0]])

In [22]:
def read_followings(filename, container):
    with open(filename, 'r') as f:
        i = 0
        for line in f:
            tokens = [int(r) for r in re.split("[\D]", line) if r != ""][1:]
            #Calculate standard deviation of the data
            sd = round(stat.pstdev(tokens), num_decimal)
            #Calcalate standard deviation of differences of the data
            sdd = round(stat.pstdev(list(np.array(tokens[1:]) - np.array(tokens[:-1]))), num_decimal)
            #Calculate lag one autocorrelation of the data
            avg = np.mean(tokens)
            numerator = sum((np.array(tokens[1:]) - avg) * (np.array(tokens[:-1]) - avg))
            denominator = sum((np.array(tokens) - avg) ** 2)
            lac = round(numerator/denominator, num_decimal) if denominator != 0 else 0
            container[i] += [sd, sdd, lac]
            i += 1
            
            


In [23]:
#LDA------------------
def read_texts(filename,container):
    curr_userID = ""
    contents = ""
    i = 0
    
    with open(filename, encoding = 'utf-8', mode = 'r') as f:
        while True:
            line = f.readline()
            tokens = [r for r in re.split("[\t\n]", line) if r != ""]
            if not line or tokens[0] != curr_userID and curr_userID:
                #New user found / eof reached
                while curr_userID != container[i][0]:
                    i += 1  
                container[i].append(contents)
                if not line:
                    break
                contents = ""
            curr_userID = tokens[0]
            
            curr_content=tokens[2]
            # print(curr_userID)
            # print(curr_content)
            # print(tokens[1])
            contents+=curr_content
            
           
                
            #Reading tweets posted by each user


In [24]:
#Reading tweets posted by each user
def read_tweets(filename, container):
    curr_userID = ""
    curr_tweet_count = 0
    #Features contained in tweets (current user)
    urls, _at_s, hashtags, weekday_post = [], [], [], []
    #Index for container
    i = 0
    with open(filename, encoding = 'utf-8', mode = 'r') as f:
        while True:
            line = f.readline()
            tokens = [r for r in re.split("[\t\n]", line) if r != ""]
            if not line or tokens[0] != curr_userID and curr_userID:
                #New user found / eof reached
                num_tweets_weekday = [weekday_post.count(i) for i in range(7)]
                ratio_tweets_weekday = [round(x, num_decimal) for x in list(np.array(num_tweets_weekday) / len(weekday_post))]
                curr_user = num_tweets_weekday + ratio_tweets_weekday
                for feature in (urls, _at_s, hashtags):
                    curr_user.append(round(len(feature) / curr_tweet_count, num_decimal))
                    curr_user.append(round(len(set(feature)) / curr_tweet_count, num_decimal))
                while curr_userID != container[i][0]:
                    i += 1
                container[i] += curr_user
                if not line:
                    break
                #Reset current user info containers
                curr_tweet_count = 0
                urls, _at_s, hashtags, weekday_post = [], [], [], []
            #Post date of the tweet
            curr_userID = tokens[0]
            curr_tweet_count += 1
            # content
            curr_tweet_count += 1
            urls += re.findall('http[\S]+', tokens[2])
            _at_s += re.findall('@[\S]+', tokens[2])
            hashtags += re.findall('#[\S]+', tokens[2])
            post_date = re.split("[-\s]", tokens[3])
            post_date = dt.date(int(post_date[0]), int(post_date[1]), int(post_date[2]))
            weekday_post.append(post_date.weekday())



In [25]:
#Deleting ambiguous users who are in both polluters and legitimate users (44 found)
#The user ids are found in ascending order
def del_amb(bots, humans):
    i, j, count = (0, 0, 0)
    while i < len(bots) and j < len(humans):
        if bots[i][0] == humans[j][0]:
            bots.pop(i)
            humans.pop(j)
            count += 1
        elif int(bots[i][0]) < int(humans[j][0]):
            i += 1
        else:
            j += 1
    return count

#Add 0's for missing values (some users have no tweets recorded)
def add0(container):
    length = 29
    for i in range(len(container)):
        container[i] += [0]*(length - len(container[i]))
        
def add00(container):
    length = 2
    for i in range(len(container)):
        container[i] += [0]*(length - len(container[i]))  

In [26]:
#Write data into text files
def write_user(filename, container):
    with open(filename, 'w') as f:
        for inst in container:
            f.write("\t".join([str(x) for x in inst]))
            f.write("\n")


In [27]:
if not os.path.exists('social_honeypot_icwsm_2011'):
    if not os.path.exists('social_honeypot_icwsm_2011.zip'):
        print("downloading data")
        urlretrieve('http://infolab.tamu.edu/static/users/kyumin/social_honeypot_icwsm_2011.zip', 'social_honeypot_icwsm_2011.zip')
    zip = zipfile.ZipFile('social_honeypot_icwsm_2011.zip')
    zip.extractall(path = 'social_honeypot_icwsm_2011')
    zip.close()
    print("data ready")

In [28]:
print("executing")
bots, humans,botst,humanst = [], [],[],[]
read_info('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/content_polluters.txt', bots, botst)
print("data read from social_honeypot_icwsm_2011/content_polluters.txt")
read_info('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/legitimate_users.txt', humans,humanst)
print("data read from social_honeypot_icwsm_2011/legitimate_users.txt")

read_texts('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/content_polluters_tweets.txt', botst)
print("data read from social_honeypot_icwsm_2011/content_polluters_tweets.txt")
read_texts('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/legitimate_users_tweets.txt', humanst)
print("data read from social_honeypot_icwsm_2011/legitimate_users_tweets.txt")
'''
read_followings('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/content_polluters_followings.txt', bots)
print("data read from social_honeypot_icwsm_2011\content_polluters_followings.txt")
read_followings('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/legitimate_users_followings.txt', humans)
print("data read from social_honeypot_icwsm_2011\legitimate_users_followings.txt")

read_tweets('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/legitimate_users_tweets.txt', humans)
print("data read from social_honeypot_icwsm_2011/legitimate_users_tweets.txt")
read_tweets('social_honeypot_icwsm_2011/social_honeypot_icwsm_2011/content_polluters_tweets.txt', bots)
print("data read from social_honeypot_icwsm_2011/content_polluters_tweets.txt")
'''
count = del_amb(bots, humans)
print("%d mislabeled users deleted!" % count)
add0(bots)
add0(humans)
add00(botst)
add00(humanst)


    


print("added 0's for missing values")

write_user('botst.txt',botst)
print("data written to botst.txt")
write_user('humanst.txt', humanst)
print("data written to humanst.txt")

write_user('bots.txt', bots)
print("data written to bots.txt")
write_user('humans.txt', humans)
print("data written to humans.txt")

executing
data read from social_honeypot_icwsm_2011/content_polluters.txt
data read from social_honeypot_icwsm_2011/legitimate_users.txt
data read from social_honeypot_icwsm_2011/content_polluters_tweets.txt
data read from social_honeypot_icwsm_2011/legitimate_users_tweets.txt
44 mislabeled users deleted!
added 0's for missing values
data written to botst.txt
data written to humanst.txt
data written to bots.txt
data written to humans.txt


In [3]:
botdic = {}
print("executed")
with open("botst.txt", 'r') as f:
    s = f.readline()
    while s:
        p=s.split("	")
        botdic[p[0]]=p[1]
        s = f.readline()


executed


In [16]:
humandic = {}
print("executed")
with open("humanst.txt", 'r') as f:
    s = f.readline()
    while s:
        p=s.split("	")
        humandic[p[0]]=p[1]
        s = f.readline()

executed


In [66]:
# load data
# import sys
# !{sys.executable} -m pip install nltk


import nltk
# nltk.download('all')
print("executed")
for key in botdic:
    text = botdic[key]
    # print(botdic[key])
    # split into words
    # print("before")
    # print(text)
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    from nltk.stem.porter import PorterStemmer
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    
    botdic[key]=stemmed
# print('afterprocessing:')   



executed


TypeError: expected string or bytes-like object

In [65]:
import nltk
# nltk.download('all')
print("executed")
for key in humandic:
    text = humandic[key]
    # print(botdic[key])
    # split into words
    # print("before")
    # print(text)
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(text)
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    stop_words = set(stopwords.words('english'))
    words = [w for w in words if not w in stop_words]
    
    from nltk.stem.porter import PorterStemmer
    porter = PorterStemmer()
    stemmed = [porter.stem(word) for word in tokens]
    
    humandic[key]=stemmed
# print('afterprocessing:')   



executed


TypeError: expected string or bytes-like object

In [79]:
#lda distribution
import gensim
from gensim.test.utils import common_texts
from gensim.corpora.dictionary import Dictionary
# Create a corpus from a list of texts
common_dictionary = Dictionary(common_texts)
common_corpus = [common_dictionary.doc2bow(text) for text in common_texts]

# print(botdic["788352"])
# Train the model on the corpus.
lda = gensim.models.ldamodel.LdaModel(common_corpus, num_topics=15, alpha=0.3,eta=0.01)  # learn asymmetric alpha from data
bot_corpus={}
for text in botdic:
    dist = common_dictionary.doc2bow(botdic[text])
    bot_corpus[text]=lda[dist]
#bot_corpus = [common_dictionary.doc2bow(botdic[text]) for text in botdic]
#human_corpus = [common_dictionary.doc2bow(humandic[text]) for text in humandic]
human_corpus={}
for text in humandic:
    dist = common_dictionary.doc2bow(humandic[text])
    human_corpus[text]=lda[dist]
# print(lda[other_corpus])

print(bot_corpus["788352"])


[(0, 0.013953499), (1, 0.013953499), (2, 0.013953499), (3, 0.013953499), (4, 0.014064783), (5, 0.013953499), (6, 0.013953499), (7, 0.013953499), (8, 0.013953499), (9, 0.80351615), (10, 0.013953499), (11, 0.013953499), (12, 0.013953499), (13, 0.013953499), (14, 0.01497709)]


In [190]:
print(bot_corpus["15742514"])

[(9, 0.8839612)]


In [82]:
print(len(bot_corpus))

22223


In [186]:
#GOSS AND LOSS 
#GOSS 
def goss_val(corpus):
    import math
    goss =[]
    from collections import defaultdict

    goss = defaultdict(dict)


    for k in range(15):
        u=0
        d=0
        for i in corpus:
            for j,p in corpus[i]:
                if j==k:
                    #u+=bot_corpus[i][k][1]
                    u+=p
                    break



        u/=len(corpus)
        for i in corpus:
            for j,p in corpus[i]:
                if j==k:
                    #u+=bot_corpus[i][k][1]
                    d+=(p-u)*(p-u)
                    break
            '''
            if bot_corpus[i][j]:
                for j in range(k):
                    if bot_corpus[i][j][0]==k:
                        d+=(bot_corpus[i][j][1]-u)*(bot_corpus[i][j][1]-u)
                        break
                    j+=1
            '''

        math.sqrt(d)
        for i in corpus:
            '''
            if bot_corpus[i][j]:
                for j in range(k):
                    if bot_corpus[i][j][0]==k:
                        goss[i][k] = (bot_corpus[i][j][1]-u)/d
                        break
                    j+=1
            '''
            for j,p in corpus[i]:
                if j==k:
                    #u+=bot_corpus[i][k][1]
                    goss[i][k] = (p-u)/d
                    break
                else: goss[i][k]=0.00
    print("goss for this corpus is calculated")
    return goss

    #print(goss["788352"])
    




In [187]:
human_goss=goss_val(human_corpus)
bot_goss = goss_val(bot_corpus)
    

goss for this corpus is calculated
goss for this corpus is calculated


In [188]:
#LOSS
def loss_val(corpus):
    import math
    goss =[]
    from collections import defaultdict

    loss = defaultdict(dict)


    for i in corpus:
        u=0
        d=0
        for k in range(15):
            for j,p in corpus[i]:
                if j==k:
                    #u+=bot_corpus[i][k][1]
                    u+=p
                    break
        u/=15
        for k in range(15):
            for j,p in corpus[i]:
                if j==k:
                    #u+=bot_corpus[i][k][1]
                    d+=(p-u)*(p-u)
                    break
            '''
            if bot_corpus[i][j]:
                for j in range(k):
                    if bot_corpus[i][j][0]==k:
                        d+=(bot_corpus[i][j][1]-u)*(bot_corpus[i][j][1]-u)
                        break
                    j+=1
            '''

        math.sqrt(d)
        for k in range(15):
            '''
            if bot_corpus[i][j]:
                for j in range(k):
                    if bot_corpus[i][j][0]==k:
                        goss[i][k] = (bot_corpus[i][j][1]-u)/d
                        break
                    j+=1
            '''
            for j,p in corpus[i]:
                if j==k:
                    #u+=bot_corpus[i][k][1]
                    loss[i][k] = (p-u)/d
                    break
                else: loss[i][k]=0.00
    print("loss for this corpus is calculated")
    return loss




In [189]:
human_loss=loss_val(human_corpus)
bot_loss = loss_val(bot_corpus)



loss for this corpus is calculated
loss for this corpus is calculated


In [191]:
print(bot_goss["788352"][0])
print(bot_loss["15742514"])


-0.0013005006667086874
{0: 0.0, 1: 0.0, 2: 0.0, 3: 0.0, 4: 0.0, 5: 0.0, 6: 0.0, 7: 0.0, 8: 0.0, 9: 1.2120764696041115, 10: 0.0, 11: 0.0, 12: 0.0, 13: 0.0, 14: 0.0}


In [None]:
human_gl = {}
for i in bot_goss:
    human_gl[i] = bot_goss[i]
    human_gl[i].

In [158]:
def dict_list(dict):
    dic={}
    for i in dict:
        dic[i]=[]
        for value in dict[i]:
            dic[i].append(dict[i][value])
    print("converted")
    return dic
    

In [160]:
bot_gos = dict_list(bot_goss)
bot_los = dict_list(bot_loss)
human_gos = dict_list(human_goss)
human_los = dict_list(human_loss)

converted
converted
converted
converted


In [192]:
print(bot_gos["15742514"])

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0.0006215511769755345, 0, 0, 0, 0, 0]


In [289]:
# Build dataset
#loss
#let human = 1, bot = 0
loss_set = []
loss_target = []

for i in human_los:
    #for k in human_loss[i]:
    loss_set.append(human_los[i])
    loss_target.append(1)

for j in bot_los:
    # for m in bot_loss[j]:
    loss_set.append(bot_los[j])
    loss_target.append(0.00)
        
goss_set = []
goss_target = []
for i in human_gos:
    # for k in human_goss[i]:
    goss_set.append(human_gos[i])
    goss_target.append(1)    
for j in bot_gos:
    # for m in bot_goss[j]:
    goss_set.append(bot_gos[j])
    goss_target.append(0.00)
 #loss_goss_set = defaultdict(dict)




In [290]:
print(goss_set[1])
print(goss_target[1])

[0.0007240364726450916, -4.58463984862702e-05, 0.0012004149478232339, 0.001200414954446007, -0.0001052071895827132, 0.0012004149544390505, 0.001197882486843453, 0.0009515685166071788, 0.001200414959833163, -7.09759969684995e-05, 0.0012004148215825155, 0.0012004148888977004, 0.0012004149425423052, 0.001200414721438811, 9.23785369020726e-05]
1


In [291]:
print(len(goss_set))
print(len(goss_target))
# print(loss_set[25426])


63695
63695


In [348]:
#Classifier: SVM 

    import numpy as np
    from sklearn.model_selection import train_test_split
    from sklearn import svm
    # human: X; bot: Y
    los_set = np.array(loss_set)
    np.nan_to_num(los_set)
    X_train, X_test, y_train, y_test = train_test_split(
    los_set,loss_target, test_size=0.4, random_state=0)
    
    for i in range(len(X_train)):
        while len(X_train[i])!= 15:
            X_train[i].append(0)
            print(X_train[i])
    print("0 filled")
        




    for i in range(len(X_train)):
        for j in range(len(X_train[i])):
            if np.isnan(X_train[i][j]) or not np.isfinite(X_train[i][j]):
                X_train[i][j]=0
                

    for i in range(len(X_test)):
        for j in range(len(X_test[i])):
            if np.isnan(X_test[i][j]) or not np.isfinite(X_test[i][j]):
                X_test[i][j]=0

        '''
        .append(0.00)
            print("added")
    for i in range(len(X_test)):
        while len(X_test[i])!=15:
            X_test[i].append(0.00)
            # print("added")
    '''

    print(X_train.shape)
    #SVM
    
    print(np.any(np.isnan(X_train)))
    print(np.all(np.isfinite(X_train)))

    print(np.any(np.isnan(y_train)))
    print(np.all(np.isfinite(y_train)))
    print(np.any(np.isnan(X_test)))
    print(np.all(np.isfinite(X_test)))
    print(np.any(np.isnan(y_test)))
    print(np.all(np.isfinite(y_test)))
    # Train it on the entire training data set
    

0 filled
(38208, 15)
False
True
False
True
False
True
False
True


In [349]:
classifier.fit(X_train, y_train)

    # Get predictions on the test set
y_pred = classifier.predict(X_test)
classifier = svm.SVC(kernel="linear")



    #  LOSS
from sklearn.metrics import f1_score,precision_score, recall_score
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))



0.7108059813787266
0.7996402782553496
0.9138504322592347


In [293]:
svmclassifier(loss_set,loss_target)
# svmclassifier(goss_set,goss_target)

0 filled
(38208, 15)
False
True
False
True
False
True
False
True




0.7108059813787266
0.7996402782553496
0.9138504322592347


In [315]:
from sklearn.neural_network import MLPClassifier

clf = MLPClassifier(solver='lbfgs', alpha=1e-5,
                     hidden_layer_sizes=(20, 5), random_state=1)

clf.fit(X_train, y_train)                         

MLPClassifier(activation='relu', alpha=1e-05, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 5), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=1, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [316]:
y_pred= clf.predict(X_test)
print('----mlp-----')
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
#0.7176001537131329 0.7997537407318183 0.9031497491082764
#0.7178894665896399 0.7992924717926727 0.9015174415089777

0.7151570019535903
0.7998827542101897
0.9073816576990509


In [319]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
clfrf=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
clfrf.fit(X_train, y_train) 
y_pred= clfrf.predict(X_test)
print('----randomforest-----')
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))
print('----adaboost------')
clfad=RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1)
clfad.fit(X_train, y_train) 
y_pred= clfad.predict(X_test)
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))

----randomforest-----
0.7241638293611056
0.7922109933448361
0.8743727706910103
----adaboost------
0.7247665408305186
0.7957352820876393
0.8821111178284263


In [321]:
from sklearn.svm import SVC
clfr = SVC(gamma=2, C=1)
clfr.fit(X_train, y_train)
y_pred= clfr.predict(X_test)
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))



0.7150835322195704
0.7991784694993465
0.905688894262741


In [322]:
from sklearn.neighbors import KNeighborsClassifier
clfk=KNeighborsClassifier(2)
clfk.fit(X_train, y_train)
y_pred= clfk.predict(X_test)
print(precision_score(y_test, y_pred))
print(f1_score(y_test, y_pred))
print(recall_score(y_test, y_pred))


0.7346200912226054
0.7652017729366436
0.7984402394051145


In [339]:
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
    # human: X; bot: Y
gos_set = np.array(goss_set)
np.nan_to_num(gos_set)
X_train, X_test, y_train, y_test = train_test_split(
gos_set,goss_target, test_size=0.4, random_state=0)
    
for i in range(len(X_train)):
     while len(X_train[i]) < 15:
        X_train[i].append(0.00)
        print(X_train[i])
        print("0 filled")
    

In [343]:
for i in range(len(X_train)):
    for j in range(15):
        try:
            if np.isnan(X_train[i][j]) or not np.isfinite(X_train[i][j]):
                X_train[i][j]=0
        except IndexError:
            X_train[i].append(0.00)
            
                

for i in range(len(X_test)):
    for j in range(15):
        try:
            if np.isnan(X_test[i][j]) or not np.isfinite(X_test[i][j]):
                X_test[i][j]=0
        except IndexError:
            X_test[i].append(0.00)

In [344]:
    print(X_train.shape)
    #SVM
    classifier = svm.SVC()
    '''
    print(np.any(np.isnan(X_train)))
    print(np.all(np.isfinite(X_train)))

    print(np.any(np.isnan(y_train)))
    print(np.all(np.isfinite(y_train)))
    print(np.any(np.isnan(X_test)))
    print(np.all(np.isfinite(X_test)))
    print(np.any(np.isnan(y_test)))
    print(np.all(np.isfinite(y_test)))
    # Train it on the entire training data set
    '''
    classifier.fit(X_train, y_train)

    # Get predictions on the test set
    y_pred = classifier.predict(X_test)

(38217,)


ValueError: setting an array element with a sequence.