In [22]:
import pandas as pd
import numpy as np

# Column layout shared by both tab-separated honeypot files. Because `names=` is
# supplied without `header=`, pandas treats the first line of each file as data.
# NOTE(review): 'NumerOfFollowings' looks like a typo of 'NumberOfFollowings'; it is
# left unchanged because it is a runtime column label other cells may rely on.
_honeypotColumns = ['UserID', 'CreatedAt', 'CollectedAt', 'NumerOfFollowings', 'NumberOfFollowers', 'NumberOfTweets', 'LengthOfScreenName', 'LengthOfDescriptionInUserProfile']
_honeypotDateColumns = ['CreatedAt', 'CollectedAt']

# Content polluters, indexed by UserID.
polluters = pd.read_csv(
    'honeypot_data/content_polluters.txt',
    sep='\t',
    names=_honeypotColumns,
    parse_dates=_honeypotDateColumns,
).set_index('UserID')

# Legitimate users, same layout, indexed by UserID.
legit = pd.read_csv(
    'honeypot_data/legitimate_users.txt',
    sep='\t',
    names=_honeypotColumns,
    parse_dates=_honeypotDateColumns,
).set_index('UserID')



# User Account Information
## Longevity, Length of Name and Description

This section builds the user account features. The function below takes the two dataframes loaded from the data and extracts the user information used in this section. It first goes through the first dataframe (polluters), adding each account's features to a dictionary keyed by the dataframe index, and then does the same for the second dataframe (legitimate users). For both, the features are the longevity of the user's account, the length of the username, and the length of the account's bio/description. 

The data placed in the dictionaries is then converted back into dataframes, where the columns are renamed and the index of the original dataframe is used as the index. 

Could I instead have just dropped the unused columns from the original dataframes and then added a longevity column? Would that have been quicker?

In [31]:
#longevity of the account, maybe in days then look at the number of tweets over the number of active days

#Length of Screen Name, Length of bio

# Column names of the per-class feature dataframes, in feature order.
_userAccColumns = ["Longevity of Account (Days)", "Username Length", "Bio Length"]


def _userAccFeatures(df):
    """Return (rows, frame) of account features for a single dataframe.

    Columns are read by position with `.iloc`: column 0 is the creation
    timestamp, column 1 the collection timestamp, and the last two columns
    are the screen-name length and the description length.
    """
    created = pd.to_datetime(df.iloc[:, 0])
    collected = pd.to_datetime(df.iloc[:, 1])
    longevity = (collected - created).dt.days  # whole days between the two timestamps

    # A missing timestamp would otherwise surface as NaN; fail early and clearly.
    if longevity.isna().any():
        raise ValueError("cannot compute account longevity: missing created/collected timestamp")

    # One [longevity, username length, bio length] list per input row, in input order.
    # Converting column by column keeps each value's own Python type.
    rows = [
        list(values)
        for values in zip(
            longevity.tolist(),
            df.iloc[:, -2].tolist(),
            df.iloc[:, -1].tolist(),
        )
    ]

    # Keyed by index value: if an index value repeats, the last row wins here,
    # while `rows` still contains every input row.
    byUser = dict(zip(df.index.tolist(), rows))
    frame = pd.DataFrame.from_dict(byUser, orient='index', columns=_userAccColumns)
    return rows, frame


def setUserAccData(df1, df2):
    """Build the user-account feature set from the two account dataframes.

    For every row the features are: account longevity in whole days
    (collection timestamp minus creation timestamp), username length and
    bio/description length. Columns are accessed by position, so both frames
    must have the creation timestamp in column 0, the collection timestamp in
    column 1, and the two length columns last.

    Parameters
    ----------
    df1 : pandas.DataFrame
        Content polluters; every row is labelled 0.
    df2 : pandas.DataFrame
        Legitimate users; every row is labelled 1.

    Returns
    -------
    data : list of list
        Feature rows of df1 followed by those of df2.
    classify : list of int
        Label for each entry of `data` (0 for df1 rows, 1 for df2 rows).
    userDFPol, userDFLeg : pandas.DataFrame
        Feature dataframes for df1 and df2, indexed by the original index
        values, with columns "Longevity of Account (Days)",
        "Username Length" and "Bio Length". An empty input yields an empty
        dataframe that still has these columns.

    Raises
    ------
    ValueError
        If a creation or collection timestamp is missing.
    """
    polRows, userDFPol = _userAccFeatures(df1)
    legRows, userDFLeg = _userAccFeatures(df2)

    data = polRows + legRows
    classify = [0] * len(polRows) + [1] * len(legRows)

    return data, classify, userDFPol, userDFLeg
        

In [32]:
# Build the combined feature rows and labels, plus one feature dataframe per class
# (polluters and legitimate users).
userAccData, userClassify, dfPol, dfLeg = setUserAccData(polluters, legit)

In [34]:
# Preview only the first rows rather than rendering the whole dataframe.
dfPol.head(10)

Unnamed: 0,Longevity of Account (Days),Username Length,Bio Length
6301,1217,8,132
10836,1329,9,134
10997,1272,12,158
633293,1105,11,121
717883,1105,6,70
763068,1261,6,35
783705,1186,8,68
788352,1042,14,106
810011,1060,13,59
815299,1137,10,120


In [37]:
# TODO: remove the accounts that have no tweet data from these dataframes,
# since they will be used to compare against other feature sets.

# Write the per-class account features to CSV; the dataframe index is written
# as the first column.
# NOTE(review): assumes the created_data/ directory already exists — confirm.
dfPol.to_csv('created_data/userAccInfoPol.csv', encoding='utf-8')
dfLeg.to_csv('created_data/userAccInfoLeg.csv', encoding='utf-8')