In [138]:
import pandas as pd
import tweepy
import re
import numpy as np
import os
import PIL
import requests
from sklearn.utils import shuffle
from PIL import Image

#### Scraping user data for mixed datasets

In [2]:
df_genuine = pd.read_csv("users_genuine.csv")
df_rtbust = pd.read_csv("cresci-rtbust-2019.csv", header = None)
df_gilani = pd.read_csv("gilani-2017.csv", header = None)

In [3]:
new_df = df_genuine['id'].to_frame()
new_df['account_type'] = 'human'

df_rtbust.columns = ['id', 'account_type']
df_gilani.columns = ['id', 'account_type']

new_df1 = pd.concat([new_df, df_rtbust])
new_df2 = pd.concat([new_df1, df_gilani])
new_df2 = new_df2.reset_index(drop = True)

In [4]:
new_df2 = new_df2.drop_duplicates(subset=['id'], keep='first')
df_twitter = new_df2.reset_index(drop = True)
df_twitter

Unnamed: 0,id,account_type
0,1.502026e+09,human
1,2.492782e+09,human
2,2.932123e+08,human
3,1.918397e+08,human
4,3.020965e+09,human
...,...,...
6854,6.876519e+07,human
6855,7.824970e+17,human
6856,1.341644e+08,human
6857,7.237812e+07,human


In [5]:
df_twitter['id'] = df_twitter['id'].astype(int).astype(str)
df_twitter['id'] = df_twitter['id'].str.replace('-','')
df_twitter

Unnamed: 0,id,account_type
0,1502026416,human
1,2147483648,human
2,293212315,human
3,191839658,human
4,2147483648,human
...,...,...
6854,68765193,human
6855,2147483648,human
6856,134164413,human
6857,72378123,human


In [6]:
# name, username, created_at, protected, verified, location, description, 
# public_metrics (followers_count, following_count, tweets_count)
# id, tweets, account_type
# CHECK
# favourites_count, geo_enabled, statuses_count
# average_tweets_per_day, account_age_days

def get_twitter_dictionary(id_list, account_type_dict, client):
    twitter_data = {'id':[], 'name':[], 'username':[], 'created_at':[], 
                    'protected':[], 'verified':[], 'url': [], 'profile_image_url':[],
                    'location':[], 'description':[], 'followers_count':[], 'following_count':[],
                    'tweet_count':[], 'listed_count':[], 'account_type':[]}

    user_info = client.get_users(ids = id_list, user_fields=["created_at", "description", "entities", 
                                                             "id", "location", "name", "profile_image_url", 
                                                             "protected", "public_metrics", "url", "username", 
                                                             "verified", "withheld"])
    
    user_dict = dict()
    info_list = user_info[0]
    
    if info_list == None:
        return twitter_data
    
    for each_user in info_list:
        data = each_user.data
        user_dict[data['id']] = data

    for item in user_dict.items():
        each_user_id = item[0]
        each_user_account_type = account_type_dict[each_user_id]
        each_user_info = item[1]
        twitter_data['id'].append(each_user_id)
        twitter_data['account_type'].append(each_user_account_type)
        each_dictionary = user_dict[each_user_id]
        
        if 'name' in each_dictionary:
                twitter_data['name'].append(each_dictionary['name'])
        else:
            twitter_data['name'].append('')

        if 'username' in each_dictionary:
            twitter_data['username'].append(each_dictionary['username'])
        else:
            twitter_data['username'].append('')

        if 'created_at' in each_dictionary:
            twitter_data['created_at'].append(each_dictionary['created_at'])
        else:
            twitter_data['created_at'].append('')

        if 'protected' in each_dictionary:
            twitter_data['protected'].append(each_dictionary['protected'])
        else:
            twitter_data['protected'].append('')

        if 'verified' in each_dictionary:
            twitter_data['verified'].append(each_dictionary['verified'])
        else:
            twitter_data['verified'].append('')
        
        if 'url' in each_dictionary:
            twitter_data['url'].append(each_dictionary['url'])
        else:
            twitter_data['url'].append('')
            
        if 'profile_image_url' in each_dictionary:
            twitter_data['profile_image_url'].append(each_dictionary['profile_image_url'])
        else:
            twitter_data['profile_image_url'].append('')

        if 'location' in each_dictionary:
            twitter_data['location'].append(each_dictionary['location'])
        else:
            twitter_data['location'].append('')

        if 'description' in each_dictionary:
            twitter_data['description'].append(each_dictionary['description'])
        else:
            twitter_data['description'].append('')

        if 'public_metrics' in each_dictionary:
            each_pub_metric = each_dictionary['public_metrics']

            if 'followers_count' in each_pub_metric:
                twitter_data['followers_count'].append(each_pub_metric['followers_count'])
            else:
                twitter_data['followers_count'].append('')

            if 'following_count' in each_pub_metric:
                twitter_data['following_count'].append(each_pub_metric['following_count'])
            else:
                twitter_data['following_count'].append('')

            if 'tweet_count' in each_pub_metric:
                twitter_data['tweet_count'].append(each_pub_metric['tweet_count'])
            else:
                twitter_data['tweet_count'].append('')
                
            if 'listed_count' in each_pub_metric:
                twitter_data['listed_count'].append(each_pub_metric['listed_count'])
            else:
                twitter_data['listed_count'].append('')
                
        else:
            twitter_data['followers_count'].append('')
            twitter_data['following_count'].append('')
            twitter_data['tweet_count'].append('')
            twitter_data['listed_count'].append('')

    return twitter_data

In [7]:
bearer_token1 = 'AAAAAAAAAAAAAAAAAAAAADuChwEAAAAAyd5NyoPPZfk%2FiBwmc2mC9me33RA%3DTFH93ScdBzcU6OHVLLsTDHKLW599NhhPoEBTPi0KFWdAEbmFth'
bearer_token2 = 'AAAAAAAAAAAAAAAAAAAAAAK%2FhwEAAAAAtHE4qeLy3hhb660R143d7IOccaE%3D8qaWBsRfyKMigZj2jBVVFNifngriGAkuwJv74wOOs05m6XNReW'
bearer_token3 = 'AAAAAAAAAAAAAAAAAAAAAOjAhwEAAAAAzL7KuhOWC8yybNEnpbXPfhmCpMA%3Ds3R17rqHTA6rqR2kDhis6mrjFWYth3VgLgUu6w354BlwY2hGH8'

bearer_tokens = [bearer_token2, bearer_token1, bearer_token3]

In [8]:
start_index = 0

for each_round in range(1):
    token = bearer_tokens[each_round%3]
    client = tweepy.Client(bearer_token= token)
    print(token)
    
    twitter_data_list = []
    for i in range(69):
        start = i * 100
        end = start + 100
        df_tweet_subset = df_twitter[start: end]
        acc_type_dict = dict(zip(df_tweet_subset["id"], df_tweet_subset["account_type"]))
        each_id_list = list(df_tweet_subset["id"])
        print(str(start) + " : " + str(end)) 
        twitter_data = get_twitter_dictionary(each_id_list, acc_type_dict, client)
        tweets_df = pd.DataFrame(twitter_data)
        twitter_data_list.append(tweets_df)
    
    start_index += 1
    
    combined_twitter_data = pd.concat(twitter_data_list)
        
    combined_twitter_data.to_csv(f'final_combined.csv')
    
    print("round" + str(each_round))

AAAAAAAAAAAAAAAAAAAAAAK%2FhwEAAAAAtHE4qeLy3hhb660R143d7IOccaE%3D8qaWBsRfyKMigZj2jBVVFNifngriGAkuwJv74wOOs05m6XNReW
0 : 100
100 : 200
200 : 300
300 : 400
400 : 500
500 : 600
600 : 700
700 : 800
800 : 900
900 : 1000
1000 : 1100
1100 : 1200
1200 : 1300
1300 : 1400
1400 : 1500
1500 : 1600
1600 : 1700
1700 : 1800
1800 : 1900
1900 : 2000
2000 : 2100
2100 : 2200
2200 : 2300
2300 : 2400
2400 : 2500
2500 : 2600
2600 : 2700
2700 : 2800
2800 : 2900
2900 : 3000
3000 : 3100
3100 : 3200
3200 : 3300
3300 : 3400
3400 : 3500
3500 : 3600
3600 : 3700
3700 : 3800
3800 : 3900
3900 : 4000
4000 : 4100
4100 : 4200
4200 : 4300
4300 : 4400
4400 : 4500
4500 : 4600
4600 : 4700
4700 : 4800
4800 : 4900
4900 : 5000
5000 : 5100
5100 : 5200
5200 : 5300
5300 : 5400
5400 : 5500
5500 : 5600
5600 : 5700
5700 : 5800
5800 : 5900
5900 : 6000
6000 : 6100
6100 : 6200
6200 : 6300
6300 : 6400
6400 : 6500
6500 : 6600
6600 : 6700
6700 : 6800
6800 : 6900
round0


In [9]:
final_combined = combined_twitter_data

In [10]:
final_combined = final_combined.reset_index(inplace = False, drop = True)

In [11]:
final_combined['id'] = final_combined['id'].astype(str)

In [114]:
def has_profile_image(profile_image_url):
    if type(profile_image_url) == str:
        return profile_image_url.find("/default_profile_normal.png") == -1
    return False 

final_combined["has_profile_image"] = final_combined["profile_image_url"].apply(lambda x : has_profile_image(x))
final_combined

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,1502026416,早川援,0918Bask,2013-06-11T11:20:35.000Z,False,False,,https://pbs.twimg.com/profile_images/100439306...,,好きこそ物の上手なれ。,242,250,3921,1,human,True
1,191839658,bio,sweetalkmp3,2010-09-17T14:02:10.000Z,True,False,,https://pbs.twimg.com/profile_images/758761054...,,no longer here I'm @toujoursbeiles,1690,791,255840,55,human,True
2,1947320929,❄McKayla❄,1Dniallprincess,2013-10-08T15:59:30.000Z,False,False,,https://pbs.twimg.com/profile_images/100270690...,Alaska,"Live, Young, Wild and Free ♊️ “Attitude Is Eve...",610,76,13887,8,human,True
3,255846106,Giselle | #SaveMacGyver,1GisellePizarro,2011-02-22T04:37:34.000Z,False,False,https://t.co/XK9kMS2j8g,https://pbs.twimg.com/profile_images/108000539...,"Antofagasta, Chile",29-year-old Bilingual Management student by da...,102,83,94231,8,human,True
4,1733095801,Nicole Romany,1Nicoleromany,2013-09-05T20:52:02.000Z,False,False,,https://pbs.twimg.com/profile_images/128788084...,,Multi Media Journalist/Producer CCNTV6 \nWork ...,1349,382,9999,21,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4236,99561835,Hassam,oficialHASSAM,2009-12-26T19:38:35.000Z,False,True,https://t.co/RF02usHQhV,https://pbs.twimg.com/profile_images/163216059...,Bogota - Colombia,Hassam Contacto: Info@hassam.com.co,1443429,356,56097,1051,human,True
4237,68765193,Gui Santana,oguisantana,2009-08-25T18:20:47.000Z,False,True,,https://pbs.twimg.com/profile_images/144232348...,São Paulo,Humorista,910714,844,8770,519,human,True
4238,134164413,Relatable Feelings,omgrelatabIe,2010-04-17T16:34:38.000Z,False,False,https://t.co/kzr5yPhiwI,https://pbs.twimg.com/profile_images/154806296...,"Frankfurt am Main, Deutschland",Tweeting relatable thoughts & feelings. Just t...,893608,48584,56921,2590,human,True
4239,72378123,Ömür Gedik,omurgedik,2009-09-07T20:49:11.000Z,False,True,https://t.co/cuvHMZqV7X,https://pbs.twimg.com/profile_images/155565314...,Turkey,"@hacikodernek Kurucu Başkanı, POPSAV YK Üyesi,...",830808,104905,31037,977,human,True


In [116]:
final_combined.to_csv('final_combined.csv')

#### getting profile pictures for bots and humans separately for mixed datasets

In [25]:
df2 = final_combined[final_combined["has_profile_image"] == True][["id", "profile_image_url", "account_type"]]
df2_bots = df2[df2['account_type'] == 'bot']
df2_humans = df2[df2['account_type'] == 'human']
df2_bots = df2_bots.reset_index()
df2_humans = df2_humans.reset_index()

In [26]:
df2_bots

Unnamed: 0,index,id,profile_image_url,account_type
0,2148,390617262,https://pbs.twimg.com/profile_images/175289747...,bot
1,2150,1154282738,https://pbs.twimg.com/profile_images/321630323...,bot
2,2152,1357959324,https://pbs.twimg.com/profile_images/153216197...,bot
3,2153,327555341,https://pbs.twimg.com/profile_images/161038836...,bot
4,2154,1459595030,https://pbs.twimg.com/profile_images/101206593...,bot
...,...,...,...,...
874,3165,440784095,https://pbs.twimg.com/profile_images/162257865...,bot
875,3166,297174372,https://pbs.twimg.com/profile_images/113132241...,bot
876,3167,112431151,https://pbs.twimg.com/profile_images/162406150...,bot
877,3168,14614833,https://pbs.twimg.com/profile_images/141964925...,bot


In [27]:
df2_humans

Unnamed: 0,index,id,profile_image_url,account_type
0,0,1502026416,https://pbs.twimg.com/profile_images/100439306...,human
1,1,191839658,https://pbs.twimg.com/profile_images/758761054...,human
2,2,1947320929,https://pbs.twimg.com/profile_images/100270690...,human
3,3,255846106,https://pbs.twimg.com/profile_images/108000539...,human
4,4,1733095801,https://pbs.twimg.com/profile_images/128788084...,human
...,...,...,...,...
3336,4236,99561835,https://pbs.twimg.com/profile_images/163216059...,human
3337,4237,68765193,https://pbs.twimg.com/profile_images/144232348...,human
3338,4238,134164413,https://pbs.twimg.com/profile_images/154806296...,human
3339,4239,72378123,https://pbs.twimg.com/profile_images/155565314...,human


In [93]:
for index, row in df2_bots.iterrows():
    try:
        user_id = row["id"]
        img_url = row["profile_image_url"]
        extension = os.path.splitext(img_url)[-1]
        filename = str(user_id) + extension.lower()
        directory = './pictures/bot/' + filename
        img = Image.open(requests.get(img_url, stream = True).raw).convert('RGB')
        img.save(directory)
    except Exception as e:
        error_msg = str(e) + " " + str(user_id) + " " + str(img_url)
        print(error_msg)

In [66]:
for index, row in df2_humans.iterrows():
    try:
        user_id = row["id"]
        img_url = row["profile_image_url"]
        extension = os.path.splitext(img_url)[-1]
        filename = str(user_id) + extension.lower()
        directory = './pictures/human/' + filename
        img = Image.open(requests.get(img_url, stream = True).raw).convert('RGB')
        img.save(directory)
    except Exception as e:
        error_msg = str(e) + " " + str(user_id) + " " + str(img_url)
        print(error_msg)

unknown file extension:  763095612 https://pbs.twimg.com/profile_images/2514776452/U5hTn1oi_normal


#### scraping more legitamate users data from genuine dataset

In [80]:
df_legit = pd.read_csv('legitimate_users.csv')

In [81]:
df_legit

Unnamed: 0,UserID,CreatedAt,CollectedAt,NumerOfFollowings,NumberOfFollowers,NumberOfTweets,LengthOfScreenName,LengthOfDescriptionInUserProfile
0,614,13/7/06 15:30,20/11/09 23:56,510,350,3265,10,34
1,1038,15/7/06 16:12,16/11/09 5:12,304,443,4405,7,156
2,1437,16/7/06 12:29,16/11/09 16:25,45,73,725,6,37
3,2615,19/7/06 23:23,27/11/09 18:34,211,230,211,7,0
4,3148,26/7/06 14:17,20/11/09 17:35,7346,7244,11438,8,97
...,...,...,...,...,...,...,...,...
19271,93390990,29/11/09 6:34,29/11/09 7:50,5,0,5,11,0
19272,93402679,29/11/09 7:47,29/11/09 7:56,20,1,1,12,0
19273,93419256,29/11/09 9:23,29/11/09 9:30,0,0,1,8,0
19274,93426370,29/11/09 10:04,29/11/09 10:13,20,1,1,10,0


In [82]:
df_legit_new = df_legit.sample(n=15000, random_state=1)
df_legit_new = df_legit_new.reset_index(drop = True)

new_df_legit = df_legit_new['UserID'].to_frame()
new_df_legit['account_type'] = 'human'
new_df_legit = new_df_legit.rename(columns = {'UserID': 'id'})
new_df_legit['id'] = new_df_legit['id'].astype(str)
new_df_legit

Unnamed: 0,id,account_type
0,15855162,human
1,79561384,human
2,19737713,human
3,75675003,human
4,60748828,human
...,...,...
14995,50678874,human
14996,41842039,human
14997,26964352,human
14998,87335770,human


In [83]:
start_index = 0

for each_round in range(1):
    token = bearer_tokens[each_round%3]
    client = tweepy.Client(bearer_token= token)
    print(token)
    
    twitter_data_list = []
    for i in range(150):
        start = i * 100
        end = start + 100
        df_tweet_subset = new_df_legit[start: end]
        acc_type_dict = dict(zip(df_tweet_subset["id"], df_tweet_subset["account_type"]))
        each_id_list = list(df_tweet_subset["id"])
        print(str(start) + " : " + str(end)) 
        twitter_data = get_twitter_dictionary(each_id_list, acc_type_dict, client)
        tweets_df = pd.DataFrame(twitter_data)
        twitter_data_list.append(tweets_df)
    
    start_index += 1
    
    combined_twitter_data = pd.concat(twitter_data_list)
        
    combined_twitter_data.to_csv(f'final_combined_legit.csv')
    
    print("round" + str(each_round))

AAAAAAAAAAAAAAAAAAAAAAK%2FhwEAAAAAtHE4qeLy3hhb660R143d7IOccaE%3D8qaWBsRfyKMigZj2jBVVFNifngriGAkuwJv74wOOs05m6XNReW
0 : 100
100 : 200
200 : 300
300 : 400
400 : 500
500 : 600
600 : 700
700 : 800
800 : 900
900 : 1000
1000 : 1100
1100 : 1200
1200 : 1300
1300 : 1400
1400 : 1500
1500 : 1600
1600 : 1700
1700 : 1800
1800 : 1900
1900 : 2000
2000 : 2100
2100 : 2200
2200 : 2300
2300 : 2400
2400 : 2500
2500 : 2600
2600 : 2700
2700 : 2800
2800 : 2900
2900 : 3000
3000 : 3100
3100 : 3200
3200 : 3300
3300 : 3400
3400 : 3500
3500 : 3600
3600 : 3700
3700 : 3800
3800 : 3900
3900 : 4000
4000 : 4100
4100 : 4200
4200 : 4300
4300 : 4400
4400 : 4500
4500 : 4600
4600 : 4700
4700 : 4800
4800 : 4900
4900 : 5000
5000 : 5100
5100 : 5200
5200 : 5300
5300 : 5400
5400 : 5500
5500 : 5600
5600 : 5700
5700 : 5800
5800 : 5900
5900 : 6000
6000 : 6100
6100 : 6200
6200 : 6300
6300 : 6400
6400 : 6500
6500 : 6600
6600 : 6700
6700 : 6800
6800 : 6900
6900 : 7000
7000 : 7100
7100 : 7200
7200 : 7300
7300 : 7400
7400 : 7500
7500 :

In [84]:
final_combined_legit = combined_twitter_data

In [85]:
final_combined_legit = final_combined_legit.reset_index(inplace = False, drop = True)

In [86]:
final_combined_legit['id'] = final_combined_legit['id'].astype(str)

In [117]:
final_combined_legit["has_profile_image"] = final_combined_legit["profile_image_url"].apply(lambda x : has_profile_image(x))
final_combined_legit

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,19737713,Elizabeth Alraune,JoLoPe,2009-01-29T22:03:40.000Z,False,False,http://t.co/J6DDq5gJ,https://pbs.twimg.com/profile_images/502992423...,everywhere,"I love to read and write, but am not a big fan...",3664,4640,28949,88,human,True
1,60748828,Robin Emilian,sulphurandtea,2009-07-27T23:47:48.000Z,True,False,,https://pbs.twimg.com/profile_images/335181244...,,,10,0,1968,0,human,True
2,92155484,Jasmine Doria,JasDoria,2009-11-24T00:25:18.000Z,True,False,,https://pbs.twimg.com/profile_images/378800000...,HAWAII,imagination is the key to your world of advent...,56,299,1181,0,human,True
3,46499681,🦄 | #BLM ✊🏿,thatsclaudia,2009-06-11T21:43:51.000Z,True,False,,https://pbs.twimg.com/profile_images/162767550...,📍currently in Rome,🌠 26🔸INFP-T🔸\npisces ☼ capricorn ↑ taurus ☽ 🎥 ...,3958,2890,251180,99,human,True
4,15590821,DonnaKat,DonnaKatTzu,2008-07-24T23:24:02.000Z,False,False,,https://pbs.twimg.com/profile_images/113813687...,"Bonne Terre, MO","Pureblood: unmasked, unvaxxed and unafraid.",800,1223,39877,45,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11333,15270176,garybaumgarten,garybaumgarten,2008-06-29T12:16:19.000Z,False,False,https://t.co/z9zZbLFAHe,https://pbs.twimg.com/profile_images/514448560...,New York,Journalist for more than 5 decades. Opinions m...,1633,1010,46322,71,human,True
11334,77316007,Márcio Luis,marcioluis_,2009-09-25T21:31:03.000Z,False,False,http://t.co/3fimtOJirK,https://pbs.twimg.com/profile_images/136084962...,"tourino, feio, muito amigo. 웃",,99,180,171,1,human,True
11335,50678874,PM,pas07,2009-06-25T15:39:54.000Z,False,False,,https://pbs.twimg.com/profile_images/133071869...,Endonesah,No Self-Confidence and Less Self-Control,246,196,41235,1,human,True
11336,41842039,Renan F. F da Rosa,renaanrosaa,2009-05-22T16:10:35.000Z,False,False,,https://pbs.twimg.com/profile_images/277137399...,,"um jovem adorador,gosto muito de musica,fotogr...",369,378,1894,1,human,True


In [118]:
final_combined_legit.to_csv('final_combined_legit.csv')

#### getting profile pictures for bots and humans separately for genuine dataset

In [91]:
df3 = final_combined_legit[final_combined_legit["has_profile_image"] == True]
df3

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,19737713,Elizabeth Alraune,JoLoPe,2009-01-29T22:03:40.000Z,False,False,http://t.co/J6DDq5gJ,https://pbs.twimg.com/profile_images/502992423...,everywhere,"I love to read and write, but am not a big fan...",3664,4640,28949,88,human,True
1,60748828,Robin Emilian,sulphurandtea,2009-07-27T23:47:48.000Z,True,False,,https://pbs.twimg.com/profile_images/335181244...,,,10,0,1968,0,human,True
2,92155484,Jasmine Doria,JasDoria,2009-11-24T00:25:18.000Z,True,False,,https://pbs.twimg.com/profile_images/378800000...,HAWAII,imagination is the key to your world of advent...,56,299,1181,0,human,True
3,46499681,🦄 | #BLM ✊🏿,thatsclaudia,2009-06-11T21:43:51.000Z,True,False,,https://pbs.twimg.com/profile_images/162767550...,📍currently in Rome,🌠 26🔸INFP-T🔸\npisces ☼ capricorn ↑ taurus ☽ 🎥 ...,3958,2890,251180,99,human,True
4,15590821,DonnaKat,DonnaKatTzu,2008-07-24T23:24:02.000Z,False,False,,https://pbs.twimg.com/profile_images/113813687...,"Bonne Terre, MO","Pureblood: unmasked, unvaxxed and unafraid.",800,1223,39877,45,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11333,15270176,garybaumgarten,garybaumgarten,2008-06-29T12:16:19.000Z,False,False,https://t.co/z9zZbLFAHe,https://pbs.twimg.com/profile_images/514448560...,New York,Journalist for more than 5 decades. Opinions m...,1633,1010,46322,71,human,True
11334,77316007,Márcio Luis,marcioluis_,2009-09-25T21:31:03.000Z,False,False,http://t.co/3fimtOJirK,https://pbs.twimg.com/profile_images/136084962...,"tourino, feio, muito amigo. 웃",,99,180,171,1,human,True
11335,50678874,PM,pas07,2009-06-25T15:39:54.000Z,False,False,,https://pbs.twimg.com/profile_images/133071869...,Endonesah,No Self-Confidence and Less Self-Control,246,196,41235,1,human,True
11336,41842039,Renan F. F da Rosa,renaanrosaa,2009-05-22T16:10:35.000Z,False,False,,https://pbs.twimg.com/profile_images/277137399...,,"um jovem adorador,gosto muito de musica,fotogr...",369,378,1894,1,human,True


In [92]:
for index, row in df3.iterrows():
    try:
        user_id = row["id"]
        img_url = row["profile_image_url"]
        extension = os.path.splitext(img_url)[-1]
        filename = str(user_id) + extension.lower()
        directory = './pictures_legit/' + filename
        img = Image.open(requests.get(img_url, stream = True).raw).convert('RGB')
        img.save(directory)
    except Exception as e:
        error_msg = str(e) + " " + str(user_id) + " " + str(img_url)
        print(error_msg)



unknown file extension:  20951082 https://pbs.twimg.com/profile_images/2040396138/M9gCG9PT_normal
unknown file extension:  43120355 https://pbs.twimg.com/profile_images/1872178349/gerF8xGl_normal
unknown file extension:  16454640 https://pbs.twimg.com/profile_images/1134441944/4_normal
unknown file extension:  49552710 https://pbs.twimg.com/profile_images/1620155215/photo_normal
cannot identify image file <_io.BytesIO object at 0x000002406965DD10> 37628665 https://pbs.twimg.com/profile_images/195820525/favicon_normal.gif
unknown file extension:  7919102 https://pbs.twimg.com/profile_images/2402664050/342788F9-7C90-41C8-AECF-CEB4F6A1527C_normal
unknown file extension:  14710622 https://pbs.twimg.com/profile_images/1701701639/image_normal
unknown file extension:  80208917 https://pbs.twimg.com/profile_images/1240599527/C2632j6z_normal
unknown file extension:  46579801 https://pbs.twimg.com/profile_images/1862657277/gqOvL4Q5_normal
unknown file extension:  42812912 https://pbs.twimg.com/p

#### combining mixed dataset, genuine dataset and bots dataset to form the final combined dataset

In [139]:
df_bots = pd.read_csv('bots_scraped.csv')
df_humans = pd.read_csv('final_combined_legit.csv')
df_mix = pd.read_csv('final_combined.csv')

In [140]:
df_bots['id'] = df_bots['id'].astype(str)
combined_dataset = pd.concat([df_mix, df_humans, df_bots])
combined_dataset = combined_dataset.reset_index(drop = True)
combined_dataset = combined_dataset.drop(columns=combined_dataset.columns[0])
combined_dataset

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,1502026416,早川援,0918Bask,2013-06-11T11:20:35.000Z,False,False,,https://pbs.twimg.com/profile_images/100439306...,,好きこそ物の上手なれ。,242,250,3921,1,human,True
1,191839658,bio,sweetalkmp3,2010-09-17T14:02:10.000Z,True,False,,https://pbs.twimg.com/profile_images/758761054...,,no longer here I'm @toujoursbeiles,1690,791,255840,55,human,True
2,1947320929,❄McKayla❄,1Dniallprincess,2013-10-08T15:59:30.000Z,False,False,,https://pbs.twimg.com/profile_images/100270690...,Alaska,"Live, Young, Wild and Free ♊️ “Attitude Is Eve...",610,76,13887,8,human,True
3,255846106,Giselle | #SaveMacGyver,1GisellePizarro,2011-02-22T04:37:34.000Z,False,False,https://t.co/XK9kMS2j8g,https://pbs.twimg.com/profile_images/108000539...,"Antofagasta, Chile",29-year-old Bilingual Management student by da...,102,83,94231,8,human,True
4,1733095801,Nicole Romany,1Nicoleromany,2013-09-05T20:52:02.000Z,False,False,,https://pbs.twimg.com/profile_images/128788084...,,Multi Media Journalist/Producer CCNTV6 \nWork ...,1349,382,9999,21,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20289,2396095423,Aldridge Biz Opp,AldridgeBizOpp,2014-03-18T11:52:26.000Z,False,False,,https://pbs.twimg.com/profile_images/450760788...,North America,We provide Business Resources and Business Con...,7,20,735,0,1,True
20290,2396099064,DL Business Opp,DLBusinessOpp,2014-03-18T12:03:23.000Z,False,False,,https://pbs.twimg.com/profile_images/450782110...,Growing North America,We provide Business Resources and Business Con...,102,387,2522,6,1,True
20291,2429375022,LF Career Options,LFCareerOptions,2014-04-05T19:47:10.000Z,False,False,,https://pbs.twimg.com/profile_images/452536953...,Growing North America,"We help sharp, motivated individuals embrace f...",43,101,1302,0,1,True
20292,2429405521,Rick Chou,WestCoastRickC,2014-04-05T20:07:04.000Z,False,False,,https://pbs.twimg.com/profile_images/158212733...,"Vancouver, BC",Passionate about delivering Customer Experienc...,790,725,1734,8,1,True


In [141]:
combined_dataset_final = shuffle(combined_dataset)
combined_dataset_final = combined_dataset_final.reset_index(drop = True)
combined_dataset_final

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,18042464,[日向の乙女]ミルミクス,MILMIX,2008-12-11T06:18:59.000Z,False,False,https://t.co/AoobuYGh5y,https://pbs.twimg.com/profile_images/788469686...,日本,テイルズオブシリーズが好きだったり。モバマス貧弱一般人。Splatoon中毒患者。最近はFN...,1959,2708,405158,157,human,True
1,60286377,Tahnee Trotter,tahneetrotter,2009-07-26T11:25:30.000Z,False,False,http://t.co/P7D9W9j5nT,https://pbs.twimg.com/profile_images/488235175...,,Get frucked,371,0,33172,10,human,True
2,373791732,Kassandra Garcia,Sassy_Kassy37,2011-09-15T05:49:37.000Z,False,False,,https://pbs.twimg.com/profile_images/290477377...,,"If you have Motivatoin, Dedication and Faith i...",36,0,548,0,1,True
3,2374895658,TonieSabella,TonieSabella,2014-03-06T06:01:31.000Z,False,False,http://t.co/yakUEMw5xd,https://pbs.twimg.com/profile_images/473894199...,Bulgaria,Ambient/Post Rock music created by Ben Leopard.,30,148,93,1,1,True
4,17537004,Mihai Todor 🇺🇦,MihaiTodor,2008-11-21T14:28:51.000Z,False,False,https://t.co/Ab2jd2vhvV,https://pbs.twimg.com/profile_images/781079151...,"Dublin, Ireland",Principal Software Engineer interested in comp...,1089,2283,9098,34,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20289,15367428,Paul Urmston,paulurmston,2008-07-09T16:38:23.000Z,False,False,,https://pbs.twimg.com/profile_images/161391553...,United Kingdom,,72,76,4161,1,human,True
20290,17006395,Laura Zambelli,laura_trouble,2008-10-27T20:27:57.000Z,False,False,https://t.co/lFTff06wEg,https://pbs.twimg.com/profile_images/126498263...,,You heard that i was trouble but you couldn't ...,314,107,20915,9,human,True
20291,22130752,The Stag Company,TheStagCompany,2009-02-27T11:31:33.000Z,False,False,http://t.co/GiEeJEGx48,https://pbs.twimg.com/profile_images/793413210...,"Brighton, UK","#Stagweekends, #stagdos and #stagnights. We do...",2981,2523,4991,46,human,True
20292,15166188,kelsey lee 🖤,kelseyofzen,2008-06-19T04:57:53.000Z,False,False,,https://pbs.twimg.com/profile_images/147715462...,,cap ou pas cap?,170,180,7820,4,human,True


In [142]:
combined_dataset_final['account_type'] = combined_dataset_final['account_type'].replace(1, 'bot')

In [143]:
combined_dataset_final

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
0,18042464,[日向の乙女]ミルミクス,MILMIX,2008-12-11T06:18:59.000Z,False,False,https://t.co/AoobuYGh5y,https://pbs.twimg.com/profile_images/788469686...,日本,テイルズオブシリーズが好きだったり。モバマス貧弱一般人。Splatoon中毒患者。最近はFN...,1959,2708,405158,157,human,True
1,60286377,Tahnee Trotter,tahneetrotter,2009-07-26T11:25:30.000Z,False,False,http://t.co/P7D9W9j5nT,https://pbs.twimg.com/profile_images/488235175...,,Get frucked,371,0,33172,10,human,True
2,373791732,Kassandra Garcia,Sassy_Kassy37,2011-09-15T05:49:37.000Z,False,False,,https://pbs.twimg.com/profile_images/290477377...,,"If you have Motivatoin, Dedication and Faith i...",36,0,548,0,bot,True
3,2374895658,TonieSabella,TonieSabella,2014-03-06T06:01:31.000Z,False,False,http://t.co/yakUEMw5xd,https://pbs.twimg.com/profile_images/473894199...,Bulgaria,Ambient/Post Rock music created by Ben Leopard.,30,148,93,1,bot,True
4,17537004,Mihai Todor 🇺🇦,MihaiTodor,2008-11-21T14:28:51.000Z,False,False,https://t.co/Ab2jd2vhvV,https://pbs.twimg.com/profile_images/781079151...,"Dublin, Ireland",Principal Software Engineer interested in comp...,1089,2283,9098,34,human,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20289,15367428,Paul Urmston,paulurmston,2008-07-09T16:38:23.000Z,False,False,,https://pbs.twimg.com/profile_images/161391553...,United Kingdom,,72,76,4161,1,human,True
20290,17006395,Laura Zambelli,laura_trouble,2008-10-27T20:27:57.000Z,False,False,https://t.co/lFTff06wEg,https://pbs.twimg.com/profile_images/126498263...,,You heard that i was trouble but you couldn't ...,314,107,20915,9,human,True
20291,22130752,The Stag Company,TheStagCompany,2009-02-27T11:31:33.000Z,False,False,http://t.co/GiEeJEGx48,https://pbs.twimg.com/profile_images/793413210...,"Brighton, UK","#Stagweekends, #stagdos and #stagnights. We do...",2981,2523,4991,46,human,True
20292,15166188,kelsey lee 🖤,kelseyofzen,2008-06-19T04:57:53.000Z,False,False,,https://pbs.twimg.com/profile_images/147715462...,,cap ou pas cap?,170,180,7820,4,human,True


In [150]:
combined_dataset_final[combined_dataset_final['id'].duplicated() == True]

Unnamed: 0,id,name,username,created_at,protected,verified,url,profile_image_url,location,description,followers_count,following_count,tweet_count,listed_count,account_type,has_profile_image
9760,39927901,Nadeem Malik 🇵🇰,nadeemmalik,2009-05-14T04:33:26.000Z,False,True,https://t.co/MsPUgekaPt,https://pbs.twimg.com/profile_images/119097516...,ISLAMABAD,A Flagship Current Affairs Programme 🇵🇰 Team #...,3138402,278,8255,1491,human,True
12372,43563720,Boity Thulo,Boity,2009-05-30T17:49:02.000Z,False,True,https://t.co/rXxwHhxIao,https://pbs.twimg.com/profile_images/155437637...,South Africa,BOOKINGS: sibo@img-africa.co.za @bt_signature ...,3841139,2733,132104,1648,human,True
14599,17890482,Denny Cherry - mrdenny@techhub.social,mrdenny,2008-12-05T04:47:43.000Z,False,True,https://t.co/I7nMJ7vH3p,https://pbs.twimg.com/profile_images/534778558...,"Oceanside, CA","CEO, Microsoft MVP, VMware vExpert, #Azure, #A...",10105,1119,89307,494,human,True
15886,9609372,The Courier-Mail,couriermail,2007-10-22T23:36:14.000Z,False,True,http://t.co/dWMmXIw1tS,https://pbs.twimg.com/profile_images/846231081...,"Queensland, Australia",Qld's best news source 🌞 Proudly Maroon 💪🏼 \nS...,156635,991,185128,1470,bot,True
19085,21253156,Katie Shell,Katieshelll,2009-02-18T23:07:39.000Z,False,False,,https://pbs.twimg.com/profile_images/465262798...,Southern California,Expeditor/Receptionist and avid book reader. A...,103,195,4624,4,human,True


In [152]:
combined_dataset_final = combined_dataset_final.drop_duplicates(subset=['id'], keep='first')

In [156]:
len(combined_dataset_final['id'].unique())

20289

In [153]:
combined_dataset_final.to_csv('final_combined_dataset.csv')