# Dataset: Points of Interest on Twitter

In [25]:
import pandas as pd
import numpy as np
import tweepy
import json
from tqdm.notebook import tqdm
import time

In [26]:
# Read the Twitter API keys from a JSON file addressed relative to the
# notebook's working directory. The file must provide the keys
# 'consumer_key', 'consumer_secret', 'access_token_key' and
# 'access_token_secret'.
with open('../../twitter_credentials.json', 'r') as f:
    twitter_credentials = json.load(f)

# TwitterAPI Authentication
auth = tweepy.OAuthHandler(
    twitter_credentials['consumer_key'],
    twitter_credentials['consumer_secret'],
)
auth.set_access_token(
    twitter_credentials['access_token_key'],
    twitter_credentials['access_token_secret'],
)
# Client object used by the later cells to query user profiles.
API = tweepy.API(auth)

# Dataset

In [2]:
# Load the POI -> Twitter handle -> category table and preview the first rows.
df = pd.read_csv(filepath_or_buffer='poi-twitter-category.csv')
df.head(5)

Unnamed: 0,ID,Twitter,Category
0,4b474a16f964a520382e26e3,starbucks_gr,Coffee Shop
1,4b474ac9f964a520472e26e3,starbucks_gr,Coffee Shop
2,4b4b19baf964a5204a9226e3,tgifridaysgr,American Restaurant
3,4b659a3ff964a52006f62ae3,teloglion,Art Museum
4,4b6d4b66f964a520bf6f2ce3,paradosiakonet,Snack Place


In [3]:
# Report the number of rows and the number of distinct category values.
print(f'Entries: {df.shape[0]}')
print(f'Unique Categories: {df["Category"].unique().size}')

Entries: 1544
Unique Categories: 219


In [11]:
# Count rows per Twitter handle, keep the handles that occur at least twice,
# and order them from most to least frequent.
dublicates = (
    df.groupby('Twitter')
    .size()
    .loc[lambda counts: counts >= 2]
    .sort_values(ascending=False)
    .to_frame(name='Count')
)
# The total is the sum of the per-handle counts, so it includes the first
# occurrence of every repeated handle.
print(f'Total Number of dublicates: {dublicates["Count"].sum()}')
dublicates.head(5)

Total Number of dublicates: 843


Unnamed: 0_level_0,Count
Twitter,Unnamed: 1_level_1
bp_plc,76
alpha_bank,54
vodafone_gr,48
eurobank_group,43
windhellas,37


In [15]:
# Keep only the first row for each Twitter handle and renumber the rows from 0
# without keeping the old index as a column.
df = df.drop_duplicates(subset="Twitter").reset_index(drop=True)

Entries: 819
Unique Categories: 208


In [16]:
# Number of rows per category, most frequent category first.
category_count = (
    df.groupby('Category')
    .size()
    .sort_values(ascending=False)
    .to_frame(name='Count')
)
# to_string() renders every row instead of the truncated default repr.
print(category_count.to_string())

                                 Count
Category                              
Hotel                              121
Office                              55
Café                                23
Clothing Store                      22
Resort                              21
Electronics Store                   20
Furniture / Home Store              15
Greek Restaurant                    15
Vacation Rental                     13
Tech Startup                        13
Bed & Breakfast                     13
Pharmacy                            12
Beach Bar                            9
Jewelry Store                        9
Gym / Fitness Center                 8
Coffee Shop                          8
Cocktail Bar                         8
Rental Car Location                  8
Salon / Barbershop                   7
Building                             7
School                               7
Factory                              7
Advertising Agency                   7
IT Services              

In [34]:
# Look up every handle in df['Twitter'] through the Twitter API and collect its
# profile fields. `results` must end up with exactly one row per handle, in the
# same order as df['Twitter'], because the notebook later attaches df['Category']
# and df['ID'] to these rows by position.
failed = []   # handles whose lookup raised a non-rate-limit error
results = []  # one row per handle: profile fields, or a blank placeholder row

for account in tqdm(df['Twitter']):
    # Retry the same handle until the lookup either succeeds or fails for a
    # reason other than rate limiting. Skipping a handle after a rate-limit
    # error would leave `results` one row short and shift every later row.
    while True:
        try:
            user = API.get_user(account)
            results.append(
                [user.screen_name, user.name, user.description, user.statuses_count,
                 user.friends_count, user.followers_count, user.protected])
            break
        except tweepy.RateLimitError as err:
            print('Rate Limit Hit. Wait 15 min.')
            time.sleep(60*15)
            # No break: the loop repeats and the same handle is requested again.
        except Exception as err:
            print(f'Errror for {account}: {err}')
            failed.append(account)
            # The placeholder row keeps `results` aligned with df; its empty
            # screen name marks it as a failed lookup.
            results.append(['','','',0,0,0,'None'])
            break

  0%|          | 0/819 [00:00<?, ?it/s]

Errror for plaisiocomp: [{'code': 50, 'message': 'User not found.'}]
Errror for pull_and_bear: [{'code': 50, 'message': 'User not found.'}]
Errror for zithosgre: [{'code': 50, 'message': 'User not found.'}]
Errror for rhodeshotels: [{'code': 50, 'message': 'User not found.'}]
Errror for katikieshotels: [{'code': 50, 'message': 'User not found.'}]
Errror for the_ixian_grand: [{'code': 50, 'message': 'User not found.'}]
Errror for ikosoceania: [{'code': 50, 'message': 'User not found.'}]
Errror for amathus_hotels: [{'code': 50, 'message': 'User not found.'}]
Errror for konstadinosh: [{'code': 50, 'message': 'User not found.'}]
Errror for metropolispharm: [{'code': 50, 'message': 'User not found.'}]
Errror for astrasuites: [{'code': 50, 'message': 'User not found.'}]
Errror for qcsthessaloniki: [{'code': 50, 'message': 'User not found.'}]
Errror for massalia2010: [{'code': 50, 'message': 'User not found.'}]
Errror for kastelliresort: [{'code': 50, 'message': 'User not found.'}]
Errror for

In [68]:
# Turn the collected profile rows into a table and label its seven columns.
dataset = pd.DataFrame(results).set_axis(
    [
        'screen_name',
        'name',
        'description',
        'statuses_count',
        'friends_count',
        'followers_count',
        'protected',
    ],
    axis='columns',
)
dataset.head()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count,protected
0,Starbucks_Gr,Starbucks Greece,Να εμπνέουμε και να καλλιεργούμε το ανθρώπινο ...,10,1,379,False
1,TGIFridaysGR,TGI Fridays Greece,Καταπληκτικά φαγητά και ποτά σε μια αληθινά φι...,5192,1290,3318,False
2,teloglion,Τελλόγλειο Ίδρυμα,Το Τελλόγλειο Ίδρυμα Τεχνών Α.Π.Θ. ιδρύθηκε το...,1631,365,2374,False
3,Paradosiakonet,Παραδοσιακό,,617,5,24,False
4,medpalace,Mediterranean Palace,A cozy 5 star hotel in the city center with an...,269,543,381,False


In [69]:
# Attach each POI's category and ID to its profile row. pandas aligns the two
# tables on their row index, so this relies on `dataset` and `df` sharing one.
dataset = dataset.assign(category=df['Category'], id=df['ID'])

In [70]:
# Discard rows whose screen name is empty.
dataset = dataset.loc[dataset['screen_name'].ne('')]

In [71]:
# Keep only rows whose 'protected' value equals False.
dataset = dataset.loc[dataset['protected'].eq(False)]

In [73]:
# Report the number of remaining rows and distinct categories after filtering.
print(f'Entries: {dataset.shape[0]}')
print(f'Unique Categories: {dataset["category"].unique().size}')

Entries: 712
Unique Categories: 194


In [75]:
# Reduce the table to the three columns that are exported, in this order.
dataset = dataset.loc[:, ['id', 'screen_name', 'category']]

In [79]:
# Write the processed table next to the notebook, omitting the row index.
dataset.to_csv(path_or_buf='poi-twitter-category-processed.csv', index=False)