# Greek Hotels Classification Dataset

In [3]:
import pandas as pd
import numpy as np
import tweepy
import json
from tqdm.notebook import tqdm
import time

In [72]:
# Load the Twitter API credentials from a JSON file kept outside the repository.
# The file must be a JSON object with the four keys read below.
with open('../../../twitter_credentials.json', 'r', encoding='utf-8') as f:
    twitter_credentials = json.load(f)

# TwitterAPI Authentication
auth = tweepy.OAuthHandler(twitter_credentials['consumer_key'], twitter_credentials['consumer_secret'])
auth.set_access_token(twitter_credentials['access_token_key'], twitter_credentials['access_token_secret'])
# wait_on_rate_limit / wait_on_rate_limit_notify: let tweepy pause (and report it)
# when the API rate limit is reached instead of failing the request.
API = tweepy.API(auth, wait_on_rate_limit=True, wait_on_rate_limit_notify=True)

# Step 1
---
Load the Foursquare POI–Twitter dataset and label hotels/resorts with 1 and every other category with 0.

In [45]:
# Read the pre-processed POI table (columns: id, screen_name, category).
poi_csv_path = '../../Poi-twitter/poi-twitter-category-processed.csv'
df = pd.read_csv(poi_csv_path)
df.head(5)

Unnamed: 0,id,screen_name,category
0,4b474a16f964a520382e26e3,Starbucks_Gr,Coffee Shop
1,4b4b19baf964a5204a9226e3,TGIFridaysGR,American Restaurant
2,4b659a3ff964a52006f62ae3,teloglion,Art Museum
3,4b6d4b66f964a520bf6f2ce3,Paradosiakonet,Snack Place
4,4b6de067f964a520f1972ce3,medpalace,Hotel


In [46]:
# Split the POIs into accommodation (Hotel / Resort) and everything else.
is_accommodation = df['category'].isin(['Hotel', 'Resort'])
df1 = df.loc[is_accommodation].copy()
df2 = df.loc[~is_accommodation].copy()

In [47]:
# Number of hotel/resort rows vs. number of remaining rows.
(df1.shape[0], df2.shape[0])

(125, 587)

In [48]:
# Binary target column: 1 for the Hotel/Resort rows, 0 for all other rows.
df1['hotel'] = 1
df2['hotel'] = 0

In [49]:
# The raw category is now encoded in `hotel`, so the text column is dropped.
df1 = df1.drop(columns=['category'])
df2 = df2.drop(columns=['category'])

# Step 2
---
Balance the dataset by keeping as many non-hotel accounts as there are hotel accounts.

In [51]:
# Keep as many negative rows as there are positive rows so the classes are balanced.
# The size is derived from df1 instead of being hard-coded, so the cell stays
# correct if the input data changes. Note: this takes the FIRST len(df1) rows,
# not a random sample.
df2 = df2.iloc[:len(df1)]

In [52]:
# Stack positives and negatives into one frame with a fresh 0..n-1 index.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0;
# ignore_index=True is equivalent to the former reset_index + drop('index').
df1 = pd.concat([df1, df2], ignore_index=True)

In [53]:
# Total number of accounts after balancing.
df1.shape[0]

250

# Step 3
---
Collect the data: fetch each account's profile information and its most recent tweets from the Twitter API.

In [73]:
# Function For Fetching Tweets
def fetch_tweets(data):
    """Fetch up to 100 recent statuses per account and concatenate their text.

    Parameters
    ----------
    data : iterable of str
        Twitter screen names to query.

    Returns
    -------
    list of str
        One string per screen name, in input order. Each string is the text
        of the collected statuses, every text preceded by a single space.
        It is empty when nothing could be fetched for that account.

    Notes
    -----
    Errors are handled on a best-effort basis: a rate-limit error triggers a
    15 minute pause and one retry; any other error (or a failing retry) is
    printed and whatever was collected so far for that account is kept.
    """

    def collect(screen_name, texts):
        # Appends into `texts` so statuses read before a failure are not lost.
        for status in tweepy.Cursor(API.user_timeline, screen_name=screen_name).items(100):
            texts.append(status.text)

    recent_100_tweets = []

    for screen_name in tqdm(data):
        texts = []
        try:
            try:
                collect(screen_name, texts)
            except tweepy.RateLimitError:
                print('Rate Limit Hit. Wait 15min.')
                time.sleep(60*15)
                # The retry reads the timeline from the start again, so the
                # partial result must be discarded to avoid duplicated tweets.
                texts = []
                collect(screen_name, texts)
        except Exception as err:
            # Covers regular failures and a failing retry, so a single bad
            # account cannot abort the whole collection run.
            print(err)
        recent_100_tweets.append(''.join(' ' + text for text in texts))

    return recent_100_tweets

In [60]:
# Fetch profile metadata for every account in df1.
# `results` collects one row per successfully fetched account;
# `failed` collects the screen names whose lookup raised an error.
failed = []
results = []

for account in tqdm(df1['screen_name']):
    try:
        try:
            user = API.get_user(account)
        except tweepy.RateLimitError:
            print('Rate Limit Hit. Wait 15 min.')
            time.sleep(60*15)
            # Retry the same account after the pause; previously the account
            # was silently skipped (neither in `results` nor in `failed`).
            user = API.get_user(account)
        results.append(
            [user.screen_name, user.name, user.description, user.statuses_count,
             user.friends_count, user.followers_count])
    except Exception as err:
        # Also reached when the retry itself fails.
        print(f'Errror for {account}: {err}')
        failed.append(account)

  0%|          | 0/250 [00:00<?, ?it/s]

In [61]:
# Turn the collected profile rows into a DataFrame.
# Passing `columns=` at construction (instead of assigning `.columns` afterwards)
# also works when `results` is empty, yielding an empty frame with the right
# columns rather than a length-mismatch error.
profile_columns = ['screen_name', 'name', 'description', 'statuses_count', 'friends_count', 'followers_count']
dataset = pd.DataFrame(results, columns=profile_columns)
dataset.head()

Unnamed: 0,screen_name,name,description,statuses_count,friends_count,followers_count
0,medpalace,Mediterranean Palace,A cozy 5 star hotel in the city center with an...,269,543,381
1,RodosPalace,Rodos Palace | abav²,Rodos Palace is regarded as the finest deluxe ...,1169,615,577
2,Cactus_Rhodes,Cactus Hotel,,719,14,40
3,SantoriniVillas,Villas & Mansions,Private Villas & Mansions in Santorini offerin...,114,473,189
4,SunRocksHotel,Sun Rocks Hotel,An enchanting romantic couples' getaway in the...,252,94,337


In [74]:
# Download the recent tweets of every account that has a profile row.
account_names = dataset['screen_name']
tweets = fetch_tweets(account_names)

  0%|          | 0/250 [00:00<?, ?it/s]

Rate limit reached. Sleeping for: 90
Rate limit reached. Sleeping for: 348


In [65]:
# Sanity check: fetch_tweets returns one concatenated string per screen name.
len(tweets)

250

In [76]:
# Store the concatenated tweet text next to the profile fields.
dataset = dataset.assign(recent_100_statuses=tweets)

In [85]:
# Attach the class label to every fetched profile.
if len(dataset) == len(df1):
    # No account was skipped, so dataset rows are in the same order as df1.
    dataset['hotel'] = df1['hotel'].to_numpy()
else:
    # Some lookups failed, so the row positions no longer match df1 and an
    # index-aligned assignment would attach labels to the wrong accounts.
    # Look the label up by screen name instead (case-insensitive, because the
    # API may return a different capitalisation than the one queried).
    label_by_name = (
        df1.assign(_key=df1['screen_name'].str.lower())
           .drop_duplicates('_key')
           .set_index('_key')['hotel']
    )
    dataset['hotel'] = dataset['screen_name'].str.lower().map(label_by_name)

In [86]:
# Persist the full labelled dataset (row index is not written).
dataset.to_csv(path_or_buf='hotels-classification.csv', index=False)

In [79]:
from sklearn.model_selection import train_test_split

In [137]:
# Stratified 80/20 split on the `hotel` label with a fixed seed.
# Only the frame itself is split: the label column is already part of `dataset`,
# so passing it as a second array just produced two throw-away outputs that were
# bound to `_`, shadowing IPython's last-output variable.
train, test = train_test_split(
    dataset, test_size=0.2, random_state=200, stratify=dataset['hotel']
)

In [145]:
# Write the training split to disk (row index is not written).
train.to_csv(path_or_buf='hotels-training-set.csv', index=False)

In [146]:
# Write the held-out split to disk; note it is stored under a "validation" file name.
test.to_csv(path_or_buf='hotels-validation-set.csv', index=False)