# <div style="color:#fff;display:fill;border-radius:10px;background-color:#3c968b;text-align:left;letter-spacing:0.1px;overflow:hidden;padding:20px;color:white;overflow:hidden;margin:0;font-size:100%"> Google tweets - data collection</div>



In [1]:
!pip install tweepy --quiet

[0m

In [2]:
import os
import numpy as np
import tweepy as tw
import pandas as pd
import json
import datetime as dt
from datetime import datetime, timedelta
#import wandb
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import shutil

In [6]:
from kaggle_secrets import UserSecretsClient
# Client used to read the secrets attached to this notebook.
user_secrets = UserSecretsClient()

# Twitter API bearer token, read from the secret store rather than hard-coded in the notebook.
BEARER_TOKEN = user_secrets.get_secret("BEARER_TOKEN")

In [7]:
# Tweepy client authenticated with the bearer token only.
client = tw.Client(bearer_token=BEARER_TOKEN)

# Default search query used by search_tweets:
# -is:retweet means I don't want retweets
# lang:en is asking for the tweets to be in English
query = '("google" OR "@google" OR "Google") -search -is:retweet lang:en'

def search_tweets(query=query, start_time=None, end_time=None, max_results=100):
    """Search recent tweets through the module-level ``client``.

    Args:
        query: Search query string; defaults to the module-level ``query``
            as it was when this function was defined.
        start_time: Lower bound of the search window, forwarded as-is
            (``None`` leaves it unset).
        end_time: Upper bound of the search window, forwarded as-is
            (``None`` leaves it unset).
        max_results: Maximum number of tweets requested in the call.

    Returns:
        Whatever ``client.search_recent_tweets`` returns; the rest of the
        notebook reads its ``data`` and ``includes["users"]`` members.
    """
    # Tweet attributes requested in addition to the API defaults.
    requested_tweet_fields = [
        "context_annotations",
        "created_at",
        "author_id",
        "public_metrics",
        "referenced_tweets",
        "possibly_sensitive",
        "source",
        "lang",
        "entities",
    ]
    # Author attributes requested for the users expanded via "author_id".
    requested_user_fields = [
        "id",
        "name",
        "username",
        "created_at",
        "description",
        "location",
        "public_metrics",
        "verified",
        "profile_image_url",
    ]

    return client.search_recent_tweets(
        query=query,
        start_time=start_time,
        end_time=end_time,
        tweet_fields=requested_tweet_fields,
        user_fields=requested_user_fields,
        expansions="author_id",
        max_results=max_results,
    )


# Smoke test: fetch a handful of tweets with the default query.
tweets = search_tweets(max_results=10)


# What context_annotations are: 
# https://developer.twitter.com/en/docs/twitter-api/annotations/overview
# (not using them for now)

# tweets.data is None when the search matched nothing, so fall back to an empty
# list instead of failing on the slice.
for tweet in (tweets.data or [])[:1]:
    print(tweet.text)
    if tweet.context_annotations:
        print(tweet.context_annotations)

TwitterServerError: 503 Service Unavailable
Service Unavailable

In [None]:
# Print a few attributes of the first expanded author, one per line.
first_user = tweets.includes["users"][0]
for field_name in (
    "id",
    "created_at",
    "username",
    "name",
    "location",
    "verified",
    "public_metrics",
    "profile_image_url",
):
    print(getattr(first_user, field_name))

In [None]:
# Public metrics of the first returned tweet (the pipeline below reads its retweet/like/reply/impression counts).
tweets.data[0].public_metrics

In [None]:
# Entities of the first returned tweet (the pipeline below reads its hashtags, mentions and urls).
tweets.data[0].entities

In [None]:
# Midnight (00:00:00) of the current day as a naive datetime; anchor of the collection window.
dt_today = datetime.combine(datetime.today().date(), dt.time.min)
print(f"We will collect up to 100 x 24 tweets \nFrom:\t{dt_today - timedelta(days=1, hours=24)} \nTo:\t{dt_today - timedelta(days=1, hours=0)}")

In [None]:
# Extraction pipeline: walks hour by hour through one full day and requests up to
# 100 tweets per hour. The tweets are filtered and returned as a single DataFrame;
# random sub-sampling is done by the caller.

def build_raw_dataframe(days_ago=1):
    """Collect one day of filtered tweets, one hourly search at a time.

    The 24-hour window ends ``days_ago`` days before the module-level
    ``dt_today`` (with the default it spans from 48 to 24 hours before
    ``dt_today``). For each hour of that window up to 100 tweets are requested
    through ``search_tweets``.

    A tweet is dropped when:
      * it is flagged as possibly sensitive,
      * its author is missing from the expanded users of the response,
      * its author's profile image URL ends with 'default_profile_normal.png',
      * it has more than 5 hashtags, more than 5 mentions or more than 3 URLs.

    Args:
        days_ago: Number of days between ``dt_today`` and the end of the window.

    Returns:
        pandas.DataFrame with one row per kept tweet (tweet fields, author
        fields and public metrics). The frame is empty when nothing was kept.
    """
    max_hashtags = 5
    max_mentions = 5
    max_urls = 3

    # Rows are accumulated in a list and turned into a DataFrame once at the end;
    # growing a DataFrame row by row is quadratic and DataFrame.append no longer
    # exists in pandas 2.x.
    records = []

    # h counts down from 24 to 1, so the hours are visited in chronological order.
    for h in range(24, 0, -1):
        start_time = dt_today - timedelta(days=days_ago, hours=h)
        end_time = dt_today - timedelta(days=days_ago, hours=h - 1)

        hour_tweets = search_tweets(start_time=start_time, end_time=end_time, max_results=100)
        curr_dt = datetime.now()

        # No tweets for this hour: data is None (or empty), nothing to do.
        if not hour_tweets.data:
            continue

        # The expanded users are not aligned with the tweets (an author appears
        # once no matter how many tweets they wrote), so look authors up by id
        # instead of pairing the two lists positionally.
        expanded_users = (hour_tweets.includes or {}).get("users") or []
        users_by_id = {user.id: user for user in expanded_users}

        for tweet in hour_tweets.data:
            user = users_by_id.get(tweet.author_id)
            if user is None:
                continue

            # Keeping away from sensitive content :)
            if tweet.possibly_sensitive:
                continue

            # Exclude users with default profile images
            # (the URL may be missing, hence the fallback to an empty string).
            if (user.profile_image_url or "").endswith('default_profile_normal.png'):
                continue

            # Exclude tweets stuffed with hashtags, mentions or links.
            if tweet.entities:
                hashtags = tweet.entities.get('hashtags') or []
                mentions = tweet.entities.get('mentions') or []
                urls = tweet.entities.get('urls') or []

                if (len(hashtags) > max_hashtags or len(mentions) > max_mentions or len(urls) > max_urls):
                    continue

            records.append({"tweet_id": tweet.id,
                            "tweet_created": tweet.created_at,
                            "tweet_extracted": curr_dt,
                            "text": tweet.text,
                            "lang": tweet.lang,
                            "user_id": user.id,
                            "user_name": user.name,
                            "user_username": user.username,
                            "user_location": user.location,
                            "user_description": user.description,
                            "user_created": user.created_at,
                            "user_followers_count": user.public_metrics["followers_count"],
                            "user_following_count": user.public_metrics["following_count"],
                            "user_tweet_count": user.public_metrics["tweet_count"],
                            "user_verified": user.verified,
                            "source": tweet.source,
                            "retweet_count": tweet.public_metrics["retweet_count"],
                            "like_count": tweet.public_metrics["like_count"],
                            "reply_count": tweet.public_metrics["reply_count"],
                            "impression_count": tweet.public_metrics["impression_count"],
                            })

    return pd.DataFrame(records)

In [None]:
# Collect the day's tweets, then keep a random subset of at most 200 rows.
last_day_df = build_raw_dataframe()
n_sample = min(200, len(last_day_df))
last_day_df = last_day_df.sample(n=n_sample)

print(last_day_df.shape)
last_day_df.head()

In [None]:
# Save the sampled tweets to a CSV in the current working directory (the DataFrame index is written too).
last_day_df.to_csv("1000google_daily tweets.csv")

In [None]:
input_file_path = "/kaggle/input/day4-google-data/data_3_days.csv"
output_file_path = "/kaggle/working/daily tweets.csv"

# Tweets gathered on previous days
old_days_df = pd.read_csv(input_file_path)

# Stack the previous tweets with the ones from the last day, then drop every
# row that has no tweet id (sanity check)
all_days_df = (
    pd.concat([old_days_df, last_day_df])
    .dropna(subset=["tweet_id"])
)

# This file is the new version to be uploaded into the dataset
all_days_df.to_csv(output_file_path, index=False)

# Copy the output file to a new location
#shutil.copy2(output_file_path, input_file_path)

In [None]:
# Column dtypes and non-null counts of the combined dataset.
all_days_df.info()

In [None]:
# (rows, columns) of the combined dataset.
all_days_df.shape

In [None]:
# Earliest and latest tweet creation time in the combined dataset, parsed as UTC.
tweet_created_utc = pd.to_datetime(all_days_df.tweet_created, utc=True)
tweet_created_utc.min(), tweet_created_utc.max()