In [3]:

import urlexpander
import tweepy
import requests, json
import re
import configure  as cf
import helper.Utils as Utils


def expand_url_using_requests(url):
    """Resolve ``url`` to its final destination by following HTTP redirects.

    A HEAD request is issued with redirects enabled and a 10 second timeout.

    Args:
        url: The (possibly shortened) URL to resolve.

    Returns:
        The final URL reported by the response, or an empty string if the
        request fails for any reason (best-effort: errors are not raised).
    """
    try:
        # The context manager closes the session, and with it the pooled
        # connections, even when the request raises.
        with requests.Session() as session:
            resp = session.head(url, allow_redirects=True, timeout=10)
            return resp.url
    except Exception:
        # Deliberately best-effort, but no longer swallows KeyboardInterrupt
        # or SystemExit the way a bare ``except:`` did.
        return ""

def expand_url(shortened_url):
    """Expand a shortened URL, falling back to a plain HTTP HEAD request.

    ``urlexpander.expand`` is tried first. If its result contains one of the
    error markers below, ``expand_url_using_requests`` is tried instead.

    Args:
        shortened_url: The URL to expand.

    Returns:
        The expanded URL, or an empty string when it could not be expanded.
        A message is printed in that case; no exception is raised.
    """
    # Error markers looked for in urlexpander's return value, each paired
    # with the message to print if the fallback cannot recover either.
    error_markers = (
        ("__CLIENT_ERROR__", "Client error while expanding url: "),
        ("__CONNECTIONPOOL_ERROR__", "CONNECTION POOL error while expanding url: "),
    )
    try:
        expanded_url = urlexpander.expand(shortened_url)

        for marker, message in error_markers:
            if marker in expanded_url:
                # Second attempt with the requests-based resolver.
                expanded_url = expand_url_using_requests(shortened_url)
                # The fallback signals failure with "", never with a marker,
                # so an empty result must be reported here as well.
                if not expanded_url or marker in expanded_url:
                    print(message, shortened_url)
                    expanded_url = ""
    except Exception:
        print("Cannot expand this url ", shortened_url)
        expanded_url = ""
    return expanded_url

def get_webpage_title(expanded_url):
    """Fetch the title of the web page at ``expanded_url``.

    Args:
        expanded_url: Fully expanded URL of the page.

    Returns:
        The page title as reported by ``urlexpander``, or an empty string
        when the title looks like a "not found" page or the lookup fails.
    """
    # Lower-case markers; the title is lower-cased before comparison so both
    # checks are case-insensitive ("Page 404" used to slip through).
    not_found_markers = ("page not found", "page 404")
    try:
        meta = urlexpander.html_utils.get_webpage_meta(expanded_url)
        title = meta["title"]

        lowered_title = title.lower()
        if any(marker in lowered_title for marker in not_found_markers):
            print("Page not found for this url: ", expanded_url)
            title = ""
    except Exception:
        # Best-effort: a missing or unusable title is treated as "no title".
        title = ""
    return title


def get_api():
    """Build an authenticated Tweepy API client.

    Credentials are read from the environment variables
    ``TWITTER_CONSUMER_KEY``, ``TWITTER_CONSUMER_SECRET``,
    ``TWITTER_ACCESS_TOKEN`` and ``TWITTER_ACCESS_TOKEN_SECRET``. When a
    variable is not set, the literal below is used, so existing setups that
    edit the literals keep working. Prefer the environment variables so
    secrets never end up in a saved notebook.

    Returns:
        A ``tweepy.API`` object using OAuth 1 user authentication.
    """
    import os  # local import: keeps this cell self-contained

    consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "XXXXXXXXX")
    consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "XXXXXXXXX")
    access_token = os.environ.get("TWITTER_ACCESS_TOKEN", "XXXXXXXXX-XXXXXXXXX")
    access_token_secret = os.environ.get("TWITTER_ACCESS_TOKEN_SECRET", "XXXXXXXXX")

    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_token_secret)

    # perform authentication and get api object
    return tweepy.API(auth)

def get_username_out_of_handle(api, user_handle):
    """Translate a Twitter handle (e.g. ``"@nasa"``) into its display name.

    Args:
        api: Object exposing ``get_user(screen_name=...)`` whose result has a
            ``name`` attribute (a ``tweepy.API`` instance in this notebook).
        user_handle: The handle as found in the tweet; ``@`` and spaces are
            stripped and the handle is lower-cased before the lookup.

    Returns:
        The account's display name, or an empty string when the handle is
        empty or the lookup fails (a message is printed on failure).
    """
    DONALD_TRUMP_HANDLE = "realdonaldtrump"
    DONALD_TRUMP_USERNAME = "Donald Trump"
    try:
        # clean the handle
        user_handle = user_handle.lower().replace('@', '').replace(' ', '')

        # A bare "@" leaves nothing to look up; skip the pointless API call.
        if not user_handle:
            return ""

        # Answered locally, without calling the API.
        if user_handle == DONALD_TRUMP_HANDLE:
            return DONALD_TRUMP_USERNAME

        # The keyword form works on Tweepy 3.x and 4.x; the positional form
        # is rejected by 4.x.
        user_name = api.get_user(screen_name=user_handle).name
    except Exception:
        user_name = ""
        print("Error in getting user name for this handle : ", user_handle)

    return user_name




In [4]:
def get_image_best_guest_from_url(image_url, server_url="http://localhost:5000/search", timeout=60):
    """Ask the reverse-image-search server for its best guess about an image.

    A JSON body ``{"image_url": ...}`` is POSTed to ``server_url`` and the
    ``best_guess`` field of the JSON response is returned.

    Args:
        image_url: URL of the image to look up.
        server_url: Search endpoint; defaults to the local server used so far.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, an unresponsive server would block the run forever.

    Returns:
        The best-guess string, or an empty string when the guess is one of
        the uninformative words ``"language"`` / ``"event"`` or the request
        fails (a message is printed on failure).
    """
    try:
        data = {"image_url": image_url}
        headers = {'Content-type': 'application/json'}
        response = requests.post(server_url, headers=headers, data=json.dumps(data), timeout=timeout)
        best_guess = response.json()['best_guess']
        # if the best guess is only one of these words, then no need to add them to the query
        if best_guess in ("language", "event"):
            best_guess = ""
    except Exception:
        print("Error: cannot get the best guess for this image url: ", image_url)
        best_guess = ""

    return best_guess

def get_image_best_guess_from_tweet_id(api, tweet_id):
    """Collect reverse-image-search guesses for every media item of a tweet.

    Args:
        api: Object exposing ``get_status(tweet_id, tweet_mode=...)`` (a
            ``tweepy.API`` instance in this notebook).
        tweet_id: Id of the tweet to inspect.

    Returns:
        The best guesses of all media items, each followed by a space and
        concatenated in order. An empty string is returned when the tweet
        cannot be fetched or has no media.
    """
    try:
        tweet = api.get_status(tweet_id, tweet_mode="extended")
    except Exception:
        print("Error: No data available for specified ID ", tweet_id)
        return ""

    if 'media' not in tweet.entities:
        return ""

    # Prefer extended_entities, as before, but fall back to entities when it
    # is missing or has no 'media' key; reading it unconditionally used to
    # raise outside any try block.
    extended_entities = getattr(tweet, "extended_entities", None)
    if extended_entities is not None and 'media' in extended_entities:
        media_items = extended_entities['media']
    else:
        media_items = tweet.entities['media']

    image_best_guess = ""
    for media in media_items:
        try:
            image_url = media['media_url']
            image_best_guess = image_best_guess + get_image_best_guest_from_url(image_url) + " "
        except Exception:
            print("Error: Unable to extract best guess for this tweet : ", tweet_id)

    return image_best_guess

In [5]:

def reformat_urls(tweet):
    """Separate consecutive URLs with spaces, and add https prior to pic.twitter.

    Args:
        tweet: Raw tweet text.

    Returns:
        The text with a single space inserted before every ``http``/``https``
        occurrence and with ``https://`` prepended to bare ``pic.twitter``
        links. Links that already carry a scheme are left intact; chained
        ``str.replace`` calls used to turn ``https://pic.twitter...`` into
        ``https:// https://pic.twitter...``.
    """
    # Give bare picture links a scheme first; skip ones that already have it.
    tweet = re.sub(r"(?<!://)pic\.twitter", "https://pic.twitter", tweet)
    # One pass over http/https, so "https" is not spaced twice.
    tweet = re.sub(r"https?", r" \g<0>", tweet)
    return tweet

def remove_handle_from_second_part(tweet):
    """Blank out ``@handles`` in the segment that follows the first em dash.

    The text is split on ``—``; only the segment at index 1 (between the
    first em dash and the next one, if any) is modified. Each handle there is
    replaced by a single space.

    Args:
        tweet: Tweet text.

    Returns:
        The tweet with handles removed from that segment. The input is
        returned unchanged when it contains no em dash or cannot be processed
        (a message is printed in the latter case).
    """
    try:
        if '—' in tweet:
            parts = tweet.split('—')
            # Rewrite the segment in place and re-join. A global
            # tweet.replace(second_part, ...) also altered identical text
            # occurring in the first part.
            parts[1] = re.sub(r"@[\w]*", " ", parts[1])  # remove handles
            tweet = '—'.join(parts)
    except Exception:
        print("Cannot remove handle from second part for this tweet: ", tweet)
    return tweet


def get_tweet_id(tweet_url):
    """Pull the status id out of a tweet URL.

    The id is whatever follows the last ``status/`` in the URL, cut at the
    first ``?`` and then at the first ``/``. If the URL cannot be processed,
    a message is printed and an empty string is returned.
    """
    try:
        after_status = tweet_url.rpartition('status/')[2]   # text after the last 'status/'
        without_query = after_status.partition('?')[0]      # drop any query string
        tweet_id = without_query.partition('/')[0]          # keep the first path segment
    except:
        print("Error: cannot get the id out of this url: ", tweet_url)
        tweet_id = ""
    return tweet_id

def get_information_out_of_URLs(tweet):
    """Replace handles and URLs in a tweet with human-readable information.

    Steps:
      1. Normalise URLs and strip handles from the segment after the first
         em dash.
      2. Replace every remaining ``@handle`` with the account's display name.
      3. Replace every URL: links that expand to a twitter.com address are
         replaced by the reverse-image-search guess for the linked tweet's
         media; all other links by the title of the target web page.

    Args:
        tweet: Raw tweet text.

    Returns:
        A tuple ``(tweet, tweet_image_shortened_urls, tweet_image_best_guesses)``:
        the rewritten text, the URLs that expanded to a twitter.com address,
        and the best guess obtained for each of them (same order).
    """
    api = get_api()
    TWITTER_URL = "https://twitter.com"
    tweet_image_shortened_urls = []
    tweet_image_best_guesses = []

    # 1. Format URLs in a readable way and remove handles from the second part.
    #    The user name for the handle in the second part already exists there.
    tweet = reformat_urls(tweet)
    tweet = remove_handle_from_second_part(tweet)

    # 2. Replace handles with their names.
    #    Each match is substituted individually. A global str.replace per
    #    handle corrupted handles sharing a prefix ("@ab" inside "@abc"), and
    #    a bare "@" stripped the "@" from every other handle.
    resolved_names = {}  # handle text -> display name, one lookup per handle

    def _handle_to_name(match):
        user_handle = match.group()
        if user_handle not in resolved_names:
            resolved_names[user_handle] = get_username_out_of_handle(api, user_handle)
        return resolved_names[user_handle]

    tweet = re.sub(r"@[\w]*", _handle_to_name, tweet)

    # 3. Replace twitter pictures with their best guess, and webpage URLs with their title.
    url_pattern = re.compile(r"http\S+")  # find every url
    # finditer scans the text as it is at this point; rebinding `tweet`
    # inside the loop does not affect the iteration.
    for match in url_pattern.finditer(tweet):
        shortened_url = match.group()
        expanded_url = expand_url(shortened_url)

        if TWITTER_URL in expanded_url:  # then it is a tweet URL, we need to get its id
            print("Getting image info for this url : ", expanded_url)
            tweet_id = get_tweet_id(expanded_url)
            # get the image url and do reverse image search
            image_best_guess = get_image_best_guess_from_tweet_id(api, tweet_id)

            tweet_image_shortened_urls.append(shortened_url)
            tweet_image_best_guesses.append(image_best_guess)

            if image_best_guess != "":
                image_best_guess = ", " + image_best_guess + ","

            tweet = tweet.replace(shortened_url, image_best_guess)

        else:  # it is a URL to a webpage: replace the URL with the page title
            webpage_title = get_webpage_title(expanded_url)
            webpage_title = re.sub(r'\W*$', '', webpage_title)  # remove punctuation from the tail

            # if the webpage title is already part of the tweet, there is no need to add it
            if webpage_title in tweet:
                webpage_title = ""
            tweet = tweet.replace(shortened_url, webpage_title)

    return tweet, tweet_image_shortened_urls, tweet_image_best_guesses


def clean_queries_from_urls(query_path, output_save_path):
    """Clean every tweet of a query file and save the result as an Excel file.

    The file is loaded with ``Utils.read_file``. Each row's text (column
    ``TWEET_TEXT_COLUMN``) is passed through ``get_information_out_of_URLs``
    and three columns are added: ``cleaned``, ``shortened_tweet_urls`` and
    ``best_guesses``.

    Args:
        query_path: Path of the query file to read.
        output_save_path: Path of the Excel file to write.

    Returns:
        The loaded table with the three added columns. Previously nothing was
        returned, although callers assign the result to ``df_query``.
    """
    print("---------------------------------------- Extracting information from tweet URLs using the query path: ", query_path)
    print(" ---------------------------------------------------------------------------------------------------------")
    # presumably a pandas DataFrame (iterrows / to_excel are used below)
    df_query = Utils.read_file(query_path)
    cleaned_tweets = []
    all_tweet_image_shortened_urls = []
    all_tweet_image_best_guesses = []

    # `num` is the running position in the file, independent of the index.
    for num, (_, row) in enumerate(df_query.iterrows()):
        print("Processing tweet number ", num, " with id ", row[TWEET_ID_COLUMN])
        tweet, tweet_image_shortened_urls, tweet_image_best_guesses = get_information_out_of_URLs(row[TWEET_TEXT_COLUMN])
        cleaned_tweets.append(tweet)
        all_tweet_image_shortened_urls.append(tweet_image_shortened_urls)
        all_tweet_image_best_guesses.append(tweet_image_best_guesses)

    df_query["cleaned"] = cleaned_tweets
    df_query["shortened_tweet_urls"] = all_tweet_image_shortened_urls
    df_query["best_guesses"] = all_tweet_image_best_guesses
    df_query.to_excel(output_save_path, index=False)
    return df_query



In [6]:
# Column names of the query files, taken from the project configuration.
# clean_queries_from_urls reads these as module-level globals, so this cell
# must be executed before that function is called.
TWEET_ID_COLUMN = cf.TWEET_ID
TWEET_TEXT_COLUMN = cf.TWEET_TEXT

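Sanity check: verify that the MRISA reverse-image-search server (expected at `http://localhost:5000/search`) is running and returns a best guess for a known tweet.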

In [14]:

# Smoke test: fetch one known tweet and print the best guess returned for its
# media. Requires Twitter credentials and the image-search server to be up.
tweet_id = "1214905880732459009" # us military draft email 
api = get_api()
print(get_image_best_guess_from_tweet_id(api, tweet_id))


us military draft email 


### Arabic, CLEF 2022: Cleaning the train, dev, and test sets by replacing URLs with useful information

In [6]:

# Paths of the Arabic CLEF 2022 query files, taken from the configuration.
train_query_path = cf.AR_2022_TRAIN_QUERY
dev_query_path = cf.AR_2022_DEV_QUERY
test_query_path = cf.AR_2022_TEST_QUERY

# Only the test split is processed in this run; the train/dev calls are disabled.
# NOTE(review): the disabled calls reference `gb`, which is not imported in this
# notebook (the active call uses `cf`) -- fix the module name before re-enabling.
# df_query = clean_queries_from_urls(train_query_path, output_save_path=gb.AR_2022_CLEANED_TRAIN_QURRIES)
# df_query = clean_queries_from_urls(dev_query_path, output_save_path=gb.AR_2022_CLEANED_DEV_QURRIES)
df_query = clean_queries_from_urls(test_query_path, output_save_path=cf.AR_2022_CLEANED_TEST_QURRIES)

---------------------------------------- Extracting information from tweet URLs using the query path:  ./data/CLEF_2022/Arabic/CT2022-Task2A-AR-Test_Queries.tsv
 ---------------------------------------------------------------------------------------------------------
Processing tweet number  0  with id  tweet-ar-22-test-0
Processing tweet number  1  with id  tweet-ar-22-test-1
Processing tweet number  2  with id  tweet-ar-22-test-2
Processing tweet number  3  with id  tweet-ar-22-test-3
Processing tweet number  4  with id  tweet-ar-22-test-4
Processing tweet number  5  with id  tweet-ar-22-test-5
Processing tweet number  6  with id  tweet-ar-22-test-6
Processing tweet number  7  with id  tweet-ar-22-test-7
Processing tweet number  8  with id  tweet-ar-22-test-8
Processing tweet number  9  with id  tweet-ar-22-test-9
Processing tweet number  10  with id  tweet-ar-22-test-10
Processing tweet number  11  with id  tweet-ar-22-test-11
Processing tweet number  12  with id  tweet-ar-22-test-1

## -----------------------------------------------------------

### English, CLEF 2022: Cleaning the train, dev, and test sets by replacing URLs with useful information

In [7]:

# Paths of the English CLEF 2022 query files, taken from the configuration.
train_query_path = cf.EN_2022_TRAIN_QUERY
dev_query_path = cf.EN_2022_DEV_QUERY
test_query_path = cf.EN_2022_TEST_QUERY

# Only the test split is processed in this run; the train/dev calls are disabled.
# NOTE(review): the disabled calls reference `gb`, which is not imported in this
# notebook (the active call uses `cf`) -- fix the module name before re-enabling.
# df_query = clean_queries_from_urls(train_query_path, output_save_path=gb.EN_2022_CLEANED_TRAIN_QURRIES)
# df_query = clean_queries_from_urls(dev_query_path, output_save_path=gb.EN_2022_CLEANED_DEV_QURRIES)
df_query = clean_queries_from_urls(test_query_path, output_save_path=cf.EN_2022_CLEANED_TEST_QURRIES)

---------------------------------------- Extracting information from tweet URLs using the query path:  ./data/CLEF_2022/English/CT2022-Task2A-EN-Test_Queries.tsv
 ---------------------------------------------------------------------------------------------------------
Processing tweet number  0  with id  tweet-sno-1198
Getting image info for this url :  https://twitter.com/timkmak/status/1500822536346603531/photo/1
Processing tweet number  1  with id  tweet-sno-1199
Processing tweet number  2  with id  tweet-sno-1200
Getting image info for this url :  https://twitter.com/FedorovMykhailo/status/1498392515262746630/photo/1
Processing tweet number  3  with id  tweet-sno-1201
Getting image info for this url :  https://twitter.com/FedorovMykhailo/status/1501648932824301570/photo/1
Processing tweet number  4  with id  tweet-sno-1202
Processing tweet number  5  with id  tweet-sno-1203
Getting image info for this url :  https://twitter.com/travisakers/status/1499041366696992772/video/1
Process