In [1]:
import twitter
import json
import urllib.request
import os
import subprocess
from google.cloud import videointelligence
from google.oauth2 import service_account
from google.protobuf.json_format import MessageToJson

In [2]:
import pymongo
from pymongo import MongoClient
from bson import json_util
# With no arguments MongoClient targets the MongoDB server on
# localhost:27017; this notebook stores its data in the "tweetDB" database.
client = MongoClient()
db = client["tweetDB"]


In [4]:
# Collection handle; note that the collection shares its name with the database.
tweetDB = db["tweetDB"]

In [5]:
class InvalidMediaException(Exception):
    """Raised when a timeline yields no usable .jpg media."""

class InvalidCredentialsException(Exception):
    """Raised when the Twitter API client cannot be built from the stored keys."""


In [6]:
def get_timeline_media_urls(screen_name, count=200, exclude_replies=True):
    """Collect the .jpg media URLs attached to tweets on a user's timeline.

    Credentials are read from ``keys.dat`` in the working directory, which must
    hold four whitespace-separated tokens in this order: consumer key,
    consumer secret, access token key, access token secret.

    Args:
        screen_name (str): Twitter screen name associated with the desired timeline.

    Keyword Arguments (optional):
        count (int): Number of tweets to look through (capped at 200 per api limits). Default: 200.
        exclude_replies (bool): Exclude media found in tweets where the user only replied. Default: True.

    Returns:
        list: All media URLs ending in "jpg" found in the timeline, as strings.

    Raises:
        OSError: If ``keys.dat`` cannot be opened.
        InvalidCredentialsException: If ``keys.dat`` holds fewer than four
            tokens or the API client cannot be constructed.
        InvalidMediaException: If the timeline contains no matching media.
        Exception: Any error raised by the timeline request propagates unchanged.

    """
    with open("keys.dat") as f:
        keys = f.read().split()

    # Catch Exception rather than everything: a bare ``except`` would also
    # trap KeyboardInterrupt/SystemExit. The original error is chained so the
    # real cause (e.g. an IndexError from a short key file) stays visible.
    try:
        api = twitter.Api(consumer_key=keys[0],
                          consumer_secret=keys[1],
                          access_token_key=keys[2],
                          access_token_secret=keys[3])
    except Exception as exc:
        raise InvalidCredentialsException("Invalid twitter credentials") from exc

    timeline = api.GetUserTimeline(screen_name=screen_name, count=count,
                                   trim_user=True, exclude_replies=exclude_replies)

    images = []
    for tweet in timeline:
        # Tweets without attachments have no "media" entry under "entities".
        for media in tweet._json["entities"].get("media", []):
            url = media["media_url"]
            if url.endswith("jpg"):
                images.append(url)

    if not images:
        raise InvalidMediaException("No valid media found for screen_name: " + screen_name)
    return images

In [7]:
def urls_to_movie(images, output="output.mp4"):
    """Generate a local mp4 slideshow from a list of image urls, 1 second per image.

    Each image is downloaded to ``tmp_NNNN.jpg`` in the working directory,
    encoded by ffmpeg into a one-second 1280x720 clip ``tmp_NNNN.mp4``, and the
    clips are joined with ffmpeg's concat demuxer. All temporary files,
    including the ``tmp_files.txt`` concat list, are removed afterwards whether
    or not the run succeeded.

    Args:
        images (list): Urls of the images to include in the movie, as strings.

    Keyword Arguments (optional):
        output (str): Output filename for the video. Default: output.mp4

    Returns:
        str: Filename actually written. If ``output`` already exists, a counter
        is inserted before the extension: ``name(0).ext``, ``name(1).ext``, ...

    Raises:
        subprocess.CalledProcessError: If an ffmpeg invocation exits non-zero.
        OSError: If ffmpeg cannot be started or a file operation fails.
        Exception: Download errors from ``urllib`` propagate unchanged.

    """
    # Split once, on the real extension, so the counter replaces rather than
    # compounds ("output(0)(1).mp4") and dotted names or paths stay intact.
    stem, extension = os.path.splitext(output)
    count = 0
    while os.path.isfile(output):
        output = "{}({}){}".format(stem, count, extension)
        count += 1

    indices = [str(i).zfill(4) for i in range(len(images))]
    frames = ["tmp_{}.jpg".format(index) for index in indices]
    clips = ["tmp_{}.mp4".format(index) for index in indices]
    list_file = "tmp_files.txt"

    try:
        for url, frame in zip(images, frames):
            urllib.request.urlretrieve(url, frame)

        for frame, clip in zip(frames, clips):
            # Argument list (no shell) with check=True: os.system ignored
            # ffmpeg's exit status, so failures went unnoticed until later.
            # -y overwrites a stale temp clip instead of prompting on stdin.
            subprocess.run(
                [
                    "ffmpeg", "-y", "-loop", "1", "-i", frame,
                    "-c:a", "libfdk_aac", "-ar", "44100", "-ac", "2",
                    "-vf",
                    "scale='if(gt(a,16/9),1280,-1)':'if(gt(a,16/9),-1,720)', "
                    "pad=1280:720:(ow-iw)/2:(oh-ih)/2",
                    "-c:v", "libx264", "-b:v", "10M", "-pix_fmt", "yuv420p",
                    "-r", "30", "-shortest",
                    "-avoid_negative_ts", "make_zero", "-fflags", "+genpts",
                    "-t", "1", clip,
                ],
                check=True,
            )

        with open(list_file, "w") as f:
            for clip in clips:
                f.write("file '{}'\n".format(clip))

        subprocess.run(["ffmpeg", "-f", "concat", "-i", list_file, output], check=True)
    finally:
        # Clean up even on failure; some files may not exist yet at that point.
        for path in frames + clips + [list_file]:
            if os.path.exists(path):
                os.remove(path)

    return output


In [8]:
def video_analysis(filename):
    """Run Google Cloud Video Intelligence label detection on a local mp4 file.

    The whole file is read into memory and sent inline with the request; the
    function then waits on the returned operation's ``result()``.

    Args:
        filename (str): Filename of the input .mp4 file

    Returns:
        The object produced by the annotation operation's ``result()`` call.
        It is returned as-is; callers in this notebook convert it with
        ``MessageToJson``.

    """
    # Service-account key file; the name is reproduced exactly as spelled in the original.
    credentials = service_account.Credentials.from_service_account_file('googe.dat')
    annotator = videointelligence.VideoIntelligenceServiceClient(credentials=credentials)

    with open(filename, "rb") as video_file:
        video_bytes = video_file.read()

    operation = annotator.annotate_video(
        input_content=video_bytes,
        features=['LABEL_DETECTION'],
    )
    return operation.result()

In [9]:
def get_twitter_media_analysis(screen_name, count=200, exclude_replies=True, output_name="output.mp4", delete_movie=True):
    """Analyse the images on a user's twitter timeline and store the detected labels.

    Pipeline: collect the timeline's jpg urls, render them into a slideshow
    video, run video label detection on it, then insert one document per
    shot-level label into the ``tweetDB`` collection. Each document has the form::

        {"imageInfo": [{"imgage_idx": 1, "decription": "cat", "confidence": 0.56}]}

    ``imgage_idx`` is the 1-based position of the label in the response, and
    ``confidence`` is taken from the label's first segment. The misspelled
    keys are kept deliberately so new documents match ones already stored.

    Args:
       screen_name (str): Twitter screenname associated with desired timeline

    Keyword Arguments (optional):
       count (int): Number of tweets to look through (capped at 200 per api limits). Default: 200
       exclude_replies (bool): Exclude media found in tweets where specified user only replied. Default: True
       output_name (str): Requested filename for the generated .mp4 file. Default: output.mp4
       delete_movie (bool): Whether to remove the generated video once it has been analysed. Default: True

    Returns:
        dict: The full analysis response, parsed from its JSON representation.

    Raises:
        InvalidCredentialsException, InvalidMediaException: From ``get_timeline_media_urls``.
        Exception: Errors from video generation, analysis or the database propagate unchanged.

    """
    images = get_timeline_media_urls(screen_name, count, exclude_replies)
    output_filename_actual = urls_to_movie(images, output=output_name)
    annotation = video_analysis(output_filename_actual)

    # Delete the file that was really written: urls_to_movie may have picked a
    # different name than output_name to avoid clobbering an existing file.
    if delete_movie and os.path.isfile(output_filename_actual):
        os.remove(output_filename_actual)

    analysis_json = json.loads(MessageToJson(annotation))

    # MessageToJson leaves out empty/default-valued fields, so every level is
    # looked up defensively instead of assuming the key exists.
    annotation_results = analysis_json.get("annotationResults") or [{}]
    shot_labels = annotation_results[0].get("shotLabelAnnotations", [])

    for label_index, shot_label in enumerate(shot_labels, start=1):
        segments = shot_label.get("segments", [])
        # A confidence of exactly 0 is omitted from the JSON; a label with no
        # segments at all has no confidence to report.
        confidence = segments[0].get("confidence", 0.0) if segments else None

        tweetDB.insert_one(
            {
                "imageInfo": [
                    {
                        "imgage_idx": label_index,
                        "decription": shot_label["entity"]["description"],
                        "confidence": confidence,
                    }
                ]
            }
        )

    return analysis_json


In [10]:
from datetime import datetime
# Insert a marker document naming the Twitter account analysed below.
author_document = {"Author": "wallpapermag"}
result = tweetDB.insert_one(author_document)

In [11]:
# Entry point: run the whole pipeline for one account, looking through 30 tweets.
if __name__ == "__main__":
    get_twitter_media_analysis(screen_name="wallpapermag", count=30)

In [12]:
# Print every document in the collection to inspect what has been stored.
cursor = tweetDB.find()
for document in cursor:
    print(document)


{'_id': ObjectId('5ac53247f192bf2f09d0fd77'), 'Author': 'wallpapermag'}
{'_id': ObjectId('5ac53270f192bf2f09d0fd78'), 'imageInfo': [{'imgage_idx': 1, 'decription': 'restaurant', 'confidence': 0.6899509429931641}]}
{'_id': ObjectId('5ac53270f192bf2f09d0fd79'), 'imageInfo': [{'imgage_idx': 2, 'decription': 'metropolitan area', 'confidence': 0.8535047769546509}]}
{'_id': ObjectId('5ac53270f192bf2f09d0fd7a'), 'imageInfo': [{'imgage_idx': 3, 'decription': 'graphic design', 'confidence': 0.48571425676345825}]}
{'_id': ObjectId('5ac53270f192bf2f09d0fd7b'), 'imageInfo': [{'imgage_idx': 4, 'decription': 'building', 'confidence': 0.8090547323226929}]}
{'_id': ObjectId('5ac53270f192bf2f09d0fd7c'), 'imageInfo': [{'imgage_idx': 5, 'decription': 'bicycle handlebar', 'confidence': 0.7625961303710938}]}
{'_id': ObjectId('5ac53270f192bf2f09d0fd7d'), 'imageInfo': [{'imgage_idx': 6, 'decription': 'land vehicle', 'confidence': 0.6654285192489624}]}
{'_id': ObjectId('5ac53270f192bf2f09d0fd7e'), 'imageInfo'

In [13]:
# An empty filter matches every document; the cell's last expression shows
# how many were removed.
remove_all = tweetDB.delete_many(filter={})
remove_all.deleted_count

64

In [14]:
# Drop the tweetDB collection itself, leaving a clean slate for the next run.
tweetDB.drop()