In [4]:
from twikit import Client, TooManyRequests
from account_info import USERNAME, EMAIL, PASSWORD
import pandas as pd
import json
import urllib.request
import os
import time
import requests



# Build the twikit client and sign in with the credentials imported from account_info.
client = Client('en-US')
client.login(auth_info_1=USERNAME, auth_info_2=EMAIL, password=PASSWORD)

# Write the session cookies to cookies.json; the author's stated purpose is to
# keep pulling data without getting banned.
client.save_cookies('cookies.json')

In [5]:
# Reload into the same client the session cookies that the login cell wrote to cookies.json.
client.load_cookies('cookies.json')

In [6]:
def download_image(URL, save_as):
    '''Fetch the resource at URL and store it at the local path save_as.

    The transfer is delegated to urllib.request.urlretrieve; nothing is returned.'''
    urllib.request.urlretrieve(url=URL, filename=save_as)

def delete_jpg(file):
    '''Remove the file at the given path, or print 'File not found' when nothing exists there.'''
    if not os.path.exists(file):
        print('File not found')
        return
    os.remove(file)

def save_json(file, file_path):
    '''Serialize the object `file` as JSON with a 4-space indent into file_path.

    The target is opened in "w" mode, so existing content is overwritten.'''
    with open(file_path, mode="w") as out_handle:
        json.dump(file, fp=out_handle, indent=4)


def retry_on_rate_limit_error(func, *args, **kwargs):
    '''Call func(*args, **kwargs), retrying when it raises TooManyRequests.

    Up to 5 attempts are made. After a rate-limited attempt that will be
    followed by another one, a notice is printed and execution pauses for
    60 seconds. No pause happens after the final attempt, because nothing
    would run afterwards.

    Returns:
        Whatever func returns on the first attempt that succeeds.

    Raises:
        Exception: "Max tries reached" once every attempt was rate limited;
            the last TooManyRequests is attached as the cause.
        Any other exception raised by func propagates immediately.
    '''
    max_retries = 5
    wait_seconds = 60
    last_error = None
    for attempt in range(1, max_retries + 1):
        try:
            return func(*args, **kwargs)
        except TooManyRequests as error:
            last_error = error
            # Only wait when another attempt is actually going to happen.
            if attempt < max_retries:
                print("Rate limit exceed trying again in 60 sec")
                time.sleep(wait_seconds)
    raise Exception("Max tries reached") from last_error


In [51]:

# Location of the Excel sheet with the keyword names and ids. The same literal is
# the default argument of keywords_to_dict, and the calls visible in this notebook
# rely on that default instead of passing this variable.
file_path = r"C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\src\utils\codes.xlsx"

def keywords_to_dict(file_path=r"C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\src\utils\codes.xlsx", max_keywords=13):
    '''Load the keyword spreadsheet and return two index-keyed dictionaries.

    Args:
        file_path: Excel sheet containing the columns 'keyname' and 'keyname_id'.
        max_keywords: Maximum number of leading rows to keep. Defaults to 13.
            Sheets with fewer rows are returned in full instead of raising
            IndexError. Pass None to keep every row.

    Returns:
        (keyname, keyname_id): two dicts keyed by row position 0..n-1. keyname
        maps the position to the keyword text, keyname_id to its id. Both
        values are converted to str.
    '''
    # reads in excel sheet into pandas data frame
    df = pd.read_excel(file_path)

    # NOTE(review): cell text is used as-is, so an entry such as 'Cocaine ' keeps
    # its trailing space and only matches text containing that space.
    names = df['keyname'].astype(str).tolist()
    ids = df['keyname_id'].astype(str).tolist()

    if max_keywords is not None:
        names = names[:max_keywords]
        ids = ids[:max_keywords]

    keyname = dict(enumerate(names))
    keyname_id = dict(enumerate(ids))

    return keyname, keyname_id

# Build the keyword lookups from the default spreadsheet and show the index -> keyname mapping.
keyname, keyname_id = keywords_to_dict()
print(keyname)

{0: 'amphetamine', 1: 'Cocaine ', 2: 'crack cocaine', 3: 'fentanyl and fentanyl derivatives', 4: 'GHB', 5: 'Heroin', 6: 'Hydrocodone', 7: 'Klonopin', 8: 'LSDtrip', 9: 'Marijuana', 10: 'Marijuana Concentrates', 11: 'MDMA', 12: 'Mescaline'}


In [27]:
# Output files for the collected structures, in this fixed order:
# 0 user, 1 post, 2 comment, 3 picture, 4 keyword, 5 relation.
save_path_list = [
    r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\data_structures\user_structure.json',
    r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\data_structures\post_structure.json',
    r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\data_structures\comment_structure.json',
    r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\data_structures\picture_strucutre.json',
    r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\data_structures\keyword_structure.json',
    r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\data_structures\relation_structure.json',
]



In [54]:
def keyword_search_in_users(keyname_to_search, list_of_users, list_to_append, max_users=30):
    """Collect the ids of users whose profile text mentions a keyword.

    Each user's name, screen name and description are checked for
    keyname_to_search (case-sensitive substring match). Tweets are not
    inspected. Matching users are recorded as 'user_<id>'.

    Args:
        keyname_to_search: Keyword text to look for.
        list_of_users: Iterable of objects exposing id, name, screen_name
            and description.
        list_to_append: List that receives the matches. It is modified in
            place; ids already present are not added again.
        max_users: Collection stops once list_to_append holds this many
            entries (default 30).

    Returns:
        The same list_to_append object.
    """
    for user in list_of_users:
        # '>=' so a list that is already over the cap still stops the scan.
        if len(list_to_append) >= max_users:
            break

        # str() keeps this consistent with the other 'user_' ids built in this
        # notebook and avoids a TypeError when the id is not a string.
        user_key = 'user_' + str(user.id)
        if user_key in list_to_append:
            continue

        profile_fields = (user.name, user.screen_name, user.description)
        # A missing (None) field is treated as empty text.
        if any(keyname_to_search in (field or '') for field in profile_fields):
            list_to_append.append(user_key)

    return list_to_append

def keyword_search_in_tweets(tweet, keyname=None, keyname_id=None):
    """Return the ids of every keyword that occurs in a tweet's full text.

    Args:
        tweet: Object exposing full_text.
        keyname: Optional dict mapping an index to keyword text.
        keyname_id: Optional dict mapping the same index to the keyword id.
            When either dict is omitted, both are loaded through
            keywords_to_dict(). Passing them in avoids re-reading the
            spreadsheet for every tweet.

    Returns:
        List of keyword ids (values of keyname_id) whose keyword is a
        case-sensitive substring of the tweet text; empty when none match
        or the tweet has no text.
    """
    if keyname is None or keyname_id is None:
        keyname, keyname_id = keywords_to_dict()

    # A missing (None) text is treated as empty.
    tweet_text = tweet.full_text or ''

    return [
        keyname_id[index]
        for index, keyword in keyname.items()
        if keyword in tweet_text
    ]


def extract_tweet_picture_structure(user, tweet, pic_id_starter):
    """Build the post record, and the picture record if any, for one tweet.

    Args:
        user: Author of the tweet; only user.id is read.
        tweet: Tweet object. Reads id, full_text, has_card and thumbnail_url,
            and calls tweet.get_favoriters(20).
        pic_id_starter: Next free numeric picture id.

    Returns:
        (Post_Structure, Picture_Structure, pic_id_starter)
        - Post_Structure: dict with user_id, post_id, user_comment, pic_id,
          liked_users, comments and keywords. pic_id is the id of the picture
          record created in this call, or None when the tweet has no card.
          The original always stored the literal "pic_id1", which matched no
          picture record.
        - Picture_Structure: dict with pic_id, post_id and url when
          tweet.has_card is truthy, otherwise an empty dict.
        - pic_id_starter: incremented by one when a picture record was
          created, unchanged otherwise.
    """
    # retrieve keywords found in tweet
    keyname_found = keyword_search_in_tweets(tweet)

    post_id = 'tweet_' + str(tweet.id)

    if tweet.has_card:
        # The thumbnail download is currently disabled; only its URL is recorded.
        picture_id = 'pic_' + str(pic_id_starter)
        Picture_Structure = {
            "pic_id": picture_id,
            "post_id": post_id,
            "url": tweet.thumbnail_url,
        }
        pic_id_starter += 1
    else:
        picture_id = None
        Picture_Structure = {}

    # construct post structure
    Post_Structure = {
        "user_id": 'user_' + str(user.id),
        "post_id": post_id,
        "user_comment": tweet.full_text,
        "pic_id": picture_id,
        "liked_users": ['user_' + str(favoriter.id) for favoriter in tweet.get_favoriters(20)],
        "comments": '',
        "keywords": keyname_found,
    }

    return Post_Structure, Picture_Structure, pic_id_starter



In [56]:
# keyname maps index -> keyword text, keyname_id maps index -> keyword id.
keyname, keyname_id = keywords_to_dict()

# The first keyword of the sheet drives the account search.
search_key = keyname[0]

user_keyword = retry_on_rate_limit_error(client.search_user, search_key, 10)

# all data structures
users = {}
posts = {}
comments = {}
pictures = {}
keywords = {}
relations = {}

# list that contains all data structures (the dicts are filled in place below)
data_structure_for_twitter = [users, posts, comments, pictures, keywords, relations]

pic_id_num_starter = 20000000000
picture_id_dict = {}

# directory that receives the downloaded profile pictures
image_directoary = r'C:\Users\16822\Research Project SER\SEP-NHANES\lmolina3\data\data_for_twitter\profile_images'

for user in user_keyword:

    try:
        user_id_name = 'user_' + str(user.id)

        # save the user profile pic
        image_file_name = 'user_profile_pic_' + str(user.id) + '.jpg'
        image_save_path = os.path.join(image_directoary, image_file_name)
        download_image(user.profile_image_url, image_save_path)

        # get the users tweets
        tweets = user.get_tweets('tweets')

        # Followers / following of this user. These twikit methods take a result
        # count, not a user id (the original passed user.id there), so the
        # library default count is used.
        followers = user.get_followers()
        following = user.get_following()

        # Per-user results. A separate name keeps the module-level `keywords`
        # dict intact.
        user_keywords = []
        followers_list = []
        following_list = []

        # Look for every keyword in the user's name, screen name, description and
        # tweets. The loop variable must not be called `keyname`: that would
        # replace the lookup dict with a string.
        for key_index, keyword in keyname.items():
            in_profile = (
                keyword in user.name
                or keyword in user.screen_name
                or keyword in user.description
            )
            # any() records a keyword once even when several tweets contain it.
            if in_profile or any(keyword in tweet.text for tweet in tweets):
                user_keywords.append(keyname_id[key_index])

            # followers / followees whose profile mentions the keyword
            followers_list = keyword_search_in_users(keyword, followers, followers_list)
            following_list = keyword_search_in_users(keyword, following, following_list)

        # Structure to store the data
        User_Structure = {
            "username": user.screen_name,
            "user_id": user_id_name,
            "followers": list(followers_list),
            "followees": list(following_list),
            "profile_pic": image_save_path,
            "profile_text": user.description,
            "posts": ['tweet_' + str(tweet.id) for tweet in tweets],
            "keywords": user_keywords
        }

        # go through each tweet in users tweet
        for tweet in tweets:
            tweet_id_name = 'tweet_' + str(tweet.id)

            # only build a post structure for tweets that are not stored yet
            if tweet_id_name in posts:
                continue

            post_structure, picture_structure, pic_id_num_starter = extract_tweet_picture_structure(user, tweet, pic_id_num_starter)
            posts[tweet_id_name] = post_structure

            # Tweets without a card yield an empty picture structure; storing it
            # would leave empty entries in `pictures`.
            if picture_structure:
                pictures[picture_structure["pic_id"]] = picture_structure

        # save the user to the dictionary
        if user_id_name not in users:
            users[user_id_name] = User_Structure

        # Write progress after every user so an interruption keeps earlier results.
        save_json(users, save_path_list[0])
        save_json(posts, save_path_list[1])
        save_json(pictures, save_path_list[3])

    except TooManyRequests:
        print('Rate limit exceeced')
        break




In [None]:
# Show everything collected for the searched users.
print(users)

# TODO: Create the rest of the structures (comments, keywords and relations are still empty).
# TODO: Create all of the structures inside the same for loop and function, then run that function every 15 minutes.