## Import packages

In [None]:
import configparser
import itertools
import re

import pandas as pd
import tweepy

## Define useful functions

In [None]:
# define the functions that handle user filtering and user addition
def users_handler(user_info: tweepy.models.User, filters: dict):
    """
    Decide whether a collected user satisfies the search filters.

    The user is accepted only when all of the following hold:
        1- the bio/description mentions at least one entry of filters['keywords'].
        2- the bio/description mentions no entry of filters['unwanted keywords'].
        3- the followers count is strictly greater than filters['followers_count'].

    Keywords are matched case-insensitively as whole words or phrases, so
    multi-word entries such as 'vice president' and words followed by
    punctuation ('CEO,') are recognised, while 'CEOs' does not match 'CEO'.

    :param: user_info -- object exposing `description` and `followers_count`
                         (e.g. an instance of tweepy.models.User).
    :param: filters -- a dictionary with the keys 'keywords', 'unwanted keywords'
                       and 'followers_count'.

    :return: the same user_info object when the user passes every filter, otherwise -1.
    """
    # a missing description is treated as an empty bio instead of raising on .lower()
    user_bio = (user_info.description or "").lower()

    def bio_mentions(phrases):
        """Return True when any phrase occurs in the bio as a whole word/phrase."""
        for phrase in phrases:
            tokens = phrase.lower().split()
            if not tokens:
                continue  # an empty keyword can never match
            # lookarounds instead of \b so keywords that start or end with a
            # non-word character are still delimited correctly; \s+ lets the
            # words of a phrase be separated by any run of whitespace
            pattern = r"(?<!\w)" + r"\s+".join(re.escape(token) for token in tokens) + r"(?!\w)"
            if re.search(pattern, user_bio):
                return True
        return False

    if not bio_mentions(filters['keywords']):
        return -1
    if bio_mentions(filters['unwanted keywords']):
        return -1
    if user_info.followers_count <= filters['followers_count']:
        return -1
    return user_info

In [None]:
def users_adder(main_user_dict:dict, user_info: tweepy.models.User):
    """
    Append the information of the passed user to the main dictionary, once per username.

    :param: main_user_dict -- the main dictionary that accumulates the collected users
                                {Username:[list of usernames per user], Bio:[list of bio per user],
                                profile URL:[list of profile urls per user], Location:[list of locations per user]
                                Websites:[list of user website urls]}
                              It is modified in place.
    :param: user_info -- object exposing `screen_name`, `description`, `location` and `url`
                         (e.g. an instance of tweepy.models.User).

    :return: the same main_user_dict, with the user appended unless the username was already present.
    """
    user_name = user_info.screen_name

    # check to not include duplicate data
    if user_name in main_user_dict['Username']:
        return main_user_dict

    # description and location may be missing; store an empty string instead
    # of failing on None.lower()
    user_bio = (user_info.description or "").lower()
    user_location = (user_info.location or "").lower()

    # the URL is built from the user passed in, not from any module-level loop variable
    user_url = "https://twitter.com/{}".format(user_name)

    main_user_dict['Username'].append(user_name)
    main_user_dict['Bio'].append(user_bio)
    main_user_dict['profile URL'].append(user_url)
    main_user_dict['Location'].append(user_location)
    main_user_dict['Websites'].append(user_info.url)

    return main_user_dict

## Authentication

Before running the cell below, you need to create a config file named `config.ini` that contains your Twitter API credentials. The file is structured as follows:
```config

[twitter]

api_key = 
api_key_secret = 

access_token= 
access_token_secret= 


```

In [None]:
# read config
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini was not
# loaded; fail here with a clear message instead of a later KeyError: 'twitter'
if not config.read('config.ini'):
    raise FileNotFoundError("config.ini was not found or could not be read in the current working directory")

# consumer (application) credentials
api_key = config['twitter']['api_key']
api_key_secret = config['twitter']['api_key_secret']

# user access credentials
access_token = config['twitter']['access_token']
access_token_secret = config['twitter']['access_token_secret']

# authentication
auth = tweepy.OAuthHandler(api_key, api_key_secret)
auth.set_access_token(access_token, access_token_secret)


In [None]:
# build the API client from the authenticated handler; wait_on_rate_limit=True
# asks tweepy to wait when a rate limit is hit rather than fail
api = tweepy.API(auth ,wait_on_rate_limit=True)

# fetch the authenticated account's home timeline
# NOTE(review): the result is not used in the visible code — presumably a quick check that the credentials work
public_tweets = api.home_timeline()

## Algorithm description:  

The general idea for finding the users of interest is to use `api.search_users`, which is similar to the Find People button on Twitter.com: this API returns the same results as a people search on Twitter.com.

we will take the following steps:
1. create a dictionary that contains:
    - words that we want our user's bio to include
    - min number of followers of each user
    - words that we do not want to include in our search
2. generate a combination of 2 words from the previously created keywords
3. initialize a main dictionary whose keys represent the required info to be collected about the users
4. loop over the created combinations; and for each combination:
    - create the search query
    - for each collected user given a combination:
        - check if the user passes the specified criteria by utilizing `users_handler()`
        - if the user passes, add the collected info to `main_user_dict`
        - break from the loop
5. create a data frame from the generated dictionary and save the file as CSV  

In [None]:
# search filters: wanted bio keywords, banned bio keywords and the follower threshold
filters = {
    'keywords': [
        'CEO', 'vice president', 'president', 'chief', 'founder', 'co funder',
        'CTO', 'Congress Women', 'Congress men', 'senator', 'MP', 'parliament',
        'head', 'senior', 'Activist', 'creator', 'board member', 'Chairman', 'VP',
    ],
    'unwanted keywords': ['sex', 'porn', 'adult', 'PLAYMATE', 'Model'],
    'followers_count': 10000,
}

# every unordered pair of wanted keywords; each pair becomes one search query
desired_words_combinations = list(itertools.combinations(filters['keywords'], 2))

# banned words joined with ' -' so that, once the query template prefixes the
# first one with '-', every banned word is negated
undsired_words = ' -'.join(filters['unwanted keywords'])

# column name -> list of values, one entry per accepted user
main_user_dict = {column: [] for column in ('Username', 'Bio', 'profile URL', 'Location', 'Websites')}

for keyword_pair in desired_words_combinations:
    # build the query for this pair of keywords
    desired_words = ' OR '.join(keyword_pair)
    query = '({}) -{} lang:en'.format(desired_words, undsired_words)
    print("search query is: {} \n".format(query))

    # at most 500 people-search results for this query, requested 20 per page
    found_users = tweepy.Cursor(api.search_users, q=query, count=20, include_entities=True).items(500)

    # each item is a user object returned by the people search
    for tweet in found_users:
        if users_handler(tweet, filters) != -1:
            main_user_dict = users_adder(main_user_dict, tweet)
                

In [None]:
# create a df from the main_user_dict: each key becomes a column and each list holds that column's values
df = pd.DataFrame(main_user_dict)

In [None]:
# explore df (as the last expression of a notebook cell, its value is displayed)
df

In [None]:
# make sure there is no duplicate data: remove rows that repeat an earlier row
# in every column, modifying df in place
df.drop_duplicates(inplace=True)

In [None]:
# write the collected users to "<file_name>.csv", leaving out the DataFrame index column
file_name = "Twitter_Users_Info_Africa"
output_path = file_name + ".csv"
df.to_csv(output_path, index=False)