In [1]:
from collections.abc import Callable # For creating type hint for decorator function
from dotenv import load_dotenv # For getting .env params
import pandas as pd # For storing infromation about friends
import numpy as np # For making operations with numpy arrays
import os # For getting .env params
import functools # For correct showing docstring of decorated function
import requests # For sending requests to API
import time # For making delay between API calls
import json # For formatting response data
import re # For checking date of birth format

In [2]:
# Load the parameters stored in the local .env file so that the
# os.getenv() lookups in APIValues can read them.
load_dotenv()
class APIValues:
    """Namespace holding the API settings that are read from environment variables"""

    # Value of the VK_TOKEN environment variable (None when it is not set);
    # used as the `access_token` request parameter.
    TOKEN = os.environ.get("VK_TOKEN")
    # Value of the API_VERSION environment variable (None when it is not set);
    # used as the `v` request parameter.
    VERSION = os.environ.get("API_VERSION")


In [3]:
def api_response_cacher(func: Callable) -> Callable:
    """
    Decorator for caching api requests responses

    The first call with a given set of arguments is forwarded to ``func``
    and its result is stored. Later calls with the same arguments print a
    short notice and return the stored result without calling ``func``.
    A call that raises is not cached.

    The cache is available as the ``previous_request`` dict of the returned
    wrapper. Calls made with positional arguments only are keyed by the
    ``args`` tuple itself.

    Args:
        func (Callable): function to wrap; its call arguments must be hashable

    Returns:
        Callable: wrapper that carries the metadata (name, docstring) of ``func``

    Raises:
        TypeError: when the wrapper is called with an unhashable argument
    """
    # Sentinel for "not cached yet", so that a stored None still counts as a hit
    missing = object()
    # Separates the positional part of a cache key from the keyword part
    kwargs_mark = object()

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        cache_key = args
        if kwargs:
            # Sorting makes the key independent of the keyword order
            cache_key += (kwargs_mark,) + tuple(sorted(kwargs.items()))

        cached = wrapped.previous_request.get(cache_key, missing)
        if cached is not missing:
            shown = [str(arg) for arg in args]
            shown += ['{}={}'.format(name, value)
                      for name, value in sorted(kwargs.items())]
            # join() works for any number of arguments, including none
            print('Request with args={} was already made'.format(', '.join(shown)))
            return cached

        result = func(*args, **kwargs)
        wrapped.previous_request[cache_key] = result
        return result

    wrapped.previous_request = {}
    return wrapped

### Получение даты рождения друзей пользователя

In [4]:
@api_response_cacher
def get_friends_bdate_dataframe(user_id: str) -> pd.DataFrame:
    """

    Creates dataframe, that contains
    date of birth of every user friend

    Args:
        user_id (str): vk id of a user

    Returns:
        pd.DataFrame with one row per friend and this format:
        __________________________________
        |        id        |   bdate     |
        |------------------|-------------|
        |    friend_id     |  date(str)  |

        `id` is the value returned by the API. `bdate` is the string
        returned by the API, or '0' when the friend has no `bdate` field.
        Both columns are present even when the friend list is empty.

    Raises:
        requests.exceptions.HTTPError: the server answered with an error status
        requests.exceptions.RequestException: other request problems
            (for example a connection failure or a timeout)
        RuntimeError: the response body contains an `error` entry
        Exception: other problems with processing friends data
            (for example KeyError when the response has an unexpected structure)

    """
    request_url = 'https://api.vk.com/method/friends.get'
    params = {
        'user_id': user_id,
        'v': APIValues.VERSION,
        'access_token': APIValues.TOKEN,
        'fields': 'bdate'
    }
    # Without a timeout the call can wait on an unresponsive server forever
    response = requests.get(request_url, params=params, timeout=10)
    # The original exception is propagated as is, which keeps its traceback
    # and the `response` attribute for the caller
    response.raise_for_status()
    payload = response.json()

    # NOTE(review): the API is expected to report failures such as a private
    # profile or a bad token as an `error` object in the body - confirm
    # against the VK API documentation
    api_error = payload.get('error') if isinstance(payload, dict) else None
    if api_error is not None:
        raise RuntimeError('VK API returned an error: {}'.format(api_error))

    # Extracting from response json needed user information
    friend_info = [{'id': i_friend['id'], 'bdate': str(i_friend.get('bdate', 0))}
                   for i_friend in payload['response']['items']]

    # Explicit columns keep the frame schema stable for an empty friend list
    return pd.DataFrame(friend_info, columns=['id', 'bdate'])

In [5]:
# Fetch the friends of the analysed user; the id is passed as a string
friends_df = get_friends_bdate_dataframe('534920918')
# Last expression of the cell: displays the fetched frame
friends_df

Unnamed: 0,id,bdate
0,141706715,5.10
1,153334062,27.3
2,162119115,24.9.2002
3,176039419,16.9.2002
4,187713925,0
5,196316604,15.8.2002
6,199427361,15.12
7,215585373,4.1
8,219352292,0
9,220219923,24.7


### Фильтрация друзей с некорректной датой рождения

In [6]:
def check_function(df: pd.Series,
                   year_range: tuple[int, int],
                   pattern: str) -> bool:
    """
    Subfunction for checking if
    the date of birth is correct

    Args:
        df (pd.Series): one row of the friends dataframe; only its
            'bdate' item is read (the parameter name is kept for compatibility)
        year_range (tuple[int, int]): (min_year, max_year),
            both bounds are accepted as valid years
        pattern (str): regular expression that the whole `bdate` string
            has to match; the last four characters are read as the year

    Returns:
        bool: True when `bdate` is a string that fully matches `pattern`
        and its year lies inside `year_range`, False otherwise
    """
    bdate = df['bdate']
    # A non-string value (for example NaN) can never be a correct date
    if not isinstance(bdate, str) or re.fullmatch(pattern, bdate) is None:
        return False
    # Inclusive comparison: filter_by_bdate documents min_year and max_year
    # as years that are themselves considered valid
    return year_range[0] <= int(bdate[-4:]) <= year_range[1]
    

def filter_by_bdate(friends_df: pd.DataFrame,
                    min_year=1960,
                    max_year=2014) -> pd.DataFrame:
    """

    Keeps only the rows of a friends dataframe
    whose date of birth passes `check_function`

    Args:
        friends_df(pd.DataFrame): DataFrame with
        date of birth information and next format:
        __________________________________
        |        id        |    bdate    |
        |------------------|-------------|
        |  friend_id(str)  |  date(str)  |

        min_year(int) - min birth year that is considered valid
        max_year(int) - max birth year that is considered valid

    Returns:
        pd.DataFrame, a copy of the rows of the input dataframe
        with correct date of birth; index labels are kept as they were

    """
    # day.month.year where the year has exactly four digits
    full_date_pattern = r'\d{1,2}\.\d{1,2}\.\d{4}'

    # Row-wise check; the keyword arguments are forwarded to check_function
    is_correct = friends_df.apply(
        check_function,
        axis=1,
        year_range=(min_year, max_year),
        pattern=full_date_pattern,
    )

    # Copy, so that the caller gets a new dataframe instead of a slice
    return friends_df[is_correct].copy()

In [7]:
# Friends with a correct date of birth (default min_year / max_year bounds)
good_notes = filter_by_bdate(friends_df)

In [8]:
# Display the rows that passed the date of birth filter
good_notes

Unnamed: 0,id,bdate
2,162119115,24.9.2002
3,176039419,16.9.2002
5,196316604,15.8.2002
11,223503014,24.12.2002
12,225245840,11.4.2002
13,231277366,16.9.2002
18,300473846,11.7.2002
19,302126849,20.6.2002
20,302552381,20.4.2001
22,353339780,17.4.2002


In [9]:
# Age is taken as the difference of calendar years only: the last four
# characters of `bdate` are the birth year, day and month are ignored.
REFERENCE_YEAR = 2023  # year the ages are computed for

# The guard keeps the cell re-runnable: after the first run the `bdate`
# column has already been replaced by `age`.
if 'bdate' in good_notes.columns:
    good_notes = (
        good_notes
        .assign(age=lambda frame: REFERENCE_YEAR - frame['bdate'].str[-4:].astype('int64'))
        .drop(columns='bdate')
    )
good_notes

Unnamed: 0,id,age
2,162119115,21
3,176039419,21
5,196316604,21
11,223503014,21
12,225245840,21
13,231277366,21
18,300473846,21
19,302126849,21
20,302552381,22
22,353339780,21


### Прогнозирование возраста пользователя

In [10]:
# First estimate: mean age of the friends that have a correct date of birth
predicted_age = good_notes['age'].mean()
print('Предположительный возраст пользователя:', round(predicted_age))

Предположительный возраст пользователя: 21


### (ДОП) Использование записей с неверной датой рождения для улучшения прогнозирования

In [11]:
# Rows of friends_df that did not pass the filter: everything except
# the index labels present in good_notes
bad_notes = friends_df.drop(index=good_notes.index)

In [12]:
# Display the friends whose date of birth was rejected by the filter
bad_notes

Unnamed: 0,id,bdate
0,141706715,5.1
1,153334062,27.3
4,187713925,0.0
6,199427361,15.12
7,215585373,4.1
8,219352292,0.0
9,220219923,24.7
10,222085400,29.1
14,235379111,21.12
15,254793309,0.0


In [13]:
def get_guessed_age_df(bad_ids: list[int],
                       reference_year: int = 2023,
                       delay: float = 0.36) -> pd.DataFrame:
    """

    Creates dataframe with guessed ages for
    users with invalid date of birth

    The age of a user is guessed as `reference_year` minus the mean birth
    year of the user's own friends that have a correct date of birth.
    A user is left out of the result when the friends data cannot be
    obtained or when none of the friends has a correct date of birth.

    Args:
        bad_ids(list[int]) - ids of users that have invalid bdate
            (any iterable of ids works, e.g. a list or a numpy array;
            it is not modified)
        reference_year(int) - year for which the ages are computed
        delay(float) - pause in seconds made before every friends request

    Returns:
        pd.DataFrame, with next format:
        ___________________________________________
        |        id        |         age          |
        |------------------|----------------------|
        |    friend_id     |  guessed_age(float)  |

        The columns are present even when no age could be guessed.

    """
    # (id, age) pairs are collected together, so ids and ages
    # can never get out of step with each other
    guessed_rows = []
    for u_id in bad_ids:
        if delay > 0:
            time.sleep(delay)
        try:
            friend_friends = get_friends_bdate_dataframe(str(u_id))

            correct_lines = filter_by_bdate(friend_friends)
            birth_years = correct_lines['bdate'].apply(lambda x: int(x[-4:]))
            user_age = reference_year - birth_years.mean()

        # If we can't guess friends age, then don't use his id in calculations
        except Exception as exc:
            print('Age of user {} was not guessed: {}'.format(u_id, exc))
            continue

        # The mean over zero friends is NaN, which is not a usable guess
        if pd.isna(user_age):
            continue
        guessed_rows.append({'id': u_id, 'age': round(user_age, 2)})

    return pd.DataFrame(guessed_rows, columns=['id', 'age'])
           

In [14]:
# Guess an age for every friend whose own date of birth could not be used
guessed_ages = get_guessed_age_df(bad_notes['id'].to_numpy())
guessed_ages

Unnamed: 0,id,age
0,141706715,22.86
1,153334062,24.21
2,187713925,20.89
3,199427361,25.0
4,215585373,25.62
5,219352292,21.28
6,220219923,22.1
7,222085400,24.78
8,235379111,25.0
9,254793309,27.0


In [15]:
# Known ages followed by the guessed ones, stacked row-wise;
# the index is renumbered from 0
friends_ages_final_df = pd.concat((good_notes, guessed_ages), ignore_index=True)

In [16]:
# Display the combined frame of known and guessed ages
friends_ages_final_df

Unnamed: 0,id,age
0,162119115,21.0
1,176039419,21.0
2,196316604,21.0
3,223503014,21.0
4,225245840,21.0
5,231277366,21.0
6,300473846,21.0
7,302126849,21.0
8,302552381,22.0
9,353339780,21.0


In [17]:
# Refined estimate: mean over both the known and the guessed friend ages
predicted_age = friends_ages_final_df['age'].mean()
print('Предположительный возраст пользователя:', round(predicted_age))

Предположительный возраст пользователя: 23
