In [None]:
#try polyfuzz
from fuzzywuzzy import fuzz 
from twitterCrawler.myClass import TwitterDriver
from facebookCrawler.myClass import FacebookDriver
import os
import face_recognition
import requests
import time
import numpy as np
from sklearn.cluster import DBSCAN
from collections import Counter
from flair.data import Sentence
from flair.models import SequenceTagger

In [2]:
import config
# Login credentials for the two crawler accounts. They are looked up by key
# through the project's `config` module instead of being written into the
# notebook; get_user_info / get_friends_list / get_media_pic pass them to the
# drivers' get_browser(usr=..., pwd=...).
# NOTE(review): what config.get returns for a missing key is not visible here - confirm.
FB_LOGIN_NAME= config.get("FB_LOGIN_NAME")
FB_LOGIN_PWD= config.get("FB_LOGIN_PWD")
TW_LOGIN_NAME= config.get("TW_LOGIN_NAME")
TW_LOGIN_PWD= config.get("TW_LOGIN_PWD")

In [3]:
# One shared crawler object per platform. The helper functions below call
# get_browser(...) on these instances before every scrape and close
# `<instance>.driver` afterwards.
twitter=TwitterDriver()
facebook=FacebookDriver()

In [4]:
def get_user_info(profile,media_name):
    '''Scrape the profile details of one account.

    Parameters
    ----------
    profile : identifier passed to the crawler as ``user_id``.
    media_name : ``'Twitter'`` or ``'Facebook'`` (case-sensitive).

    Returns
    -------
    Whatever the selected crawler's ``get_user_info`` returns.

    Raises
    ------
    ValueError
        If ``media_name`` is not one of the supported platforms (previously
        this surfaced as an UnboundLocalError on the return statement).
    '''
    if media_name=='Twitter':
        crawler,usr,pwd=twitter,TW_LOGIN_NAME,TW_LOGIN_PWD
    elif media_name=='Facebook':
        crawler,usr,pwd=facebook,FB_LOGIN_NAME,FB_LOGIN_PWD
    else:
        raise ValueError(f"Unsupported media_name {media_name!r}; expected 'Twitter' or 'Facebook'")

    crawler.get_browser(usr=usr,pwd=pwd)
    try:
        return crawler.get_user_info(user_id=profile)
    finally:
        # close the browser window even when the scrape itself fails
        crawler.driver.close()

def get_friends_list(username,media_name):
    '''Collect the social connections of one account.

    Parameters
    ----------
    username : identifier passed to the crawler as ``user_id``.
    media_name : ``'Twitter'`` or ``'Facebook'`` (case-sensitive).

    Returns
    -------
    For Twitter, the result of ``twitter.get_followers(..., scroll=8)``.
    For Facebook, the friends list extended in place with the followers list
    (an account present in both therefore appears twice).

    Raises
    ------
    ValueError
        If ``media_name`` is not one of the supported platforms (previously
        this surfaced as an UnboundLocalError on the return statement).
    '''
    if media_name=='Twitter':
        twitter.get_browser(usr=TW_LOGIN_NAME,pwd=TW_LOGIN_PWD)
        try:
            friend_list=twitter.get_followers(user_id=username,scroll=8)
        finally:
            # close the browser window even when the scrape itself fails
            twitter.driver.close()
    elif media_name=='Facebook':
        facebook.get_browser(usr=FB_LOGIN_NAME,pwd=FB_LOGIN_PWD)
        try:
            friend_list=facebook.get_friends(user_id=username)
            followers_list=facebook.get_followers(user_id=username)
            friend_list.extend(followers_list)
        finally:
            facebook.driver.close()
    else:
        raise ValueError(f"Unsupported media_name {media_name!r}; expected 'Twitter' or 'Facebook'")
    return friend_list

def get_profile_pic(profile,user_info):
    '''Download the account's profile picture.

    The image at ``user_info['profile_pic']`` is saved as
    ``./images/<profile>/DP/profile_pic.jpg`` (relative to the current
    working directory).

    Returns
    -------
    str
        Path of the DP *folder* (not the file), prefixed with the current
        working directory.
    '''
    profile_pic=user_info['profile_pic']
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(f'./images/{profile}/DP',exist_ok=True)
    img_content=requests.get(profile_pic).content
    # context manager guarantees the image is flushed and the handle released
    with open(f'./images/{profile}/DP/profile_pic.jpg','wb') as file:
        file.write(img_content)
    currentDirectory = os.getcwd()
    return f'{currentDirectory}/images/{profile}/DP'
    
def get_cover_pic(profile,user_info):
    '''Download the account's cover picture.

    The image at ``user_info['cover_pic']`` is saved next to the profile
    picture, as ``./images/<profile>/DP/cover_pic.jpg``.

    Returns
    -------
    str
        Relative path of the saved cover picture file.
    '''
    cover_pic=user_info['cover_pic']
    # exist_ok avoids the check-then-create race of a separate os.path.exists test
    os.makedirs(f'./images/{profile}/DP',exist_ok=True)
    img_content=requests.get(cover_pic).content
    # context manager guarantees the image is flushed and the handle released
    with open(f'./images/{profile}/DP/cover_pic.jpg','wb') as file:
        file.write(img_content)
    return f'./images/{profile}/DP/cover_pic.jpg'

def get_media_pic(profile,media_name):
    '''Download the images an account has posted.

    Image URLs are fetched with ``facebook.get_images(user_id=profile, n=20)``
    or ``twitter.get_images(user_id=profile)`` and saved as
    ``./images/<profile>/media/image<k>.jpg`` with k starting at 1.

    Parameters
    ----------
    profile : identifier passed to the crawler as ``user_id``.
    media_name : ``'Facebook'`` or ``'Twitter'`` (case-sensitive).

    Returns
    -------
    str
        Relative path of the media folder. The folder is always created, so
        callers can list it even when no image was found.

    Raises
    ------
    ValueError
        If ``media_name`` is not one of the supported platforms.
    '''
    if media_name=='Facebook':
        facebook.get_browser(usr=FB_LOGIN_NAME,pwd=FB_LOGIN_PWD)
        try:
            img_urls=facebook.get_images(user_id=profile,n=20)
        finally:
            # close the browser window even when the scrape itself fails
            facebook.driver.close()
    elif media_name=='Twitter':
        twitter.get_browser(usr=TW_LOGIN_NAME,pwd=TW_LOGIN_PWD)
        try:
            img_urls=twitter.get_images(user_id=profile)
        finally:
            twitter.driver.close()
    else:
        raise ValueError(f"Unsupported media_name {media_name!r}; expected 'Twitter' or 'Facebook'")

    img_urls=img_urls or []
    media_dir=f'./images/{profile}/media'
    if img_urls and not os.path.exists(media_dir):
        print('Downloading Image')
    # A failure to create the folder now propagates instead of being printed
    # and followed by a confusing open() error.
    os.makedirs(media_dir,exist_ok=True)

    #download images
    for i,img_url in enumerate(img_urls,start=1):
        img_content=requests.get(img_url).content
        with open(f'{media_dir}/image{i}.jpg','wb') as file:
            file.write(img_content)
    return media_dir
    

In [5]:
def username_matching(known_friend_name,match_friend_name):
    '''Turn the fuzzy similarity of two names into a damped match score.

    The raw ``fuzz.WRatio`` value is mapped band by band:
    90 and above -> 1, 80-89 -> raw/100, 70-79 -> raw/150, below 70 -> raw/200.
    '''
    raw_score=fuzz.WRatio(known_friend_name,match_friend_name)
    print(f'name_match_score {known_friend_name}-{match_friend_name} is: ',raw_score)

    # bands are checked from the strongest downwards, so each test only
    # needs the lower bound
    if raw_score>=90:
        return 1
    if raw_score>=80:
        return raw_score/100
    if raw_score>=70:
        return raw_score/150
    return raw_score/200

def network_matching(known_friend,known_media,match_friend,match_media):
    '''Count name-level overlaps between the networks of two accounts.

    Both connection lists are fetched with ``get_friends_list``; the first
    element of every entry is compared against the first element of every
    entry of the other list with ``fuzz.WRatio``. Each pair scoring above 88
    adds one to the result, so a single connection can be counted more than
    once.
    '''
    known_network=get_friends_list(known_friend,known_media)
    candidate_network=get_friends_list(match_friend,match_media)

    hits=0
    for known_entry in known_network:
        for candidate_entry in candidate_network:
            similarity=fuzz.WRatio(known_entry[0],candidate_entry[0])
            # threshold is 88 for connections (the user-name check uses 90)
            if similarity>88:
                print(f'Friend Match found {similarity}: {known_entry[0]} - {candidate_entry[0]}')
                hits+=1
    return hits


In [6]:
#reduce tolerance to be more strict with face match
TOLERANCE=0.45
# Dedicated name for the display-picture tolerance. The generic TOLERANCE
# global is assigned again further down the notebook, so reading it at call
# time silently picked up that later value instead of 0.45.
DP_FACE_TOLERANCE=TOLERANCE

def face_matching(known_pic,unknown_pic,tolerance=DP_FACE_TOLERANCE,distance_threshold=0.6):
    '''Compare every face found in one folder against every face in another.

    Parameters
    ----------
    known_pic : folder whose images provide the reference faces.
    unknown_pic : folder whose images are checked against the references.
    tolerance : maximum face distance for ``compare_faces`` to report a match.
        The default is bound when the function is defined, so later
        re-assignments of the module-level ``TOLERANCE`` do not affect it.
    distance_threshold : face distances strictly below this value are counted
        as "possible" matches.

    Returns
    -------
    (matches, distance_score)
        ``matches`` is the number of (reference face, checked face) pairs
        accepted by ``compare_faces``; ``distance_score`` is the number of
        pairs whose distance is below ``distance_threshold``.
    '''
    known_face_encodings=[]
    for known_profile_pic in os.listdir(known_pic):
        known_path=f'{known_pic}/{known_profile_pic}'
        # skip sub-folders, as face_clustering does for its reference folder
        if not os.path.isfile(known_path):
            continue
        known_image = face_recognition.load_image_file(known_path)
        known_encodings=face_recognition.face_encodings(known_image)
        print('Number of known faces:',len(known_encodings))
        # works for zero, one or many faces in the image
        known_face_encodings.extend(known_encodings)

    results=[]
    distance_score=0
    for unknown_profile_pic in os.listdir(unknown_pic):
        unknown_path=f'{unknown_pic}/{unknown_profile_pic}'
        if not os.path.isfile(unknown_path):
            continue
        unknown_image = face_recognition.load_image_file(unknown_path)
        unknown_encodings=face_recognition.face_encodings(unknown_image)
        print('Number of unknown faces:',len(unknown_encodings))
        for unknown_face_encodings in unknown_encodings:
            results.extend(face_recognition.compare_faces(known_face_encodings, unknown_face_encodings, tolerance))
            print(results)
            one_distance = face_recognition.face_distance(known_face_encodings, unknown_face_encodings)
            #count the distances below the threshold (possible matching faces)
            distance_score+=sum(1 for distance in one_distance if distance<distance_threshold)

    matches=sum(results)
    return matches,distance_score

#face_matching(f'/vagrant/Trial_Profile_matching/images/{FB_PROFILE}/DP','/vagrant/Trial_Profile_matching/images/{TW_PROFILE}/DP')

In [7]:
TOLERANCE=0.5
# Dedicated name for the media-image tolerance so this function does not
# depend on whichever cell assigned the generic TOLERANCE global last.
MEDIA_FACE_TOLERANCE=TOLERANCE

def face_clustering(known_pic,unknown_pic,tolerance=MEDIA_FACE_TOLERANCE,eps=0.47,distance_threshold=0.6):
    '''Match the most frequent face of one image folder against another folder.

    Every face found in ``known_pic`` is encoded and clustered with DBSCAN.
    The encodings of the largest cluster (noise label -1 excluded) are then
    compared with every face found in ``unknown_pic``.

    Parameters
    ----------
    known_pic : folder used to find the most common face.
    unknown_pic : folder whose faces are compared against that face.
    tolerance : maximum face distance for ``compare_faces`` to report a match.
    eps : DBSCAN ``eps`` (maximum distance between two samples of a cluster).
    distance_threshold : distances strictly below this count as possible matches.

    Returns
    -------
    (matches, distance_score)
        Pair counts, scaled the same way as before: ``matches`` is divided by
        4 when above 4 and ``distance_score`` by 10 when above 10.
        ``(0, 0)`` when no face is found or DBSCAN labels every face as noise.
    '''
    known_face_encodings=[]
    for known_profile_pic in os.listdir(known_pic):
        known_path=f'{known_pic}/{known_profile_pic}'
        if os.path.isfile(known_path):
            known_image = face_recognition.load_image_file(known_path)
            # works for zero, one or many faces in the image
            known_face_encodings.extend(face_recognition.face_encodings(known_image))

    known_face_encodings=np.array(known_face_encodings)
    print('Total number of faces ', known_face_encodings.shape[0])
    if known_face_encodings.shape[0]==0:
        # DBSCAN.fit rejects an empty array; nothing to cluster or compare
        return 0,0

    # cluster the embeddings
    clt = DBSCAN(metric="euclidean",eps=eps)
    clt.fit(known_face_encodings)

    #count the faces per cluster label and drop label -1 (outlier faces)
    counts=Counter(clt.labels_)
    counts.pop(-1,None)
    numUniqueFaces = len(counts)
    print(f"Clustered unique faces: {numUniqueFaces}")
    if not counts:
        # Every face is an outlier: previously label -1 itself was selected
        # here and the outliers were treated as the common face.
        print('Number of images containing common face: ',0)
        return 0,0

    #encodings that belong to the most common face label
    most_common_label=counts.most_common(1)[0][0]
    idxs = np.where(clt.labels_ == most_common_label)[0]
    common_face_encodings=[known_face_encodings[i] for i in idxs]
    print('Number of images containing common face: ',len(common_face_encodings))

    results=[]
    distance_score=0
    for unknown_profile_pic in os.listdir(unknown_pic):
        unknown_path=f'{unknown_pic}/{unknown_profile_pic}'
        if not os.path.isfile(unknown_path):
            continue
        unknown_image = face_recognition.load_image_file(unknown_path)
        for unknown_face_encodings in face_recognition.face_encodings(unknown_image):
            results.extend(face_recognition.compare_faces(common_face_encodings, unknown_face_encodings, tolerance))
            one_distance = face_recognition.face_distance(common_face_encodings, unknown_face_encodings)
            #count the distances below the threshold (possible matching faces)
            distance_score+=sum(1 for distance in one_distance if distance<distance_threshold)

    # NOTE(review): this scaling is not monotonic (4 matches -> 4, 5 matches
    # -> 1.25); kept unchanged because callers may rely on the current values.
    matches=sum(results)/4 if sum(results)>4 else sum(results)
    distance_score=distance_score/10 if distance_score>10 else distance_score
    return matches,distance_score
            
#face_clustering(f'/vagrant/Trial_Profile_matching/images/{FB_PROFILE}/media','/vagrant/Trial_Profile_matching/images/{TW_PROFILE}/media')

In [8]:
def _extract_org_loc_entities(tagger,text):
    '''Run NER over `text` and return (organisation texts, location texts).'''
    organisations=[]
    locations=[]
    if not text:
        # nothing to tag for an empty or missing description
        return organisations,locations
    sentence = Sentence(text)
    # run NER over sentence
    tagger.predict(sentence)
    for entity in sentence.to_dict(tag_type='ner')['entities']:
        label=entity['labels'][0].value
        if label=='ORG':
            organisations.append(entity['text'])
        elif label=='LOC':
            locations.append(entity['text'])
    return organisations,locations

def entity_matching(fb_intros,tw_intro):
    '''Count organisations and locations shared by two profile descriptions.

    Parameters
    ----------
    fb_intros : iterable of Facebook intro strings.
    tw_intro : the Twitter description string.

    Returns
    -------
    int
        Number of ORG entities plus number of LOC entities from the Facebook
        intros whose text also occurs (exact string match) among the entities
        of the same kind in the Twitter description.

    Notes
    -----
    The previous version of this cell could not be compiled: a ``for`` loop
    had only comments as its body and ``flair_forward-fast`` was a
    subtraction from an undefined name. The Flair embedding objects it
    loaded were never used and have been dropped; the exact-match counting
    that was commented out is restored.
    TODO: replace exact matching with an embedding/fuzzy similarity if needed.
    '''
    # loading the NER tagger
    tagger = SequenceTagger.load('ner')

    fb_organisation=[]
    fb_location=[]
    for fb_intro in fb_intros or []:
        organisations,locations=_extract_org_loc_entities(tagger,fb_intro)
        fb_organisation.extend(organisations)
        fb_location.extend(locations)

    tw_organisation,tw_location=_extract_org_loc_entities(tagger,tw_intro)

    description_match=0
    for fb_org in fb_organisation:
        if fb_org in tw_organisation:
            description_match+=1
    for location in fb_location:
        if location in tw_location:
            description_match+=1

    return description_match
    
    # iterate over entities and print
    #for entity in sentence.get_spans('ner'):
        #print(entity.tag, repr(entity))

#entity_matching(intros,tw_intro)

In [9]:
# Per-signal weights. They are declared here but not referenced by find_match.
NAME_WEIGHT=0.3
DESC_WEIGHT=0.1
DP_WEIGHT=0.1
MEDIA_WEIGHT=0.2
NETWORK_WEIGHT=0.3

def find_match(fbprofile,twprofile):
    '''Run every comparison stage for one Facebook / Twitter account pair.

    Stages, in order: user info and name score, description entities,
    network overlap, display pictures, then media images. Each stage prints
    its score.

    Returns
    -------
    A "Profile Match found" message as soon as the display pictures give at
    least one face match and a distance score of at least 2; otherwise the
    media stage runs and the function returns None.
    '''
    print('----Getting User Info-----')
    fb_info=get_user_info(fbprofile,'Facebook')
    tw_info=get_user_info(twprofile,'Twitter')
    print(f'The fb profile -{fbprofile} user info is: {fb_info}')
    print(f'The tw profile -{twprofile} user info is: {tw_info}')
    name_score=username_matching(fb_info['name'],tw_info['name'])
    print(f'The weighted name match score is: {name_score}')

    description_score=entity_matching(fb_info['Intro'],tw_info['user_description'])
    print(f'\nThe Description match score is: {description_score}')

    print('\n----Getting Friends list-----')
    network_score=network_matching(fbprofile,'Facebook',twprofile,'Twitter')
    print(f'The network match score is: {network_score}')

    print('\n----Getting Profile Pic-----')
    fb_dp_dir=get_profile_pic(fbprofile,fb_info)
    tw_dp_dir=get_profile_pic(twprofile,tw_info)
    # cover pictures are only downloaded; their returned paths are not used
    get_cover_pic(fbprofile,fb_info)
    get_cover_pic(twprofile,tw_info)
    print('The fb Profile pic path is ', fb_dp_dir)
    dp_matches,dp_close=face_matching(fb_dp_dir,tw_dp_dir)
    print(f'The Profile pic match score is: {dp_matches} and probable face match score based on distance is: {dp_close}')

    # a convincing display-picture match ends the search early
    dp_is_conclusive=dp_matches>=1 and dp_close>=2
    if dp_is_conclusive:
        return f'--------Profile Match found : {fbprofile} - {twprofile}-----------'

    print('\n----Getting Media Images-----')
    fb_media_dir=get_media_pic(profile=fbprofile,media_name='Facebook')
    tw_media_dir=get_media_pic(profile=twprofile,media_name='Twitter')
    print('The fb Media pic path is ', fb_media_dir)

    print('----Clustering most common faces in media-----')
    media_matches,media_close=face_clustering(fb_media_dir,tw_media_dir)
    print(f'The media match score is: {media_matches} and probable face match score based on distance is: {media_close}')
    return None
    
    

In [12]:
# Account pair to compare: Facebook profile id and Twitter handle.
FB_PROFILE="prasanth.anbalagan.9"
TW_PROFILE='prasanth_gma'

In [13]:
# Run the full comparison for the FB_PROFILE / TW_PROFILE pair currently set.
find_match(fbprofile=FB_PROFILE,twprofile=TW_PROFILE)

----Getting User Info-----
The fb profile -prasanth.anbalagan.9 user info is: {'name': 'Prasanth Anbalagan', 'Intro': ['Travaille chez Amazon Development Center, ChennaiÉtudes\xa0', ' B.Tech. à SRM Valliammai Engineering CollegeA étudié à AKT Academy Higher Secondary School', 'A étudié à Montfort Matriculation Higher Secondary School, TindivanamHabite à ', 'ChennaiDe ', 'GingeeSuivi par ', '141 personnes'], 'friends': 0, 'profile_pic': 'https://scontent-cdg2-1.xx.fbcdn.net/v/t31.0-8/20861578_1423520517733806_8548272624820887534_o.jpg?_nc_cat=100&ccb=2&_nc_sid=09cbfe&_nc_ohc=Tfut9RzlRKIAX_72twZ&_nc_ht=scontent-cdg2-1.xx&oh=f1ca399028f8243a593f2c43f3bad0f3&oe=5FE7D0A3', 'cover_pic': 'https://scontent-cdt1-1.xx.fbcdn.net/v/t31.0-8/27023978_1567681613317695_3096738409692127875_o.jpg?_nc_cat=103&ccb=2&_nc_sid=e3f864&_nc_ohc=Hto43dY645MAX8aXOd-&_nc_ht=scontent-cdt1-1.xx&oh=480aa2f028740963ebe9a51cb4cc103f&oe=5FE6AEEC'}
The tw profile -prasanth_gma user info is: {'name': 'Prasanth joe joseph'

Number of known faces: 1
Number of known faces: 1
Number of unknown faces: 6
[False, False]
[False, False, False, False]
[False, False, False, False, False, False]
[False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False]
[False, False, False, False, False, False, False, False, False, False, False, False]
Number of unknown faces: 1
[False, False, False, False, False, False, False, False, False, False, False, False, False, False]
The Profile pic match score is: 0 and probable face match score based on distance is: 0

----Getting Media Images-----
Downloading Image
Downloading Image
The fb Media pic path is  ./images/prasanth.anbalagan.9/media
----Clustering most common faces in media-----
Total number of faces  75
Clustered unique faces: 1
Number of images containing common face:  21
The media match score is: 2 and probable face match score based on distance is: 11.6


In [11]:
# Second account pair; this rebinds the FB_PROFILE / TW_PROFILE globals
# before running the comparison.
FB_PROFILE="rajees.afra"
TW_PROFILE='RajeesAhamed'
find_match(fbprofile=FB_PROFILE,twprofile=TW_PROFILE)

----Getting User Info-----
The fb profile -rajees.afra user info is: {'name': 'Rajees Ahamed', 'Intro': ['Owner- Sole Proprietorship, à Banu Photo Stores', 'Administrative Assistant, à Banu Matric Higher Secondary School', 'À son compteVolunteering, à ', 'BhumiA étudié à SRM Valliammai Engineering College', 'A étudié à Banu Matriculation Higher Secondary SchoolA étudié à Vetri Vikas Higher Secondary School', 'Habite à Ramanathapuram', 'De Ramanathapuram', 'CélibataireMembre depuis Octobre 2011'], 'friends': 0, 'profile_pic': 'https://scontent-cdg2-1.xx.fbcdn.net/v/t1.0-9/123105972_3165146093596971_5476727915243917650_o.jpg?_nc_cat=107&ccb=2&_nc_sid=09cbfe&_nc_ohc=6A5pH0xu5igAX_eWY7-&_nc_ht=scontent-cdg2-1.xx&oh=6d169e4da0540775fa9fdfc7a8775908&oe=5FE57CAF', 'cover_pic': 'https://scontent-cdg2-1.xx.fbcdn.net/v/t1.0-9/104590515_2811548562290061_5175184598871297942_n.jpg?_nc_cat=100&ccb=2&_nc_sid=e3f864&_nc_ohc=HL6ZZJQjbPYAX-2jcS5&_nc_ht=scontent-cdg2-1.xx&oh=e6fd4457e9a7eca4d0e210fb84f18

'--------Profile Match found : rajees.afra - RajeesAhamed-----------'

In [None]:
# Third account pair; this rebinds the FB_PROFILE / TW_PROFILE globals
# before running the comparison.
FB_PROFILE="fatemeh.sajadi.9"
TW_PROFILE='fsajadi'
find_match(fbprofile=FB_PROFILE,twprofile=TW_PROFILE)

