In [1]:
import pandas as pd
import os.path as Path
import numpy as np
import string as str
import unicodedata

In [142]:
def first_non_null(series):
    '''
    Utility function to get the first non-null value
    '''
    return series.dropna().iloc[0] if not series.dropna().empty else None


def process_name(name):
    '''
    Utility function to process names
    '''
    # replace NaN with an empty string
    if pd.isna(name):
        return ''
    # strip dots and spaces and get only the longest/first word
    words = name.lower().strip(" .").split()

    # Initialize variables to keep track of the longest word
    longest_word = ''
    longest_length = 0

    # Iterate through each word
    for word in words:
        # If the current word is longer than the longest word found so far, or if it's equal in length but appears earlier
        if len(word) > longest_length or (len(word) == longest_length and words.index(word) < words.index(longest_word)):
            longest_word = word
            longest_length = len(word)

    # Normalize the string to decomposed Unicode form
    normalized_s = unicodedata.normalize('NFD', longest_word)

    # Remove non-spacing marks (special characters)
    stripped_s = ''.join(c for c in normalized_s if unicodedata.category(c) != 'Mn')

    return stripped_s




In [179]:
data_events_ppl = pd.read_csv(Path.join("..", "raw_data", "raw_all.csv"))
data_events_ppl

Unnamed: 0,Event,First Name,Surname,Email,Attendee Status,Your Job Position,Choose your role,Choose your role.1,Seniority,How did you hear from us?,Company,Rain,Day
0,2,Patrick,Beeker,patrick.beeker@gmail.com,Attending,Product,Product Manager,,,BPM Meetup group,Smartbroker AG,Yes,Tuesday
1,2,Teena,Kumari,teena2768@gmail.com,Attending,Product,Product Manager,,,LinkedIn,World simplifed Ug,Yes,Tuesday
2,2,Nesha,Siwi,i@neshasiwi.de,Attending,Design,,,,LinkedIn,Freelancer,Yes,Tuesday
3,2,Shuvam kumar,Sah,shuvamshah98152@gmail.com,Attending,Student,,,,BPM Meetup group,ATEM,Yes,Tuesday
4,2,Ayman,Chalhoub,aymanc@gmail.com,Attending,Founder / CXO,,,,Lenny's Newsletter,Clink,Yes,Tuesday
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1199,7,Ira,Sheveleva,hiirinasheveleva@gmail.com,Checked In,Marketing,,Product Marketing,,Word of Mouth / Referral,Sticked,No,Tuesday
1200,7,Dominik,Wenzel,Dominik.wenzel@outlook.com,Checked In,Product,Product Manager,,,Handpicked Berlin,OpenToWork,No,Tuesday
1201,7,Marta,Osowiecka,m.osowiecka.m@gmail.com,Checked In,Design,,,,Word of Mouth / Referral,SWAP,No,Tuesday
1202,7,akshay,rote,akshay.rote@gmail.com,Not Attending,,,,,,PAIR Finance,No,Tuesday


In [180]:
data_scraped = pd.read_csv(Path.join("..", "raw_data", "raw_scrapped.csv"))
data_scraped

Unnamed: 0,url,title,linkedinProfileUrl,email,linkedinProfile,description,headline,location,firstName,lastName,...,subscribers,mutualConnectionsText,imgUrl,website,mail,connectedOn,error,phoneNumber,partialScreenshot,facebookUrl
0,https://linkedin.com/in/patrick-beeker-4673311,Patrick Beeker,https://www.linkedin.com/in/patrick-beeker-467...,,https://www.linkedin.com/in/patrick-beeker-467...,Experienced professional with the ability to c...,"General Manager - US Finance, Legal, HR, Accou...","Carrollton, Texas, United States",Patrick,Beeker,...,,,,,,,,,,
1,https://linkedin.com/in/teena-kumari-0b8359271,Teena Kumari – Tech product support intern – H...,https://www.linkedin.com/in/teena-kumari-0b835...,,https://www.linkedin.com/in/teena-kumari-0b835...,"I am a Product Owner at World Simplified UG, a...",Tech product support intern at Holidu,"Berlin, Berlin, Germany",Teena,Kumari,...,1148.0,"Carlo Cantarini, Thomas Hartmann, and 28 other...",,,,,,,,
2,https://linkedin.com/in/mischagawronski,Mischa Gawronski – Creative Director – Freelance,https://www.linkedin.com/in/mischagawronski/,,https://www.linkedin.com/in/mischagawronski/,Passionate and challenging influencer in strat...,IGP Network | Creative Director,"Berlin, Berlin, Germany",Mischa,Gawronski,...,149.0,,https://media.licdn.com/dms/image/C5603AQHb9Mw...,,,,,,,
3,https://linkedin.com/in/shuvam-kumar-sah-88590...,Shuvam Kumar Sah – Web Designer,https://www.linkedin.com/in/shuvam-kumar-sah-8...,,https://www.linkedin.com/in/shuvam-kumar-sah-8...,Designer & Problem Solver | CS Student | Aimin...,Making Interfaces Sexier - UI/UX/Dev,"Berlin, Berlin, Germany",Shuvam Kumar,Sah,...,467.0,🤖 Leon Meier and Nisha Kumari are mutual conne...,https://media.licdn.com/dms/image/D4E03AQFcJ48...,dribbble.com/shuvam_sah,,,,,,
4,https://linkedin.com/in/chalhoub,Ayman Chalhoub - Clink,https://www.linkedin.com/in/chalhoub/,,https://www.linkedin.com/in/chalhoub/,,Product & Growth,"Berlin, Berlin, Germany",Ayman,Chalhoub,...,1975.0,"Torben Schulz, Sonya Pobiedimska, and 2 other ...",,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,https://linkedin.com/in/alikhanmba,"Ali Akbar Khan, MBA - Delivery Hero",https://www.linkedin.com/in/alikhanmba/,,https://www.linkedin.com/in/alikhanmba/,"Experienced CX and E-Commerce professional, wi...",Global Customer Experience,"Berlin, Berlin, Germany",Ali Akbar,"Khan, MBA",...,,"Nick Mulder, Matteo Fava, and 1 other mutual c...",https://media.licdn.com/dms/image/C4E03AQHwf7j...,,,,,,,
677,https://linkedin.com/in/daniellopezc,Daniel López Contreras - Masterschool,https://www.linkedin.com/in/daniellopezc/,,https://www.linkedin.com/in/daniellopezc/,- 100% Customer oriented professional\n- Stron...,"Transforming Education, Customer Focused, avid...","Berlin, Berlin, Germany",Daniel,López Contreras,...,3160.0,"Marie NEANG, Fabian Tröltzsch, and 39 other mu...",https://media.licdn.com/dms/image/C4E03AQF_30k...,,,,,,,
678,https://linkedin.com/in/rim-bensouf-link,Rim Bensouf - Product Owner - SPORTTOTAL ...,https://www.linkedin.com/in/rim-bensouf-link/,,https://www.linkedin.com/in/rim-bensouf-link/,,Product Owner at Sporttotal.tv\n\nAgile leader...,Berlin Metropolitan Area,Rim,Bensouf,...,153.0,Maurice Dücker is a mutual connection,https://media.licdn.com/dms/image/D4E03AQH2OhG...,,,,,,,
679,https://linkedin.com/in/oguzcelikada,"Oguz Celikada – Berlin, Berlin, Deutschland",https://www.linkedin.com/in/oguzcelikada/,,https://www.linkedin.com/in/oguzcelikada/,Experienced professional in human-centric busi...,Digital Product Consultant,"Berlin, Berlin, Germany",Oguz,Celikada,...,711.0,"Tim Herbig, Mikhail Grinberg, and 4 other mutu...",https://media.licdn.com/dms/image/C4D03AQGbpHK...,,,,,,,


In [6]:
data_events_ppl.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1204 entries, 0 to 1203
Data columns (total 13 columns):
 #   Column                     Non-Null Count  Dtype 
---  ------                     --------------  ----- 
 0   Event                      1204 non-null   int64 
 1   First Name                 1204 non-null   object
 2   Surname                    1204 non-null   object
 3   Email                      1204 non-null   object
 4   Attendee Status            1204 non-null   object
 5   Your Job Position          706 non-null    object
 6   Choose your role           477 non-null    object
 7   Choose your role.1         85 non-null     object
 8   Seniority                  62 non-null     object
 9   How did you hear from us?  766 non-null    object
 10  Company                    1127 non-null   object
 11  Rain                       1204 non-null   object
 12  Day                        1204 non-null   object
dtypes: int64(1), object(12)
memory usage: 122.4+ KB


In [181]:
data_events_ppl["First Name"] = data_events_ppl["First Name"].apply(process_name)
data_events_ppl["Surname"] = data_events_ppl["Surname"].apply(process_name)
data_events_ppl["fullName"] = data_events_ppl["First Name"] + ' ' + data_events_ppl["Surname"]

data_events_ppl['Email'] = data_events_ppl['Email'].str.lower()
data_events_ppl['Company'] = data_events_ppl["Company"].apply(process_name)

# Create DF with unique IDs------------------------------

# Define custom function to get the first non-null value
def first_non_null(series):
    return series.dropna().iloc[0] if not series.dropna().empty else np.nan

# Columns where you want to apply the first_non_null function
columns_to_agg = ['First Name',
                  'Surname',
                  'fullName',
                  'Company',
                  'Your Job Position',
                  'Choose your role',
                  'Choose your role.1',
                  'Seniority'
                  ]



In [182]:
data_events_ppl.sample(20)

Unnamed: 0,Event,First Name,Surname,Email,Attendee Status,Your Job Position,Choose your role,Choose your role.1,Seniority,How did you hear from us?,Company,Rain,Day,fullName
472,3,miji,lee,miji.lee.kr@hotmail.com,Checked In,Data Analytics,,,,LinkedIn,lieferando,Yes,Thursday,miji lee
755,5,esther,vidal,hello@markos-esther.com,Checked In,Design,,,,BPM Meetup group,markos-esther,No,Thursday,esther vidal
1169,7,elena,t,tuzhikova.el@gmail.com,Checked In,Product,Product Manager,,,Lenny's Newsletter,freelance,No,Tuesday,elena t
373,3,elisa,song,elisa.shanghai@gmail.com,Attending,Data Analytics,,,,FOMO Newsletter,freelancer,Yes,Thursday,elisa song
674,5,lara,orrico,orricolara@gmail.com,Not Attending,,,,,,product,No,Thursday,lara orrico
399,3,catarina,pinto,catarinapp24@gmail.com,Attending,Product,,Product Manager,,Word of Mouth / Referral,tesla,Yes,Thursday,catarina pinto
524,4,balaji,alagu,balaji.alagu@kenjo.io,Not Attending,,,,,,kenjo,No,Tuesday,balaji alagu
527,4,janet,mao,janet.w.mao@gmail.com,Not Attending,,,,,,blinkist,No,Tuesday,janet mao
910,6,vitali,kotik,talkotick@gmail.com,Not Attending,,,,,,bolt,Yes,Thursday,vitali kotik
191,2,lilika,schulte-ostermann,lilikamagdalena@gmail.com,Checked In,Design,,,,BPM Meetup group,d-labs,Yes,Tuesday,lilika schulte-ostermann


In [183]:
# Create a dictionary to specify aggregation functions for each column
agg_dict = {'Event': 'count'}
agg_dict.update({column: first_non_null for column in columns_to_agg})

# Group by 'fullName' and apply aggregation
unique_attendees = data_events_ppl.groupby(by='Email', as_index=False).agg(agg_dict)
unique_attendees["numEvents"] = unique_attendees.Event

In [184]:

unique_attendees

Unnamed: 0,Email,Event,First Name,Surname,fullName,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,numEvents
0,01hub_gilders@icloud.com,2,marcin,belinski,marcin belinski,insurance,Product,Product Manager,,,2
1,1cmarcosrubio@gmail.com,1,marcos,rubio,marcos rubio,tucuvi,,,,,1
2,4rz27gdtzp@privaterelay.appleid.com,3,marcin,belinski,marcin belinski,insurance,Product,Product Manager,Product Manager,,3
3,4srk9xfkmw@privaterelay.appleid.com,3,mikhail,grinberg,mikhail grinberg,looking,Product,Product Manager,Product Manager,,3
4,9gfz8fnsm4@privaterelay.appleid.com,1,myles,sutholt,myles sutholt,talent.io,Product,Head of Product,,,1
...,...,...,...,...,...,...,...,...,...,...,...
813,zahrarouhi@gmail.com,1,zahra,rouhi,zahra rouhi,yumlister,Design,,,,1
814,zajczik@gmail.com,1,ola,zajac,ola zajac,zalando,Product,Product Manager,,,1
815,zaron.arava@gmail.com,1,zaron,arava,zaron arava,xxx,Other,,,,1
816,zimmermannjennifer43@gmail.com,1,zimmermann,jennifer,zimmermann jennifer,gsgsg,Other,,,,1


In [185]:
# Index in this table is now the user ID
unique_attendees['UserID'] = unique_attendees.index

# we have numEvents now instead
unique_attendees = unique_attendees.drop(labels='Event',axis=1)
unique_attendees["Company"] = unique_attendees["Company"].apply(process_name)

In [186]:
# Replace suspicious and empty strings in the Company name with NaNs
to_replace_list = ['','none','xxx','-','tbd','123','n','na','x','--','looking','gsgsg']
unique_attendees['Company'].replace(to_replace_list, np.nan, inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  unique_attendees['Company'].replace(to_replace_list, np.nan, inplace=True)


In [187]:
unique_attendees

Unnamed: 0,Email,First Name,Surname,fullName,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,numEvents,UserID
0,01hub_gilders@icloud.com,marcin,belinski,marcin belinski,insurance,Product,Product Manager,,,2,0
1,1cmarcosrubio@gmail.com,marcos,rubio,marcos rubio,tucuvi,,,,,1,1
2,4rz27gdtzp@privaterelay.appleid.com,marcin,belinski,marcin belinski,insurance,Product,Product Manager,Product Manager,,3,2
3,4srk9xfkmw@privaterelay.appleid.com,mikhail,grinberg,mikhail grinberg,,Product,Product Manager,Product Manager,,3,3
4,9gfz8fnsm4@privaterelay.appleid.com,myles,sutholt,myles sutholt,talent.io,Product,Head of Product,,,1,4
...,...,...,...,...,...,...,...,...,...,...,...
813,zahrarouhi@gmail.com,zahra,rouhi,zahra rouhi,yumlister,Design,,,,1,813
814,zajczik@gmail.com,ola,zajac,ola zajac,zalando,Product,Product Manager,,,1,814
815,zaron.arava@gmail.com,zaron,arava,zaron arava,,Other,,,,1,815
816,zimmermannjennifer43@gmail.com,zimmermann,jennifer,zimmermann jennifer,,Other,,,,1,816


In [188]:
# Try to aggregate one more time according to the full name - in case the person used different emails
agg_dict = {'numEvents': 'sum'}

# Columns where you want to apply the first_non_null function
columns_to_agg2 = ['First Name',
                  'Surname',
                  'Email',
                  'Company',
                  'Your Job Position',
                  'Choose your role',
                  'Choose your role.1',
                  'Seniority'
                  ]
agg_dict.update({column: first_non_null for column in columns_to_agg2})

# Group by 'fullName' and apply aggregation
unique_attendees2 = unique_attendees.groupby(by='fullName', as_index=False).agg(agg_dict)
unique_attendees2 

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority
0,abdelrahman elfar,2,abdelrahman,elfar,abdelrahman.alfar@gmail.com,sap,Engineering,Product Manager,,
1,abdo wahba,1,abdo,wahba,abdelrahman.wahba@gmail.com,,,,,
2,abhi kasoju,1,abhi,kasoju,kasoju.abhinaya@gmail.com,care,Data Analytics,,,
3,abhishek agarwal,1,abhishek,agarwal,aggyabhishek@gmail.com,appreciate,Founder / CXO,,,
4,abhishek khatri,1,abhishek,khatri,abhishekkhatri@hotmail.com,zalando,Other,,,
...,...,...,...,...,...,...,...,...,...,...
776,zimmermann jennifer,1,zimmermann,jennifer,zimmermannjennifer43@gmail.com,,Other,,,
777,ziyad mohiyudheen,2,ziyad,mohiyudheen,mmziyad@gmail.com,,,,,
778,zsolt pap,1,zsolt,pap,papjanoszsolt@yahoo.com,axkro,,eCommerce Manager,,
779,zsolt szrapko,1,zsolt,szrapko,szrapko.zsolt+eventbrite@gmail.com,goto,Product,Product Manager,,


In [189]:
unique_attendees2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 10 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   fullName            781 non-null    object
 1   numEvents           781 non-null    int64 
 2   First Name          781 non-null    object
 3   Surname             781 non-null    object
 4   Email               781 non-null    object
 5   Company             705 non-null    object
 6   Your Job Position   523 non-null    object
 7   Choose your role    364 non-null    object
 8   Choose your role.1  77 non-null     object
 9   Seniority           55 non-null     object
dtypes: int64(1), object(9)
memory usage: 61.1+ KB


In [190]:
# Index in this table is now the user ID
unique_attendees2['UserID'] = unique_attendees2.index

In [191]:
data_scraped.dropna(subset=['url'], inplace=True)
data_scraped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 668 entries, 0 to 680
Data columns (total 69 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   url                        668 non-null    object 
 1   title                      668 non-null    object 
 2   linkedinProfileUrl         617 non-null    object 
 3   email                      172 non-null    object 
 4   linkedinProfile            617 non-null    object 
 5   description                469 non-null    object 
 6   headline                   656 non-null    object 
 7   location                   656 non-null    object 
 8   firstName                  656 non-null    object 
 9   lastName                   656 non-null    object 
 10  fullName                   656 non-null    object 
 11  connectionDegree           656 non-null    object 
 12  vmid                       656 non-null    object 
 13  userId                     656 non-null    float64
 14 

In [192]:
data_scraped = data_scraped[data_scraped['error'].isnull()]
data_scraped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 666 entries, 0 to 680
Data columns (total 69 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   url                        666 non-null    object 
 1   title                      666 non-null    object 
 2   linkedinProfileUrl         615 non-null    object 
 3   email                      172 non-null    object 
 4   linkedinProfile            615 non-null    object 
 5   description                469 non-null    object 
 6   headline                   656 non-null    object 
 7   location                   656 non-null    object 
 8   firstName                  656 non-null    object 
 9   lastName                   656 non-null    object 
 10  fullName                   656 non-null    object 
 11  connectionDegree           656 non-null    object 
 12  vmid                       656 non-null    object 
 13  userId                     656 non-null    float64
 14 

In [193]:
# Define a function to replace NaN values in fullName
def replace_nan_fullname(full_name, title):
    
    if pd.isna(full_name):
        print(title)
        # Split the title by whitespace
        words = title.split()

        # Check if there are only 2 words in the title
        if len(words) == 2:
            return ' '.join(words[:2])

        # Check for ',' or '-' signs and return the words before them
        for char in [',', '-']:
            if char in title:
                return title.split(char)[0]

    return full_name

# Apply the function to replace NaN values in the fullName column
data_scraped['fullName'] = data_scraped.apply(lambda row: replace_nan_fullname(row['fullName'], row['title']), axis=1)
data_scraped.info()

Adriano Ruiz - tripunkt GmbH
Yanyan Ma – MBA Candidate – ESMT Berlin
Armin Krahl - thermondo
Veronica Sartore - Project coordinator - Yad be Yad (Kubus ...
Katharina Andres - Nosto
Siena Chan - cargo.one
Fernanda Pugliero - Löwenzahn Organics GmbH
Lynn Ng – Operations Assistant – Ginza Berlin
Jessica He - HelloFresh
Lara Orrico - Just Eat Takeaway.com
<class 'pandas.core.frame.DataFrame'>
Index: 666 entries, 0 to 680
Data columns (total 69 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   url                        666 non-null    object 
 1   title                      666 non-null    object 
 2   linkedinProfileUrl         615 non-null    object 
 3   email                      172 non-null    object 
 4   linkedinProfile            615 non-null    object 
 5   description                469 non-null    object 
 6   headline                   656 non-null    object 
 7   location                   656 non-nu

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['fullName'] = data_scraped.apply(lambda row: replace_nan_fullname(row['fullName'], row['title']), axis=1)


In [213]:
def replace_nan_fullname(full_name, title):
    if pd.isna(full_name):
        # Split the title by whitespace
        print(title)
        words = title.split()

        # Check if there are only 2 words in the title
        if len(words) == 2:
            return ' '.join(words[:2])

        # Check for ',' or '-' signs and return the words before the first occurrence
        for char in [',', '-','–']:
            if char in title:
                return title.split(char, 1)[0]

    return full_name
data_scraped['fullName'] = data_scraped.apply(lambda row: replace_nan_fullname(row['fullName'], row['title']), axis=1)

yanyan ma – mba candidate – esmt berlin
lynn ng – operations assistant – ginza berlin


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['fullName'] = data_scraped.apply(lambda row: replace_nan_fullname(row['fullName'], row['title']), axis=1)


In [214]:
data_scraped.info()


<class 'pandas.core.frame.DataFrame'>
Index: 666 entries, 0 to 680
Data columns (total 70 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   url                        666 non-null    object 
 1   title                      666 non-null    object 
 2   linkedinProfileUrl         615 non-null    object 
 3   email                      172 non-null    object 
 4   linkedinProfile            615 non-null    object 
 5   description                469 non-null    object 
 6   headline                   656 non-null    object 
 7   location                   656 non-null    object 
 8   firstName                  656 non-null    object 
 9   lastName                   656 non-null    object 
 10  fullName                   666 non-null    object 
 11  connectionDegree           656 non-null    object 
 12  vmid                       656 non-null    object 
 13  userId                     656 non-null    float64
 14 

In [73]:
# Clean data_scraped
data_scraped.fullName = data_scraped.fullName.str.lower()
data_scraped['email'] = data_scraped['email'].fillna('').str.lower()
data_scraped['company'] = data_scraped['company'].fillna('').str.lower().str.strip()

data_scraped['UserID'] = float('nan')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped.fullName = data_scraped.fullName.str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['email'] = data_scraped['email'].fillna('').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['company'] = data_scraped['company'].fillna('').str.lower().str.s

In [58]:
unique_attendees2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   fullName            781 non-null    object
 1   numEvents           781 non-null    int64 
 2   First Name          781 non-null    object
 3   Surname             781 non-null    object
 4   Email               781 non-null    object
 5   Company             781 non-null    object
 6   Your Job Position   781 non-null    object
 7   Choose your role    781 non-null    object
 8   Choose your role.1  781 non-null    object
 9   Seniority           55 non-null     object
 10  UserID              781 non-null    int64 
dtypes: int64(2), object(9)
memory usage: 67.2+ KB


In [57]:
unique_attendees2.Company = unique_attendees.Company.apply(process_name)
unique_attendees2['Your Job Position'] = unique_attendees2['Your Job Position'].apply(process_name)
unique_attendees2['Choose your role'] = unique_attendees2['Choose your role'].apply(process_name)
unique_attendees2['Choose your role.1'] = unique_attendees2['Choose your role.1'].apply(process_name)


In [216]:
data_scraped.fullName

0                Patrick Beeker
1                  Teena Kumari
2              Mischa Gawronski
3              Shuvam Kumar Sah
4                Ayman Chalhoub
                 ...           
676         Ali Akbar Khan, MBA
677      Daniel López Contreras
678                 Rim Bensouf
679               Oguz Celikada
680    Althaf Ali Mattummathodi
Name: fullName, Length: 666, dtype: object

In [195]:
data_scraped['title'] = data_scraped['title'].str.lower()
data_scraped['title']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['title'] = data_scraped['title'].str.lower()


0                                         patrick beeker
1      teena kumari – tech product support intern – h...
2       mischa gawronski – creative director – freelance
3                        shuvam kumar sah – web designer
4                                 ayman chalhoub - clink
                             ...                        
676                  ali akbar khan, mba - delivery hero
677                daniel lópez contreras - masterschool
678         rim bensouf - product owner - sporttotal ...
679          oguz celikada – berlin, berlin, deutschland
680                althaf ali mattummathodi - schaeffler
Name: title, Length: 666, dtype: object

In [74]:
data_scraped['Attendance'] = 0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['Attendance'] = 0


In [196]:
data_scraped.firstName = data_scraped['firstName'].str.lower()
data_scraped.lastName = data_scraped['lastName'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped.firstName = data_scraped['firstName'].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped.lastName = data_scraped['lastName'].str.lower()


In [None]:

username = row_ids['Email'].split("@")[0]
# Match by username in the email
if len(username) != 0:
    if (username in row_scraped['fullName']) or (username in row_scraped['lastName']) or (username in row_scraped['email']):
    # if row_ids['fullName'].contains(username, na=False) | row_ids['Surname'].contains(username, na=False) | row_ids['Email'].contains(username, na=False):
        return True
    
username = username.split(".")[1]
if len(username) != 0:
    if (username in row_scraped['fullName']) or (username in row_scraped['lastName']) or (username in row_scraped['email']):
    # if row_ids['fullName'].contains(username, na=False) | row_ids['Surname'].contains(username, na=False) | row_ids['Email'].contains(username, na=False):
        return True

In [272]:
def custom_user_matching(row_scraped, row_ids):
    '''
    Utility function to match scraped data with UserIDs
    '''
    if row_scraped['fullName'] == row_ids['fullName']:
        print(f"Full name matched {row_ids['fullName']} for user {row_ids['UserID']}")
        return True

    # Compare the emails
    if row_scraped['email'] == row_ids['Email'] :
        return True

    # Match by only last name
    # print(row_scraped['fullName'], row_ids['fullName'])
    last_name = row_scraped['fullName'].split()[-1]
    if len(last_name) > 3:
        if ( last_name in row_ids['fullName']) or (last_name in row_ids['Company']) or (last_name in row_ids['Email']):
        # if row_ids['fullName'].contains(row_scraped['lastName'], na=False) | row_ids['Company'].contains( row_scraped['lastName'], na=False)|row_ids['Email'].contains(row_scraped['lastName'], na=False):
            print(f"Last name from full name matched {last_name} {row_ids['fullName']} for user {row_ids['UserID']}")
            return True  
    last_name = row_scraped['lastName']
    if (not pd.isna(last_name)) and (len(last_name) > 3):
        if ( last_name in row_ids['fullName']) or (last_name in row_ids['Company']) or (last_name in row_ids['Email']):
        # if row_ids['fullName'].contains(row_scraped['lastName'], na=False) | row_ids['Company'].contains( row_scraped['lastName'], na=False)|row_ids['Email'].contains(row_scraped['lastName'], na=False):
            print(f"Last name matched {last_name} {row_ids['fullName']} for user {row_ids['UserID']}")
            return True 

        # Compare first or last name match AND the company match
        #if row_ids['Surname'] in row_scraped['fullName']:
         #   if (row_ids['Company'] in row_scraped['company'].lower()) and ( len(row_ids['Company']) != 0):
          #      return True
           # if (row_ids['Your Job Position'] in row_scraped['title'].lower()) and ( len(row_ids['Your Job Position']) != 0):
       #         return True
       #     if (row_ids['Choose your role'] in row_scraped['title'].lower()) and ( len(row_ids['Choose your role']) != 0):
       #         return True
       #     if (row_ids['Choose your role.1'] in row_scraped['title'].lower()) and ( len(row_ids['Choose your role.1']) != 0):
        #        return True
    if row_ids['First Name'] in row_scraped['fullName']:
        if not pd.isna(row_scraped['company']):
            if (len(row_ids['Company']) != 0) and (row_ids['Company'] in row_scraped['company'].lower()):
                return True
        if (len(row_ids['Your Job Position']) != 0) and (row_ids['Your Job Position'] in row_scraped['title'].lower()) :
            print(f"Your Job Position matched!!! {row_ids['fullName']} {row_scraped['fullName']} for user {row_ids['UserID']}")
            return True
        if (len(row_ids['Choose your role']) != 0) and (row_ids['Choose your role'] in row_scraped['title'].lower()):
            print(f"Choose your role matched!!! {row_ids['fullName']} {row_scraped['fullName']} for user {row_ids['UserID']}")
            return True
        if ( len(row_ids['Choose your role.1']) != 0) and (row_ids['Choose your role.1'] in row_scraped['title'].lower()):
            print(f"Choose your role.1 matched!!! {row_ids['fullName']} {row_scraped['fullName']} for user {row_ids['UserID']}")
            return True
        #print(f"First name matched, but not found :( {row_ids['First Name']} {row_scraped['fullName']}")
    return False

In [218]:
data_scraped.fullName = data_scraped.fullName.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped.fullName = data_scraped.fullName.str.lower()


In [237]:
data_scraped.fullName = data_scraped.fullName.apply(process_name_not_removing)

patrick
['patrick']
beeker
['patrick', 'beeker']
teena
['teena']
kumari
['teena', 'kumari']
mischa
['mischa']
gawronski
['mischa', 'gawronski']
shuvam
['shuvam']
kumar
['shuvam', 'kumar']
sah
['shuvam', 'kumar', 'sah']
ayman
['ayman']
chalhoub
['ayman', 'chalhoub']
elena
['elena']
berendeeva
['elena', 'berendeeva']
mircea
['mircea']
giurca
['mircea', 'giurca']
luísa
['lusa']
suda
['lusa', 'suda']
sidra
['sidra']
aziz
['sidra', 'aziz']
lisa-sophie
['lisasophie']
rätze
['lisasophie', 'rtze']
salvatorina
['salvatorina']
rassu
['salvatorina', 'rassu']
nikita
['nikita']
zavyalov
['nikita', 'zavyalov']
natalia
['natalia']
pánshina
['natalia', 'pnshina']
yuri
['yuri']
orthey
['yuri', 'orthey']
carolina
['carolina']
torres
['carolina', 'torres']
arzamendi
['carolina', 'torres', 'arzamendi']
marcin
['marcin']
fabianczyk
['marcin', 'fabianczyk']
dilan
['dilan']
gevsek
['dilan', 'gevsek']
elisabeta
['elisabeta']
g
['elisabeta', 'g']
mritunjay
['mritunjay']
kumar
['mritunjay', 'kumar']
verma
['mri

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped.fullName = data_scraped.fullName.apply(process_name_not_removing)


In [256]:
data_scraped['lastName'] = data_scraped['lastName'].fillna('').str.lower()
data_scraped.lastName = data_scraped.lastName.apply(process_name_not_removing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['lastName'] = data_scraped['lastName'].fillna('').str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped.lastName = data_scraped.lastName.apply(process_name_not_removing)


In [228]:
import re # REGEX
def basic_cleaning(sentence):

    # 1. Removing whitespaces
    sentence = sentence.strip()

    #5 remove non-alpha characters
    sentence = re.sub(r"[^a-zA-Z]", '', sentence)

    # 2. Lowercasing
    sentence = sentence.lower()

    # 3. Removing numbers
    sentence = ''.join(char for char in sentence if not char.isdigit())

    # remove tags
    sentence = re.sub('<[^<]+>', "", sentence)

    # 4. Removing punctuation
    for punctuation in str.punctuation:
        sentence = sentence.replace(punctuation, '')

    return sentence

In [238]:
def process_name_not_removing(name):
    '''
    Utility function to process names
    '''''
    words = name.lower().strip(" .").split()

    processed_name = []
    # Iterate through each word
    for word in words:
        # Normalize the string to decomposed Unicode form
        word = basic_cleaning(word)
        normalized_s = unicodedata.normalize('NFD', word)        
        # Remove non-spacing marks (special characters)
        stripped_s = ''.join(c for c in normalized_s if unicodedata.category(c) != 'Mn')
        processed_name.append(stripped_s)
    return ' '.join(processed_name)

In [236]:
process_name_not_removing("fásdfg Npoih%7bx😆")

fásdfg
['fsdfg']
npoih%7bx😆
['fsdfg', 'npoihbx']


'fsdfg npoihbx'

In [246]:
unique_attendees2['Company'] = unique_attendees2[['Company']].fillna('')

In [260]:
unique_attendees2.fullName = unique_attendees2.fullName.apply(process_name_not_removing)

In [247]:
unique_attendees2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   fullName            781 non-null    object
 1   numEvents           781 non-null    int64 
 2   First Name          781 non-null    object
 3   Surname             781 non-null    object
 4   Email               781 non-null    object
 5   Company             781 non-null    object
 6   Your Job Position   523 non-null    object
 7   Choose your role    364 non-null    object
 8   Choose your role.1  77 non-null     object
 9   Seniority           55 non-null     object
 10  UserID              781 non-null    int64 
dtypes: int64(2), object(9)
memory usage: 67.2+ KB


In [268]:
data_scraped['UserID'] = float('nan')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_scraped['UserID'] = float('nan')


In [274]:
for j, row_ids in unique_attendees2.iterrows():
    # Iterate through each row in the scraped DataFrame
    found = False
    for index_scraped, row_scraped in data_scraped.iterrows():
        # Check if the rows match using the custom matching function
        if custom_user_matching(row_scraped, row_ids):
            # If a match is found, add the userID from the ids DataFrame to the scraped DataFrame
            data_scraped.at[index_scraped, 'UserID'] = row_ids['UserID']
            # data_scraped.at[index_scraped, 'Attendance'] = row_ids['Attendee Status']
            found = True
            break

    if found == False:
        print(f"Not found userID {row_ids['UserID']}  {row_ids['fullName']} {row_ids['Email']}")

Full name matched abdelrahman elfar for user 0
Last name from full name matched wahba abdo wahba for user 1
Not found userID 2  abhi kasoju kasoju.abhinaya@gmail.com
Not found userID 3  abhishek agarwal aggyabhishek@gmail.com
Full name matched abhishek khatri for user 4
Full name matched abhishek wabale for user 5
Full name matched abolfazl jafarzadehpour for user 6
Not found userID 7  adithya krishnamurthy pkadithya90@gmail.com
Not found userID 8  aditya bhirangi aditya_atul.bhirangi@edu.escp.eu
Full name matched aditya konarde for user 9
Full name matched aditya raghunath for user 10
Full name matched adriano ruiz for user 11
Full name matched adva bendov for user 12
Full name matched agnel joby for user 13
Not found userID 14  ahilan sundaralingam ahilan@be-light.app
Last name from full name matched ahmad ahmad saeed for user 15
Last name from full name matched ahmed ahmed ragab for user 16
Full name matched aino kivinen for user 17
Not found userID 18  akhrorova karomat tarkankarom

In [271]:

# Iterate through each row in the ids DataFrame
for j, row_ids in unique_attendees2.iterrows():
    # Iterate through each row in the scraped DataFrame
    found = False
    for index_scraped, row_scraped in data_scraped.iterrows():
        # Check if the rows match using the custom matching function
        if custom_user_matching(row_scraped, row_ids):
            # If a match is found, add the userID from the ids DataFrame to the scraped DataFrame
            data_scraped.at[index_scraped, 'UserID'] = row_ids['UserID']
            # data_scraped.at[index_scraped, 'Attendance'] = row_ids['Attendee Status']
            found = True
            break
    if found == False:
 #       print(row_ids["fullName"], row_ids["Email"])

Full name matched abdelrahman elfar
Last name from full name matched wahba abdo wahba
First name matched, but not found :( abhi abhishek wabale
First name matched, but not found :( abhi abhishek khatri
First name matched, but not found :( abhi abhishek khanna
First name matched, but not found :( abhi abhinav paul
First name matched, but not found :( abhishek abhishek wabale
First name matched, but not found :( abhishek abhishek khatri
First name matched, but not found :( abhishek abhishek khanna
First name matched, but not found :( abhishek abhishek wabale
Full name matched abhishek khatri
Full name matched abhishek wabale
Full name matched abolfazl jafarzadehpour
First name matched, but not found :( aditya aditya raghunath
First name matched, but not found :( aditya aditya konarde
First name matched, but not found :( aditya aditya varma penmetsa
First name matched, but not found :( aditya aditya raghunath
Full name matched aditya konarde
Full name matched aditya raghunath
Full name ma

In [267]:
data_scraped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 666 entries, 0 to 680
Data columns (total 70 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   url                        666 non-null    object 
 1   title                      666 non-null    object 
 2   linkedinProfileUrl         615 non-null    object 
 3   email                      172 non-null    object 
 4   linkedinProfile            615 non-null    object 
 5   description                469 non-null    object 
 6   headline                   656 non-null    object 
 7   location                   656 non-null    object 
 8   firstName                  656 non-null    object 
 9   lastName                   666 non-null    object 
 10  fullName                   666 non-null    object 
 11  connectionDegree           656 non-null    object 
 12  vmid                       656 non-null    object 
 13  userId                     656 non-null    float64
 14 

In [259]:
unique_attendees2['Your Job Position'] = unique_attendees2['Your Job Position'].fillna('').str.lower()
unique_attendees2['Choose your role'] = unique_attendees2['Choose your role'].fillna('').str.lower()
unique_attendees2['Choose your role.1'] = unique_attendees2['Choose your role.1'].fillna('').str.lower()
unique_attendees2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 781 entries, 0 to 780
Data columns (total 11 columns):
 #   Column              Non-Null Count  Dtype 
---  ------              --------------  ----- 
 0   fullName            781 non-null    object
 1   numEvents           781 non-null    int64 
 2   First Name          781 non-null    object
 3   Surname             781 non-null    object
 4   Email               781 non-null    object
 5   Company             781 non-null    object
 6   Your Job Position   781 non-null    object
 7   Choose your role    781 non-null    object
 8   Choose your role.1  781 non-null    object
 9   Seniority           55 non-null     object
 10  UserID              781 non-null    int64 
dtypes: int64(2), object(9)
memory usage: 67.2+ KB


In [174]:
data_scraped.info()

<class 'pandas.core.frame.DataFrame'>
Index: 666 entries, 0 to 680
Data columns (total 71 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   url                        666 non-null    object 
 1   title                      666 non-null    object 
 2   linkedinProfileUrl         615 non-null    object 
 3   email                      666 non-null    object 
 4   linkedinProfile            615 non-null    object 
 5   description                469 non-null    object 
 6   headline                   656 non-null    object 
 7   location                   656 non-null    object 
 8   firstName                  656 non-null    object 
 9   lastName                   656 non-null    object 
 10  fullName                   666 non-null    object 
 11  connectionDegree           656 non-null    object 
 12  vmid                       656 non-null    object 
 13  userId                     656 non-null    float64
 14 

In [269]:
data_scraped_non_matched = data_scraped[data_scraped.UserID.isnull()]
data_scraped_non_matched = data_scraped_non_matched.drop(columns=['url','linkedinProfileUrl','linkedinProfile','facebookUrl','error','partialScreenshot','phoneNumber','connectedOn',
                                       'imgUrl','profileId','timestamp','schoolUrl','mutualConnectionsText','baseUrl','endorsement6','endorsement5',
                                      'endorsement4','endorsement3','endorsement2','endorsement1','birthday','mutualConnectionsUrl','connectionsUrl',
                                       'connectionsCount','vmid','userId','linkedinSalesNavigatorUrl','connectionDegree','subscribers','UserID'])

In [114]:
unique_attendees2

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID
0,abdelrahman elfar,2,abdelrahman,elfar,abdelrahman.alfar@gmail.com,insurance,engineering,product,,,0
1,abdo wahba,1,abdo,wahba,abdelrahman.wahba@gmail.com,tucuvi,,,,,1
2,abhi kasoju,1,abhi,kasoju,kasoju.abhinaya@gmail.com,insurance,analytics,,,,2
3,abhishek agarwal,1,abhishek,agarwal,aggyabhishek@gmail.com,,founder,,,,3
4,abhishek khatri,1,abhishek,khatri,abhishekkhatri@hotmail.com,talent.io,other,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
776,zimmermann jennifer,1,zimmermann,jennifer,zimmermannjennifer43@gmail.com,mediamarktsaturn,other,,,,776
777,ziyad mohiyudheen,2,ziyad,mohiyudheen,mmziyad@gmail.com,spiced,,,,,777
778,zsolt pap,1,zsolt,pap,papjanoszsolt@yahoo.com,verkstedt,,ecommerce,,,778
779,zsolt szrapko,1,zsolt,szrapko,szrapko.zsolt+eventbrite@gmail.com,academy,product,product,,,779


In [270]:
data_scraped_non_matched

Unnamed: 0,title,email,description,headline,location,firstName,lastName,fullName,company,companyUrl,...,schoolDateRange2,allSkills,skill1,skill2,skill3,skill4,skill5,skill6,website,mail
0,patrick beeker,,Experienced professional with the ability to c...,"General Manager - US Finance, Legal, HR, Accou...","Carrollton, Texas, United States",patrick,beeker,patrick beeker,Orbis Systems,https://www.linkedin.com/company/25018031/,...,1996 - 2001,"Accounting, Management, Strategic Planning, Fo...",Accounting,Management,Strategic Planning,Forecasting,Organizational Development,Business Strategy,,
1,teena kumari – tech product support intern – h...,,"I am a Product Owner at World Simplified UG, a...",Tech product support intern at Holidu,"Berlin, Berlin, Germany",teena,kumari,teena kumari,Holidu,https://www.linkedin.com/company/5257651/,...,Dec 2014 - Jun 2020,"Market Research, Product Strategies, Team Lead...",Market Research,Product Strategies,Team Leadership,Cross-functional Team Leadership,Project Planning,Business Development,,
2,mischa gawronski – creative director – freelance,,Passionate and challenging influencer in strat...,IGP Network | Creative Director,"Berlin, Berlin, Germany",mischa,gawronski,mischa gawronski,Freelance,,...,,"Strategische Planung, Soziale Medien, Visuelle...",Strategische Planung,Soziale Medien,Visuelle Kommunikation,Fotografie,Business Development,Critical Thinking,,
3,shuvam kumar sah – web designer,,Designer & Problem Solver | CS Student | Aimin...,Making Interfaces Sexier - UI/UX/Dev,"Berlin, Berlin, Germany",shuvam kumar,sah,shuvam kumar sah,A.T.E.M. Management GmbH,https://www.linkedin.com/company/75730132/,...,Aug 2021 - Jun 2022,"React.js, Java, JavaScript, Cascading Style Sh...",React.js,Java,JavaScript,Cascading Style Sheets (CSS),Front-End Development,Wireframing,dribbble.com/shuvam_sah,
4,ayman chalhoub - clink,,,Product & Growth,"Berlin, Berlin, Germany",ayman,chalhoub,ayman chalhoub,Clink,https://www.linkedin.com/company/93780977/,...,,"Product Marketing, Start-ups, Cross-functional...",Product Marketing,Start-ups,Cross-functional Team Leadership,Business Development,Mobile Applications,Mobile Technology,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
676,"ali akbar khan, mba - delivery hero",,"Experienced CX and E-Commerce professional, wi...",Global Customer Experience,"Berlin, Berlin, Germany",ali akbar,khan mba,ali akbar khan mba,Delivery Hero,https://www.linkedin.com/company/2393200/,...,2009 - 2012,"Banking, Financial Analysis, Corporate Finance...",Banking,Financial Analysis,Corporate Finance,Portfolio Management,Risk Management,Investment Banking,,
677,daniel lópez contreras - masterschool,,- 100% Customer oriented professional\n- Stron...,"Transforming Education, Customer Focused, avid...","Berlin, Berlin, Germany",daniel,lpez contreras,daniel lpez contreras,Masterschool,https://www.linkedin.com/company/75721942/,...,2001 - 2007,"Public Speaking, Presentation Skills, Customer...",Public Speaking,Presentation Skills,Customer Relationship Management (CRM),Organization Skills,Account Management,Analytical Skills,,
678,rim bensouf - product owner - sporttotal ...,,,Product Owner at Sporttotal.tv\n\nAgile leader...,Berlin Metropolitan Area,rim,bensouf,rim bensouf,SPORTTOTAL TECHNOLOGY GmbH,https://www.linkedin.com/company/66650149/,...,Sep 2009 - Oct 2011,"Business-to-Business (B2B), B2C, Over-the-Top ...",Business-to-Business (B2B),B2C,Over-the-Top Content (OTT),Sport stream,Jira,Agile Leadership,,
679,"oguz celikada – berlin, berlin, deutschland",,Experienced professional in human-centric busi...,Digital Product Consultant,"Berlin, Berlin, Germany",oguz,celikada,oguz celikada,Freelance,,...,2008 - 2011,"Human Computer Interaction, User Research, Qua...",Human Computer Interaction,User Research,Qualitative Research,UX Research,User Experience (UX),Ethnography,,


In [110]:
unique_attendees2

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID
0,abdelrahman elfar,2,abdelrahman,elfar,abdelrahman.alfar@gmail.com,insurance,engineering,product,,,0
1,abdo wahba,1,abdo,wahba,abdelrahman.wahba@gmail.com,tucuvi,,,,,1
2,abhi kasoju,1,abhi,kasoju,kasoju.abhinaya@gmail.com,insurance,analytics,,,,2
3,abhishek agarwal,1,abhishek,agarwal,aggyabhishek@gmail.com,,founder,,,,3
4,abhishek khatri,1,abhishek,khatri,abhishekkhatri@hotmail.com,talent.io,other,,,,4
...,...,...,...,...,...,...,...,...,...,...,...
776,zimmermann jennifer,1,zimmermann,jennifer,zimmermannjennifer43@gmail.com,mediamarktsaturn,other,,,,776
777,ziyad mohiyudheen,2,ziyad,mohiyudheen,mmziyad@gmail.com,spiced,,,,,777
778,zsolt pap,1,zsolt,pap,papjanoszsolt@yahoo.com,verkstedt,,ecommerce,,,778
779,zsolt szrapko,1,zsolt,szrapko,szrapko.zsolt+eventbrite@gmail.com,academy,product,product,,,779


In [119]:
target_string = 'bartholomae' 
unique_attendees2[unique_attendees2['fullName'].str.contains(target_string, na=False) | 
          unique_attendees2['Surname'].str.contains( target_string, na=False)| 
                  unique_attendees2['Email'].str.contains(target_string, na=False)]

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID
168,daniel bartholomae,1,daniel,bartholomae,daniel.bartholomae@optilyz.com,purpose,founder,,,CxO,168


In [None]:
target_string = 'bartholomae'
unique_attendees2[unique_attendees2['fullName'].str.contains(target_string, na=False) | 
          unique_attendees2['Surname'].str.contains( target_string, na=False)| 
                  unique_attendees2['Email'].str.contains(target_string, na=False)]

In [121]:
target_string = 'bartholomae'
data_scraped[data_scraped['lastName'].str.contains(target_string, na=False)]

Unnamed: 0,url,title,linkedinProfileUrl,email,linkedinProfile,description,headline,location,firstName,lastName,...,imgUrl,website,mail,connectedOn,error,phoneNumber,partialScreenshot,facebookUrl,UserID,Attendance
112,https://linkedin.com/in/bartholomae,💻 daniel bartholomae - optilyz,https://www.linkedin.com/in/bartholomae/,daniel@bartholomae.name,https://www.linkedin.com/in/bartholomae/,"If you want to connect, please add a note. I'm...","Entrepreneur, founder and tech enthusiast","Berlin, Berlin, Germany",👨‍💻 daniel,bartholomae,...,https://media.licdn.com/dms/image/C4D03AQFqKHF...,startup-cto.net,daniel@bartholomae.name,"May 11, 2023",,+49 30 20847337,,,,0


In [124]:
target_string = 'olx'
unique_attendees2[unique_attendees2['fullName'].str.contains(target_string, na=False) | 
          unique_attendees2['Company'].str.contains( target_string, na=False)| 
                  unique_attendees2['Email'].str.contains(target_string, na=False)]

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID
442,martin klein,1,martin,klein,martin.klein@funktel.com,olx,other,,,,442
696,tatiana pridchenko,1,tatiana,pridchenko,tapridchenko@gmail.com,olx,student,,,,696


In [128]:
target_string = 'abhinav'
unique_attendees2[unique_attendees2['fullName'].str.contains(target_string, na=False) | 
          unique_attendees2['Company'].str.contains( target_string, na=False)| 
                  unique_attendees2['Email'].str.contains(target_string, na=False)]

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID


In [131]:
target_string = 'miranda' #matched by only last name
filtered_df[filtered_df['fullName'].str.contains(target_string, na=False) | 
          filtered_df['Surname'].str.contains( target_string, na=False)| 
                  filtered_df['Email'].str.contains(target_string, na=False)]

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID,_merge
55,ana miranda,1,ana,miranda,apaulafernandesmiranda@gmail.com,akirolabs,product,product,,,55,left_only


In [None]:
'miranda' in filtered_df['fullName']

In [170]:
filtered_df.loc[55,'fullName']

'ana miranda'

In [171]:
'miranda' in filtered_df.loc[55,'fullName']

True

In [115]:
# Assuming unique_attendees2 is your DataFrame containing unique attendees with column UserID,
# and data_scraped is your DataFrame containing scraped data with column UserID

# Perform left join
merged_df = unique_attendees2.merge(data_scraped[['UserID']], on='UserID', how='left', indicator=True)

# Filter out rows where UserID from data_scraped is not null
filtered_df = merged_df[merged_df['_merge'] == 'left_only']


In [116]:
filtered_df

Unnamed: 0,fullName,numEvents,First Name,Surname,Email,Company,Your Job Position,Choose your role,Choose your role.1,Seniority,UserID,_merge
1,abdo wahba,1,abdo,wahba,abdelrahman.wahba@gmail.com,tucuvi,,,,,1,left_only
2,abhi kasoju,1,abhi,kasoju,kasoju.abhinaya@gmail.com,insurance,analytics,,,,2,left_only
3,abhishek agarwal,1,abhishek,agarwal,aggyabhishek@gmail.com,,founder,,,,3,left_only
7,adithya krishnamurthy,2,adithya,krishnamurthy,pkadithya90@gmail.com,insurance,product,lead,,,7,left_only
8,aditya bhirangi,1,aditya,bhirangi,aditya_atul.bhirangi@edu.escp.eu,memasik,,,,,8,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...
773,zahra rouhi,1,zahra,rouhi,zahrarouhi@gmail.com,varshalalwani94@gmail.com,design,,,,773,left_only
774,zaron arava,1,zaron,arava,zaron.arava@gmail.com,fintech,other,,,,774,left_only
776,zimmermann jennifer,1,zimmermann,jennifer,zimmermannjennifer43@gmail.com,mediamarktsaturn,other,,,,776,left_only
777,ziyad mohiyudheen,2,ziyad,mohiyudheen,mmziyad@gmail.com,spiced,,,,,777,left_only


In [117]:
data_scraped_non_matched.to_csv('../raw_data/data_scraped_non_matched.csv')

In [118]:
filtered_df.to_csv('../raw_data/ids_non_matched.csv')

In [108]:
strin = "sdfssecretghth"

In [109]:
strin.str.contains("secret")

AttributeError: 'str' object has no attribute 'str'