In [1]:
import os
import docx2txt
import spacy
import pandas as pd
import re

nlp = spacy.load('en_core_web_sm')
matcher = spacy.matcher.Matcher(nlp.vocab)

def extract_name(text):
    nlp_text = nlp(text)

    pattern = [{'POS': 'PROPN'}, {'POS': 'PROPN'}, {'POS': 'PROPN'}]  # Updated pattern for three proper nouns
    pattern_single = [{'POS': 'PROPN'}]

    matcher.add('NAME', [pattern, pattern_single], on_match=None)

    matches = matcher(nlp_text)

    excluded_words = ['curriculum', 'vitae', 'resume', 'contact no:+91', 'relationship manager', 'premier acquisition',
                      'location preference', 'business development', 'credit card', 'customer relationship',
                      'key highlights', 'house no', 'branch sales', 'laxmi nagar', 'executive summary',
                      'educational qualification', 'school street', 'room no', 'client relationship', 'curriculam viate', 'professional synopsis', 'business analyst', 'self driven',
                      'business school', 'career objective', 'personal details','cv','(curriculum)','curriculam vitAe',
                      'date','declaration','. \n\n','hyderabad','mumbai','date', 'place', 'hindi', 'english', 'marathi',
                      'known', 'language', 'indian', 'nationality', 'status', 'marital', 'male', 'female', 'gender',
                      'age', 'october', 'birth','education','educationkushal','bio - data','curriculum – vitae','birth','mobile'
                      'icfai business school','photograph','name-','signature','bengali','photo awaited','photo','period',
                      'coimbatore','name','name:']

    full_name = ""
    for match_id, start, end in matches:
        span = nlp_text[start:end]
        span_text_lower = span.text.lower()

        if not any(word in span_text_lower for word in excluded_words):
            # Check if the starting word of the CV is the name
            if nlp_text[0].text.lower() not in excluded_words:
                tokens = [token.text for token in nlp_text[:end]]
                full_name = " ".join(tokens[:3])  # Join a maximum of three words
            else:
                tokens = [token.text for token in span]
                full_name = " ".join(tokens[:3])  # Join a maximum of three words

    return full_name.strip()  # Remove leading and trailing whitespace

def extract_emails(text):
    emails = re.findall(r'[\w\.-]+@[\w\.-]+', text)
    return emails

folder_path = "D:/Work-Assit/SOURCE"  # Replace with the actual folder path
file_list = os.listdir(folder_path)

results = []
for file_name in file_list:
    if file_name.endswith(".docx"):
        file_path = os.path.join(folder_path, file_name)
        text = docx2txt.process(file_path)
        name = extract_name(text)
        emails = extract_emails(text)
        results.append({"File Name": file_name, "Name": name, "Emails": emails})

df1 = pd.DataFrame(results)
df1.to_csv("output.csv", index=False)


In [3]:
df1

Unnamed: 0,File Name,Name,Emails
0,A JOSEPHIN STELLA-BM_Work Assist.docx,A JOSEPHIN STELLA,[deepikasbabu@gmail.com]
1,Aditi chauhan - Key account manager - 8 Yrs 0 ...,INEULE ZELIANG,[aditiZC@yahoo.in]
2,AkshayGurav - workassist.docx,Akshay Gurav,[gakshay2315@gmail.com]
3,Anand mishra - Select relationship manager - 7...,Mishra,[Email-armishra1628@gmail.com]
4,AnimeshMishra - workassist.docx,Animesh Mishra,[animeshroxxx14@gmail.com]
...,...,...,...
167,Workassist_MAHENDRANAM[7y_0m] (1) (1).docx,BIO - DATA,[mahendranmohanan@gmail.com]
168,Workassist_Prashant.docx,Prashant Yogi,"[prashantyogi19@gmail.com, prashantyogi19@gmai..."
169,Workassist_Ranjit mhase - Chief manager legal...,MR . RANJIT,[Ranjitmhase387@gmail.com]
170,Workassist_rupamjha[1y_6m].docx,JHA,[-rupamjhakol@gmail.com]
