# AICHAMP SCREENING TEST - Documented Code

## Submitted by - Sudarshan Paul
## Total Tasks Completed - 4
## Contact - 7908468882
## E-Mail - paulsudarshan98@gmail.com


# TASK-1
### Download 50 public profile PDFs of your connections (randomly) from LinkedIn.

In [420]:
# pip install --user pdfminer
# !pip install --user werkzeug

# Importing the required libraries.

In [1]:
from pdfminer.pdfinterp import PDFResourceManager, PDFPageInterpreter # Converting PDF to text
from pdfminer.converter import TextConverter  # utils for pdf conversion
from pdfminer.layout import LAParams  # utils for pdf conversion
from pdfminer.pdfpage import PDFPage  # utils for pdf conversion
from io import StringIO,BytesIO # utils for pdf conversion
import os

from flask import Flask # For FLASK API Development
from flask import Flask, flash, request, redirect, render_template # Utils for API development
from werkzeug.utils import secure_filename # function to secure a filename before storing it directly on the filesystem.
import urllib.request


import spacy  #Required for Profile Text Analysis ex : stop words removal, text cleaning etc
import pandas as pd #Required for structuring the data in the form of a DataFrame
import en_core_web_sm # This is the largest English Library for Spacy
import string #Required for string manipulation
import nltk # Also required for text analysis



# Function to Convert PDF ---> .txt

# Task -2
### # Extract text from the above PDFs and store them in a CSV.

In [4]:
def convertPDFToText(path):
    rsrcmgr = PDFResourceManager() # Create a PDF resource manager object that stores shared resources.
    retstr = StringIO() # Create instance of StringIO
    laparams = LAParams() # Set parameters for analysis.
    device = TextConverter(rsrcmgr, retstr, laparams=laparams)
    fp = open(path, 'rb') #Opening the PDF file in the respective path.
    interpreter = PDFPageInterpreter(rsrcmgr, device) #Creating an instance of PDFPageInterpreter
    password = "" #In case PDF is password protected
    maxpages = 0 
    caching = True
    pagenos=set()
    for page in PDFPage.get_pages(fp, pagenos, maxpages=maxpages, password=password,caching=caching, check_extractable=True):
        interpreter.process_page(page) 
    fp.close() # Closing the existing open file for security reasons.
    device.close()
    string = retstr.getvalue() #Obtaining all the text values from the PDF after parsing is over
    retstr.close() #Terminating the instance of StringIO
    return string # Returning the string values in plain text format

In [5]:
# convertPDFToText('C://Users//sudar//OneDrive//Desktop//Work Files//AI CHamp//Data//Profile (1).pdf')

# TASK-4 (Part a) and TASK-2 (Put together in single API)
### # The first web API should take a PDF file as input and return the text in it in JSON format.

# Creating the First WEB-API using FLASK.

## What is Flask?
### Flask is a micro web framework written in Python. It is classified as a microframework because it does not require particular tools or libraries.

## What works will this simple API establish?

### 1. Take the PDF file as input from the user.
### 2. Perform validation checks if the input file is as per mentioned format.
### 3. Save the PDF file in the desired location.
### 4. Call the function convertPDFToText() defined above and convert the PDF data to plain text format.
### 5. Once the data has been converted to plan text (string) then writing the same data to a file.txt ex- Profile_1.txt
### 6. Storing the multiple converted text files as a Pandas DataFrame and exporting the dataframe to CSV format and saving in the desired location.
### 7. Returning the PDF Profiles as .json format as a response from the Web API.

In [2]:
profile_df=pd.DataFrame() # Empty DataFrame to store the profile texts.

# Setting the FLASK application framework.
app = Flask(__name__)

# Location to save the uploaded files from the user.
UPLOAD_FOLDER = app.root_path+'\\Data\\uploads'

app.secret_key = "secret key" #value set for the SECRET_KEY configuration key
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER #configuring the app to store the uploaded files
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 #Limiting the size of file that the user upolads.

ALLOWED_EXTENSIONS = set(['pdf']) #Only PDF files are allowed to be as valid input

def allowed_file(filename): # Function to check if the uploaded file is valid or not.
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/')
def upload_form(): # Returns a simple template, provided to upload the files by the user.
    return render_template('upload.html')

@app.route('/', methods=['POST'])
def upload_file():
    global profile_df
    if request.method == 'POST':
        # check if the post request has the files part
        if 'files[]' not in request.files: 
            flash('No file part')
            return redirect(request.url)
        files = request.files.getlist('files[]')
        profile_dict={} # Empty dictionary to store profile details with respective profiles names.
        for file in files: # Looping over multiple files (if multiple files are uploaded by user)
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename) 
                file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) # saving the uploaded PDF file to desired location
                
                uploaded_file_path = os.path.join(UPLOAD_FOLDER+ '\\'+filename) # Path of stored PDF file
                stringFormat = convertPDFToText(uploaded_file_path) # Passing the path for the PDF file in the function to convert to plain text.
                
                # Writing the plain text values (string) obtained after parsing the pdf to external text files.
                with open(app.root_path+'\\Data\\uploads\\converted\\'+filename+'.txt', 'w', encoding='utf-8') as file_txt: 
                    file_txt.write(stringFormat) 
                    file_txt.close()
                
                profile_dict[filename]=stringFormat #storing the profile details as key-value pairs.
                
        # Creating a single DataFrame of all the profiles with their respective Profile Numbers as indexes.        
        profile_df = pd.DataFrame.from_dict(profile_dict,orient='index',columns=['Profiles'])
        
        # Exporting the created Profiles DataFrame to .csv 
        profile_df.to_csv(app.root_path+'\\Data\\uploads\\ProfileCSV\\ProfilesCSV.csv')
        
        flash('File(s) successfully uploaded')
        
        # Finally sending the response in .json format to the client (user).
        return profile_dict

if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [26/Aug/2020 21:13:31] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [26/Aug/2020 21:13:31] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


### Sample of the DataFrame containing Profile Details.

In [23]:
profile_df.head(20) 

Unnamed: 0,Profiles
Profile_1.pdf,\n\n \n\nContact\n9407608477 (Mobile)\npisdak...
Profile_2.pdf,\n\n \n\nContact\nabhishekvtiwari008@gmail.co...
Profile_3.pdf,\n\n \n\n \n\nContact\npkasture2010@gmail.com...
Profile_4.pdf,Contact\nroymilaniitd@gmail.com\n\nwww.linkedi...
Profile_5.pdf,Contact\ntherishabhmalhotra@gmail.co\nm\n\nwww...
Profile_6.pdf,\n\n \n\n \n\nAtul Joshi\n\nA.I Intern at Con...
Profile_7.pdf,\n\n \n\nContact\nVasantham 4/85 new number\n...
Profile_8.pdf,Contact\nharshithard05@gmail.com\n\nwww.linked...
Profile_9.pdf,Contact\nprateek@digitaldefynd.com\n\nwww.link...
Profile_10.pdf,\n\n \n\n \n\nContact\nakash.ravikumar12@gmai...


# TASK-3
### # For every profile data (text), find out the most frequent words and essential words used. It shouldn’t contain stop words (like is, the, an, etc.).

# Function : text_preprocessing()

## What does this function do?
### 1. Take in each word from various Profiles texts as input.
### 2. Perform checks if the particular word or character is a punctuation or unwanted symbols and removing them from the corpus. Since they carry very little or Nil information during text analytics.
### 3. Changing the case of all letters in a word to lowerCase this is done to avoid case sensitive issues which should be avoided during text analysis. For ex : boy , BoY, BOy, boY carry same meaning.
### 4. Removing stop-words from the corpus, which means those words which occur very frequently in a text corpus and removing them would not alter the inherent meaning of the text. This is done to achieve better performace from NLP models or easing the computational cost during pre-processing.
### 5. Performing Lemmatization of the words in order to reduce the different forms of the same word to the root word so as to return the base or dictionary form of a word, which is known as the lemma .


In [10]:

nlp = en_core_web_sm.load() # Loading the spacy English Language Model.

def text_preprocessing(word):
    try:
        rem_char = string.punctuation + string.digits # ALl the characters which must be omitted from the word corpus.

        mod_word = '' #Empty string to concatenate with the resultant strings.
        
        for char in word:
            if (char not in rem_char):
                mod_word += char.lower() # Lower Case all the letters of the word.

        docx = nlp(str(mod_word)) # Creating a Doc Object by tokenizing the text.

        if (len(mod_word) == 0 or docx[0].is_stop): # Removing the stopwords (if any)
            return None
        else:
            return docx[0].lemma_ # Perform Lemmatization of the words to reduce down to the root word or dictionary form.
    except:
        return None # to handle the odd case of characters like 'x02', etc.



# Function to clean the text-corpus and return the clean text

In [11]:
def clean_profile(profile_desc):    
    prc_description = ''
    for word in profile_desc.split():
        mod_word = text_preprocessing(word)
        if (mod_word is not None): 
            prc_description += (mod_word + ' ')
    return prc_description

In [12]:
clean_profile_df = profile_df.applymap(clean_profile) # Using .applymap() to apply the clean_profile function for each profile.

### Exporting the cleaned profile text corpus as excel 

In [24]:
# clean_profile_df.to_csv('C://Users//sudar//OneDrive//Desktop//Work Files//AI CHamp//Data//uploads//ProfileCSV//ProfilesCSV_Cleaned.csv')

In [None]:
clean_profile_df

# Function : pos_tag(s)
### - This function performs Parts of Speech Tagging of each words in a sentence.
### - This POS tagging is useful when we want to determine the most essential words in a corpus based on their use in the sentence.
### - This function returns the tag of a particular word based on which we can determine the relevancy of tha word in the corpus.

# Function : adj_Noun_words()
### 1. This function is used to determine the most relevant words in a text corpus which in case of a person's professional profile will be the Nouns (Name, Place, Object) and Adjectives. This helps to create a good impression in the recruiter's mind, use of good adjectives is really common for professional profiles like LinkedIN.

In [17]:
def pos_tag(s):
    return nltk.pos_tag(s) # Using nltks's pos_tag module to identify pos of each word in the corpus.

def adj_Noun_words(profile_desc):
    essential_list={}
    try:
        token = nltk.word_tokenize(profile_desc) #Tokenizing the text
    except:
        return None
    pos_token = pos_tag(token) # Obtaining the pos of the particular word.
    for i,tag in pos_token:
        # Checking if the word is of type : Noun or and Adjective.
        if tag in ["JJ","JJR","JJS","N"]:  # JJ, JJR, JJS , N denote Adjectives and Noun
            if i in essential_list: 
                essential_list[i]+=1
            else:
                essential_list[i] = 1
    return list(essential_list.keys()) # Returning the Essential Words

In [18]:
clean_profile_df['Essential Words'] = clean_profile_df.applymap(adj_Noun_words)

# Function : freq_words()
### - This function finds the most frequently occuring words in the text corpus for each profile. 
### - This helps us to analyse the individual's professional domain, linguistic behavior, professionalism and even past working experiences as well as job preferences.

In [19]:
def freq_words(profile_desc):
    corpus = list(profile_desc.split())
    word_count=dict()
    for word in corpus:
        if word not in word_count.keys():
            word_count[word] = corpus.count(word)
    sorted_word_count = sorted(word_count.items(), key = lambda kv:(kv[1], kv[0]), reverse=True)
    top_10_words = list() #List to contain top 10 most frequently occuring words in the corpus.
    for kv in sorted_word_count :
        top_10_words.append(kv)
        if len(top_10_words)==10:
            break
    return top_10_words

In [20]:
clean_profile_df['Top 10 Frequent Words'] = clean_profile_df['Profiles'].apply(freq_words)

In [21]:
clean_profile_df.head(20)

Unnamed: 0,Profiles,Essential Words,Top 10 Frequent Words
Profile_1.pdf,contact mobile pisdakgmailcom wwwlinkedincomin...,"[mobile, personal, hindi, english, fundamental...","[(raipur, 6), (startup, 5), (web, 4), (month, ..."
Profile_2.pdf,contact abhishekvtiwarigmailco m wwwlinkedinco...,"[microsoft, analytic, lean, yellow, summary, r...","[(business, 7), (analytic, 6), (management, 3)..."
Profile_3.pdf,contact pkasturegmailcom wwwlinkedincominpriya...,"[priyanka, india, social, spanish, elementary,...","[(india, 14), (community, 9), (learn, 8), (pun..."
Profile_4.pdf,contact roymilaniitdgmailcom wwwlinkedincominm...,"[digital, native, bilingual, english, tutorial...","[(delhi, 9), (month, 6), (technology, 5), (mod..."
Profile_5.pdf,contact therishabhmalhotragmailco m wwwlinkedi...,"[therishabhmalhotra, statistic, native, biling...","[(datum, 17), (⁣⁣, 16), (■, 6), (month, 6), (w..."
Profile_6.pdf,atul joshi ai intern continental automotive co...,"[intern, continental, automotive, artificial, ...","[(skill, 3), (neural, 3), (network, 3), (deep,..."
Profile_7.pdf,contact vasantham new number redfieldspuliakul...,"[new, redfieldspuliakulamappusamy, mobile, dee...","[(university, 17), (learn, 12), (science, 9), ..."
Profile_8.pdf,contact harshithardgmailcom wwwlinkedincomin h...,"[uiux, tamil, english, opencv, intern, ml, sum...","[(learn, 6), (research, 5), (science, 4), (int..."
Profile_9.pdf,contact prateekdigitaldefyndcom wwwlinkedincom...,"[social, personal, digital, present, new, delh...","[(digital, 8), (year, 7), (month, 7), (marketi..."
Profile_10.pdf,contact akashravikumargmailcom wwwlinkedincomi...,"[english, summary, uart, little, udp, datum, q...","[(work, 8), (hardware, 7), (learn, 5), (knowle..."


### Exporting the cleaned profile text corpus DataFrame with Essential and Most Frequently Occuring Words in each of the Profiles

In [22]:
# clean_profile_df.to_csv('C://Users//sudar//OneDrive//Desktop//Work Files//AI CHamp//Data//uploads//ProfileCSV//ProfilesCSV_FINAL.csv')

# TASK-4 (Part-b)
### # The second web API should take text data as input and return the most frequent words and important words (as mentioned in 3) in JSON format.

# Creating the Second WEB-API using FLASK.

## What work will this API establish?

### 1. Read the plain text input file of respective LinkedIn profiles which we converted previously using our very first WEB API.

### 2. Perform the cleaning or text-preprocessing of the each profile text corpus uploaded by the user.

### 3. Determine the Most-Essential words from each of the profile text corpus uploaded by the user and store them.

### 4. Find out the most frequently occuring words from the profile text corpus and store them.

### 5. As a final step, club all the results from multiple input files (or only single) and send as response to the client (user) in the form of .json format.

In [3]:


# Setting the FLASK application framework.
app = Flask(__name__)

# Location to save the uploaded files from the user.
UPLOAD_FOLDER = app.root_path+'\\Data\\uploads'

app.secret_key = "secret key" #value set for the SECRET_KEY configuration key
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER #configuring the app to store the uploaded files
app.config['MAX_CONTENT_LENGTH'] = 16 * 1024 * 1024 #Limiting the size of file that the user upolads.


ALLOWED_EXTENSIONS = set(['txt']) #Only PDF files are allowed to be as valid input

def allowed_file(filename): # Function to check if the uploaded file is valid or not.
    return '.' in filename and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS

@app.route('/')
def upload_form():# Returns a simple template, provided to upload the files by the user.
    return render_template('upload.html')

@app.route('/', methods=['POST'])
def upload_file():
    global profile_df
    if request.method == 'POST':
        # check if the post request has the files part
        if 'files[]' not in request.files:
            flash('No file part')
            return redirect(request.url)
        files = request.files.getlist('files[]')
        profile_dict={} # Empty dictionary to store profile details with respective profiles names.
        NUM=0
        for file in files: # Looping over multiple files (if multiple files are uploaded by user)
            if file and allowed_file(file.filename):
                filename = secure_filename(file.filename)
                file.save(os.path.join(app.config['UPLOAD_FOLDER'], filename)) # saving the uploaded PDF file to desired location
                
                uploaded_file_path = os.path.join(UPLOAD_FOLDER+ '\\'+filename) # Path of stored .txt file
                with open(uploaded_file_path, 'r',encoding='ascii',errors='ignore') as file: # Reading the plain text input file
                    profile_txt = file.read()
                
                profile_txt = clean_profile(profile_txt) # Performing Text-Preprocessing of the profile text corpus
                Essential_Words = adj_Noun_words(profile_txt) # Returns the Most Essential WOrds
                Top_10_words = freq_words(profile_txt) # Returns the most frequently occuring words.
#                 print(profile_txt)
                NUM+=1
                profile_dict['PROFILE '+str(NUM)+' ESSENTIAL WORDS']= Essential_Words #Storing the results in a dict()
                profile_dict['PROFILE '+str(NUM)+' MOST FREQUENT WORDS']= Top_10_words #Storing frequent words in a dict()
        
        flash('File(s) successfully uploaded')
        return profile_dict # Returning the response a .json
            

if __name__ == "__main__":
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [26/Aug/2020 21:13:45] "[37mGET / HTTP/1.1[0m" 200 -
