In [26]:
import pandas as pd

# Location of the merged CSV export that the rest of the notebook works from
file_path = r'C:\Users\birdc\Desktop\merged.csv'
data = pd.read_csv(file_path)

# Preview: leading rows together with the column index, shown as one tuple
data.head(), data.columns


(                          Name                       Contact Email  \
 0           Rakesh Gupta Kotha  rakesh.guptakotha.20cse@bmu.edu.in   
 1               Rakshit Sharma         rakshit.sharma606@gmail.com   
 2  Ravindhranadh Thakkellapati         ravindhranadh1969@gmail.com   
 3                Razina Khanam            razinakhanam06@gmail.com   
 4              Rishi Raj Singh              rishiraj2954@gmail.com   
 
      Phone Number                                   LinkedIn Profile  \
 0      9391137589  https://www.linkedin.com/in/rakesh-gupta-831a8...   
 1      7065847740  http://www.linkedin.com/in/rakshit-sharma-176a...   
 2      7288918840  https://www.linkedin.com/in/ravindhranadh-thak...   
 3      7893080627                                                NaN   
 4  +91 7014975299                                                NaN   
 
                          GitHub Profile Job Title (1st Experience)  \
 0       https://github.com/rakeshguptak              WEB DEV

In [27]:
# Merge the three skill columns into one text field per row.
# Missing entries become empty strings so the concatenation never yields NaN.
proficient, experienced, familiar = (
    data[column].fillna("")
    for column in ("Skills - Proficient", "Skills - Experienced", "Skills - Familiar")
)
data["Skills - Combined"] = proficient + " " + experienced + " " + familiar


In [28]:
from nltk.tokenize.punkt import PunktSentenceTokenizer

def preprocess_text_nltk(text):
    """Normalize free text into a space-separated string of lemmatized keywords.

    The text is lower-cased and split into word-level tokens; tokens that are
    not purely alphanumeric or that appear in ``stop_words`` are dropped, and
    the remaining tokens are lemmatized.

    Relies on the module-level names ``lemmatizer`` and ``stop_words``, which
    must already be defined when this function is *called*.

    Args:
        text: The raw text to clean. Null values (``None``/NaN) are accepted.

    Returns:
        str: The cleaned tokens joined by single spaces, or ``""`` when
        ``text`` is null.
    """
    if pd.isnull(text):
        return ""

    # Imported locally because this cell only imports PunktSentenceTokenizer.
    from nltk.tokenize import word_tokenize

    # Tokenize at word level. The previous PunktSentenceTokenizer call produced
    # whole sentences, which nearly always failed the isalnum() check below
    # (any space or punctuation makes it False), so the result was almost
    # always empty.
    tokens = word_tokenize(text.lower())
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word.isalnum() and word not in stop_words]
    return " ".join(tokens)


In [29]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import pandas as pd

# Fetch the NLTK resources used below (run only once)
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Shared state for preprocessing: the English stopword set and a WordNet lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function using NLTK
def preprocess_text_nltk(text):
    """Turn raw text into a space-separated string of lemmatized keywords.

    Null input yields an empty string. Otherwise the text is lower-cased and
    word-tokenized; tokens that are not purely alphanumeric, or that are in
    the module-level ``stop_words`` set, are discarded, and the survivors are
    lemmatized with the module-level ``lemmatizer``.
    """
    if pd.isnull(text):
        return ""

    kept = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation / mixed-symbol tokens and stopwords
        if not token.isalnum() or token in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(token))
    return " ".join(kept)

# Clean every student's combined skills text and keep it in its own column
combined_skills = data["Skills - Combined"]
data["Processed Skills"] = combined_skills.apply(preprocess_text_nltk)

# Function to recommend students based on job description
def recommend_students_nltk(job_description, top_n=5):
    """Rank the students in ``data`` by how well their skills match a job description.

    The job description is cleaned with ``preprocess_text_nltk`` and placed in
    front of the "Processed Skills" texts; a TF-IDF model is fitted on that
    combined list on every call, and each student is scored by cosine
    similarity against the job description.

    Args:
        job_description: Raw job description text.
        top_n: Number of highest-scoring students to return.

    Returns:
        A DataFrame with the columns "Name", "Contact Email",
        "Skills - Combined" and "Similarity Score", ordered from highest to
        lowest score.
    """
    output_columns = ["Name", "Contact Email", "Skills - Combined", "Similarity Score"]

    # Row 0 of the corpus is the cleaned query; rows 1.. follow the row order of `data`
    query = preprocess_text_nltk(job_description)
    corpus = [query] + data["Processed Skills"].tolist()

    tfidf = TfidfVectorizer().fit_transform(corpus)

    # Similarity of the query row against every student row, as a flat array
    scores = cosine_similarity(tfidf[:1], tfidf[1:]).flatten()

    # Ascending argsort reversed -> positions of the best matches first
    best = np.argsort(scores)[::-1][:top_n]

    ranked = data.iloc[best].copy()
    ranked["Similarity Score"] = scores[best]
    return ranked[output_columns]

# Demo: rank students against a sample job description
job_desc = (
    "Looking for a software developer skilled in Python, Java, and SQL "
    "with experience in web development."
)
recommended_students_nltk = recommend_students_nltk(job_desc)
recommended_students_nltk


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\birdc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\birdc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\birdc\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0,Name,Contact Email,Skills - Combined,Similarity Score
20,Suvarsha Venumbaka,Suvarsha.venumbaka@gmail.com,C++ • SQL • Java • Python • CSS • HTML Python ...,0.131513
37,Vineet,vineet.20cse@bmu.edu.in,"C, Python, CSS, HTML Python, C++ C, Java, SQL",0.129556
28,Utsav Sharma,utsav.dhariwal@gmail.com,"C++, Java, Python, SQL, HTML, CSS C++, Python R",0.129556
52,Khushi Gupta,khushigupta27kg@gmail.com,"Java, Python, C, SQL, JavaScript, HTML, CSS Ja...",0.12822
39,Yakkala Sri Praneeth,yakkala.sripraneeth.20cse@bmu.edu.in,"Java, Python, SQL, HTML, CSS Python, Java, C J...",0.12822
