In [1]:
# Step 1: Import Libraries
import pandas as pd

# Step 2: Load the dataset
df = pd.read_csv("../data/resumes/UpdatedResumeDataSet.csv", encoding='utf-8')

# Step 3: View basic info
df.head()


Unnamed: 0,Category,Resume
0,Data Science,Skills * Programming Languages: Python (pandas...
1,Data Science,Education Details \r\nMay 2013 to May 2017 B.E...
2,Data Science,"Areas of Interest Deep Learning, Control Syste..."
3,Data Science,Skills â¢ R â¢ Python â¢ SAP HANA â¢ Table...
4,Data Science,"Education Details \r\n MCA YMCAUST, Faridab..."


In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download resources if not already
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

# Initialize
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Function to clean text
def clean_text(text):
    # Lowercase
    text = text.lower()
    # Remove URLs and HTML
    text = re.sub(r'<.*?>', '', text)
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)
    # Remove non-alphabetic characters
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    # Tokenize and remove stopwords
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]
    # Join back
    return ' '.join(tokens)

# Apply to the Resume column
df['Cleaned_Resume'] = df['Resume'].apply(clean_text)

# View cleaned text
df[['Category', 'Cleaned_Resume']].head()


[nltk_data] Downloading package stopwords to C:\Users\UMME
[nltk_data]     SALMA\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to C:\Users\UMME
[nltk_data]     SALMA\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to C:\Users\UMME
[nltk_data]     SALMA\AppData\Roaming\nltk_data...


Unnamed: 0,Category,Cleaned_Resume
0,Data Science,skill programming language python panda numpy ...
1,Data Science,education detail may may b e uit rgpv data sci...
2,Data Science,area interest deep learning control system des...
3,Data Science,skill r python sap hana tableau sap hana sql s...
4,Data Science,education detail mca ymcaust faridabad haryana...


In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Step 1: Create the vectorizer
tfidf = TfidfVectorizer(stop_words='english', max_features=1000)

# Step 2: Fit and transform the 'Resume' column
X = tfidf.fit_transform(df['Resume'])

# Output shape
print("TF-IDF Matrix Shape:", X.shape)


TF-IDF Matrix Shape: (962, 1000)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Step 1: Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, df['Category'], test_size=0.2, random_state=42)

# Step 2: Create and train the Naive Bayes classifier
model = MultinomialNB()
model.fit(X_train, y_train)

# Step 3: Make predictions
y_pred = model.predict(X_test)

# Step 4: Evaluate
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Accuracy: 0.9637305699481865
Classification Report:
                            precision    recall  f1-score   support

                 Advocate       1.00      0.33      0.50         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       1.00      0.40      0.57         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      0.92      0.96        12
        