In [18]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [19]:
data = pd.read_csv("UpdatedResumeDataSet.csv")

In [20]:
data.dropna(inplace=True)

In [21]:
# Extract features and labels
X = data['Resume']
y = data['Category']

In [22]:
# Convert text data into numerical features using TF-IDF vectorization
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
X_transformed = vectorizer.fit_transform(X)

In [23]:
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=42)

In [24]:
# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)

In [25]:
# Train the modela
rf_classifier.fit(X_train, y_train)

In [26]:
# Make predictions
y_pred = rf_classifier.predict(X_test)

In [27]:
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred))

Accuracy: 99.48%

Classification Report:

                           precision    recall  f1-score   support

                 Advocate       1.00      1.00      1.00         3
                     Arts       1.00      1.00      1.00         6
       Automation Testing       0.83      1.00      0.91         5
               Blockchain       1.00      1.00      1.00         7
         Business Analyst       1.00      1.00      1.00         4
           Civil Engineer       1.00      1.00      1.00         9
             Data Science       1.00      1.00      1.00         5
                 Database       1.00      1.00      1.00         8
          DevOps Engineer       1.00      0.93      0.96        14
         DotNet Developer       1.00      1.00      1.00         5
            ETL Developer       1.00      1.00      1.00         7
   Electrical Engineering       1.00      1.00      1.00         6
                       HR       1.00      1.00      1.00        12
                   

In [32]:
# Predicting new data
new_resume = [
    "Skills * Programming Languages: Java, Spring Boot, Hibernate, SQL, JavaScript, REST APIs, HTML, CSS. \n" +
    "Experience: Developed multiple web applications using Java and Spring Boot. Implemented microservices architecture, created REST APIs, and optimized database queries. \n" +
    "Tools: IntelliJ IDEA, Maven, Git, Docker, Jenkins."
]
new_resume_transformed=vectorizer.transform(new_resume)
new_prediction = rf_classifier.predict(new_resume_transformed)
print("Predicted Category:", new_prediction[0])

ValueError: X has 33 features, but RandomForestClassifier is expecting 5000 features as input.

In [None]:
import joblib

# Save the trained Random Forest model
model = 'kaggle.pkl'
joblib.dump(rf_classifier, model)

# Save the TF-IDF vectorizer
vectorizer = 'tfidf_vectorizer.pkl'
joblib.dump(vectorizer, vectorizer)

print(f"Model saved as {model} and vectorizer saved as {vectorizer}.")
