In [None]:
import re

import matplotlib.pyplot as plt
import nltk
import numpy as np
import openpyxl
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from openpyxl import load_workbook
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, cohen_kappa_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from wordcloud import WordCloud

# Fetch the NLTK stopword corpus used by the text-cleaning step.
nltk.download('stopwords')


In [None]:
# Workbook file and worksheet that the dataset is read from.
INPUT_DATA_EXCEL_FILE_NAME = r"ResumeDataSet_BrandNew.xlsx"
WORKSHEET_NAME = r"ResumeDataSet"

# Read the chosen worksheet into a DataFrame.
data = pd.read_excel(
    io=INPUT_DATA_EXCEL_FILE_NAME,
    sheet_name=WORKSHEET_NAME,
)

# Text-cleaning helpers applied to the raw resume text before vectorization.
_STOPWORD_CACHE = {}


def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loading it only once."""
    if 'english' not in _STOPWORD_CACHE:
        _STOPWORD_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOPWORD_CACHE['english']


def preprocess_text(text, stop_words=None):
    """Normalize one piece of text for feature extraction.

    Steps, in order: lowercase the text, replace every run of non-ASCII
    characters with a single space, split on whitespace and drop stopwords,
    then re-join the remaining words with single spaces.

    Parameters
    ----------
    text : str or any scalar
        The text to clean. ``None`` / NaN (e.g. an empty spreadsheet cell)
        yields an empty string; any other non-string value is converted
        with ``str()`` first.
    stop_words : iterable of str, optional
        Words to remove. Defaults to NLTK's English stopword list.

    Returns
    -------
    str
        The cleaned, space-separated text.
    """
    if not isinstance(text, str):
        if text is None or pd.isna(text):
            return ''
        text = str(text)

    # Build the lookup once per call as a set: membership tests are O(1),
    # and the NLTK list is no longer re-read for every single word.
    if stop_words is None:
        stop_set = _english_stopwords()
    else:
        stop_set = frozenset(stop_words)

    text = re.sub(r'[^\x00-\x7F]+', ' ', text.lower())
    return ' '.join(word for word in text.split() if word not in stop_set)

# Clean every resume and keep the result next to the raw text.
resume_text = data['Resume']
data['CleanDescription'] = resume_text.apply(preprocess_text)

# Spot-check: raw vs. cleaned text for the first few rows.
preview_columns = ['Resume', 'CleanDescription']
print(data[preview_columns].head())


In [None]:
# Target labels, taken from the 'Category' column.
y = data['Category']

# TF-IDF feature matrix built from word 2-grams through 4-grams of the cleaned text.
# NOTE(review): the vectorizer is fitted on every row, including the rows that are
# later held out as test data - confirm whether that leakage is acceptable.
tfidf = TfidfVectorizer(ngram_range=(2, 4))
X = tfidf.fit_transform(raw_documents=data['CleanDescription'])


In [None]:
# Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)


In [None]:
# Hyperparameter tuning for Logistic Regression with a randomized search.
# Search space: inverse regularization strength, iteration cap and penalty type.
param_dist = {
    'C': np.linspace(0.01, 1, 100),
    'max_iter': [100, 200, 300, 400],
    'penalty': ['l2', 'l1']
}

# The default 'lbfgs' solver rejects penalty='l1', so every sampled L1 candidate
# would fail to fit and be scored as NaN, wasting a large share of the search.
# 'saga' supports both penalties; it is stochastic, hence the fixed random_state.
random_search = RandomizedSearchCV(
    LogisticRegression(solver='saga', random_state=42),
    param_distributions=param_dist,
    n_iter=100,
    cv=3,
    n_jobs=-1,
    random_state=42,
)
random_search.fit(X_train, y_train)

# Best model found; with the default refit=True it is already refitted on X_train.
ModelLrg = random_search.best_estimator_

# Print the winning hyperparameters.
print("Best params:", random_search.best_params_)


In [None]:
# Fit the selected model on the training split.
ModelLrg.fit(X_train, y_train)

# Predict labels for the held-out test split.
y_pred = ModelLrg.predict(X_test)

# Report each evaluation metric in turn, comparing true and predicted labels.
for label, metric in (
    ("Accuracy Score:", accuracy_score),
    ("Cohen Kappa Score:", cohen_kappa_score),
    ("Confusion Matrix:\n", confusion_matrix),
    ("Classification Report:\n", classification_report),
):
    print(label, metric(y_test, y_pred))


In [None]:
# 10-fold cross-validation (shuffled, fixed seed) of the tuned model on the full
# feature matrix, scored by accuracy; the mean over folds is reported.
cv = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(estimator=ModelLrg, X=X, y=y, scoring='accuracy', cv=cv)
print("Cross-validation accuracy:", np.mean(cv_scores))


In [None]:
import pickle

# Save the trained classifier.
with open("model.pkl", 'wb') as file:
    pickle.dump(ModelLrg, file)

# Save the fitted TF-IDF vectorizer too: the classifier only understands the
# feature space this vectorizer produces, so the model file alone cannot be
# used for predictions in a fresh session.
with open("tfidf.pkl", 'wb') as file:
    pickle.dump(tfidf, file)

# Load both artifacts back for future predictions.
# NOTE: pickle.load can execute arbitrary code - only load files you trust.
with open("model.pkl", 'rb') as file:
    loaded_model = pickle.load(file)

with open("tfidf.pkl", 'rb') as file:
    loaded_tfidf = pickle.load(file)


In [None]:
# Word cloud built from the cleaned text of all resumes (at most 200 words).
text = " ".join(data['CleanDescription'])
cloud_builder = WordCloud(background_color='white', max_words=200)
wc = cloud_builder.generate(text)

# Draw the cloud on its own 10x8 figure with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 8))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')
plt.show()


In [None]:
# Test the model on new text data
new_text = "Some resume content for prediction"

# Clean the text the same way the training data was cleaned before vectorizing.
# The vectorizer's vocabulary was learned from 'CleanDescription', so n-grams
# of uncleaned text (still containing stopwords) would not match it.
new_vector = tfidf.transform([preprocess_text(new_text)])
prediction = loaded_model.predict(new_vector)
print("Predicted Class:", prediction)
