In [None]:
import os
import pickle
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns  # For data visualization

from sklearn.ensemble import RandomForestClassifier  # For random forest classification
from sklearn.linear_model import LogisticRegression  # For logistic regression
from sklearn.metrics import classification_report, confusion_matrix  # For evaluating model performance
from sklearn.model_selection import GroupShuffleSplit  # For splitting without separating duplicated rows
from sklearn.model_selection import train_test_split  # For splitting data into training and testing sets
from sklearn.preprocessing import LabelEncoder, OneHotEncoder  # For encoding categorical variables
from sklearn.tree import DecisionTreeClassifier  # For decision tree classification


In [None]:
# Read the candidate profile export and discard its 'Comments' column in one step.
user_p_df = (
    pd.read_csv('User_profile_data_updated_Previous_job.csv')
    .drop(columns=['Comments'])
)
user_p_df.head()

In [None]:
# Load the companion file; its 'Name' column is copied onto the profile table
# in a later cell.
df_name = pd.read_csv('User_profile_data_dummy_values.csv')
df_name.head()

In [None]:
# Positional rename: the 13 names below replace the loaded headers one-for-one,
# so their order has to match the column order of the frame.
cleaned_text_columns = [
    'Clean_Current Role', 'Clean_About me', 'Clean_Education',
    'Clean_Years', 'Clean_Skills', 'Clean_Experience', 'TEXT',
]
offer_columns = [
    'Notice Period', 'Expected CTC', 'Offered Location', 'Offered Salary',
    'Current Salary', 'Current Location',
]
user_p_df.columns = cleaned_text_columns + offer_columns

In [None]:
# Attach the candidate names. pandas aligns this assignment on the row index, so
# it pairs row i of one file with row i of the other.
# NOTE(review): confirm both CSVs list the candidates in the same order.
user_p_df['Name'] = df_name['Name']

In [None]:
# Summarise dtypes and non-null counts now that the columns are renamed and
# 'Name' is attached.
user_p_df.info()

In [None]:
# NOTE(review): identical to the previous cell and nothing changes in between,
# so this prints the same summary a second time.
user_p_df.info()

In [None]:
def extract_salary(string):
    """Convert a free-text salary, or salary range, into one numeric amount.

    Thousands separators (',') and the rupee sign are removed, every number in
    the text is read (decimals included), and the numbers are averaged, so a
    range such as '4.5 - 6' yields its midpoint. Any number below 100 is
    multiplied by 100000 before averaging.

    Args:
        string: Raw cell value. Non-string values (e.g. NaN or an already
            numeric salary) are returned unchanged.

    Returns:
        The mean of the parsed amounts, or NaN when the text holds no number.
    """
    if not isinstance(string, str):
        return string

    cleaned = string.replace(',', '').replace('₹', '')

    amounts = []
    # The optional fractional part keeps '4.25' as one number; matching bare
    # digit runs would read it as 4 and 25.
    for token in re.findall(r'\d+(?:\.\d+)?', cleaned):
        value = float(token)
        if value < 100.0:
            value = value * 100000
        amounts.append(value)

    if not amounts:
        # Avoid np.mean([]), which returns NaN only after emitting a RuntimeWarning.
        return np.nan
    return np.mean(amounts)

In [None]:
# Parse both free-text salary columns into numbers, then fill the gaps with the
# column mean. The filled result is assigned back to the frame:
# calling fillna(inplace=True) on a selected column is chained assignment, which
# warns on recent pandas and leaves the frame unchanged under Copy-on-Write.
for salary_column in ('Current Salary', 'Offered Salary'):
    parsed_salary = user_p_df[salary_column].apply(extract_salary)
    user_p_df[salary_column] = parsed_salary.fillna(parsed_salary.mean())

In [None]:
def _normalize_location(value):
    """Strip padding from a location string and correct the 'Nodia' misspelling."""
    if not isinstance(value, str):
        # Missing values (NaN) pass through untouched; NaN never equals NaN,
        # so two missing locations are not treated as the same place.
        return value
    value = value.strip()
    return 'Noida' if value == 'Nodia' else value


def create_label(row):
    """Derive the synthetic joined / not-joined label for one candidate row.

    A candidate is labelled as joining (1) when either
      * the offered salary is at least 20% above the current salary AND the
        notice period is at most 60 days, or
      * the offered location is the same as the current location.
    Otherwise the label is 0.

    Locations are normalised (whitespace trimmed, 'Nodia' -> 'Noida') before
    they are compared, so the label does not depend on whether the location
    columns have already been cleaned when this function is applied.

    Args:
        row: Mapping / Series with the keys 'Notice Period', 'Offered Salary',
            'Offered Location', 'Current Salary' and 'Current Location'.

    Returns:
        1 if the candidate is considered likely to join, else 0.
    """
    notice_period = row['Notice Period']
    offered_salary = row['Offered Salary']
    current_salary = row['Current Salary']
    offered_location = _normalize_location(row['Offered Location'])
    current_location = _normalize_location(row['Current Location'])

    salary_raise = offered_salary - current_salary
    same_location = offered_location == current_location

    # The notice period is only checked when the raise is large enough
    # (short-circuit), exactly as in the rule stated above.
    if (salary_raise >= 0.2 * current_salary and notice_period <= 60) or same_location:
        return 1  # Joined
    return 0  # Not joined


In [None]:
# Compute the synthetic target row by row. This cell runs before the location
# clean-up cell that follows, so create_label receives the location values as
# they were loaded from the CSV.
user_p_df['label'] = user_p_df.apply(create_label, axis=1)
user_p_df.head()

In [None]:
def _tidy_location(raw):
    """Trim surrounding whitespace and correct the 'Nodia' misspelling."""
    trimmed = raw.strip()
    return 'Noida' if trimmed == 'Nodia' else trimmed


# Missing locations fall back to the placeholder 'India' before being tidied.
for location_column in ('Current Location', 'Offered Location'):
    user_p_df[location_column] = (
        user_p_df[location_column].fillna('India').apply(_tidy_location)
    )

In [None]:
# Snapshot the labelled frame to disk. The locations are still text at this
# point; they are replaced by integer codes in a later cell.
user_p_df.to_csv('Employee_data.csv')

In [None]:
# Count the missing values left in each column.
user_p_df.isna().sum()

In [None]:
# How often each offered location occurs.
user_p_df['Offered Location'].value_counts()

In [None]:
# How often each current location occurs.
user_p_df['Current Location'].value_counts()

In [None]:
# Every distinct location that appears in either column.
set(user_p_df['Current Location']).union(user_p_df['Offered Location'])

In [None]:
# Encode both location columns with ONE shared LabelEncoder, fitted on the union
# of their values, so a given city maps to the same integer whether it appears
# as the current or the offered location.
# NOTE: the columns are overwritten with the integer codes, so re-running this
# cell would re-fit the encoder on integers instead of city names.
le = LabelEncoder()
le.fit(pd.concat([user_p_df['Current Location'], user_p_df['Offered Location']]))
user_p_df['Current Location'] = le.transform(user_p_df['Current Location'])
user_p_df['Offered Location'] = le.transform(user_p_df['Offered Location'])

# Persist the fitted encoder. The target folder is created first so the cell
# also works on a fresh checkout where 'models/' does not exist yet.
os.makedirs('models', exist_ok=True)
with open('models/label_encoder', 'wb') as file:
    pickle.dump(le, file)

# Display the first few rows of the transformed DataFrame. This has to be the
# last expression of the cell, otherwise the notebook does not render it.
user_p_df.head()



In [None]:
# Keep the six model inputs plus the target column 'label'.
model_columns = [
    'Notice Period', 'Expected CTC', 'Offered Location', 'Offered Salary',
    'Current Salary', 'Current Location', 'label',
]
final_data = user_p_df[model_columns]

In [None]:
# Separate the target from the predictors.
y = final_data['label']
X = final_data.drop(columns=['label'])

In [None]:
from sklearn.utils import resample

# Balance the two classes by upsampling the smaller one (sampling with
# replacement) until it matches the size of the larger one.
X_concat = pd.concat([X, y], axis=1)

# Take the majority label from the data rather than assuming it is 1: if the
# assumption were wrong, the larger class would be shrunk to the smaller one's size.
majority_label = y.value_counts().idxmax()
majority_class = X_concat[y == majority_label]
minority_class = X_concat[y != majority_label]

if minority_class.empty:
    # Only one class is present: there is nothing to resample.
    minority_upsampled = minority_class
else:
    minority_upsampled = resample(minority_class,
                                  replace=True,  # sample with replacement
                                  n_samples=len(majority_class),  # match number in majority class
                                  random_state=27)  # reproducible results

# Combine majority class with upsampled minority class
upsampled = pd.concat([majority_class, minority_upsampled])

# Split the balanced data back into features and target
y_upsampled = upsampled[y.name]
X_upsampled = upsampled.drop(y.name, axis=1)


In [None]:
# Class counts after upsampling.
y_upsampled.value_counts()

In [None]:
# Split the data into training and testing sets (80% / 20% of the original rows).
# X_upsampled holds repeated copies of minority rows, and each copy keeps the
# index label of the row it was drawn from. A plain random split would place
# copies of the same row on both sides and inflate the test scores, so the split
# is grouped by index label: all copies of a row land on the same side.
group_splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_positions, test_positions = next(
    group_splitter.split(X_upsampled, y_upsampled, groups=X_upsampled.index.to_numpy())
)
X_train, X_test = X_upsampled.iloc[train_positions], X_upsampled.iloc[test_positions]
y_train, y_test = y_upsampled.iloc[train_positions], y_upsampled.iloc[test_positions]

In [None]:
from sklearn.model_selection import GridSearchCV  # For hyperparameter tuning using GridSearchCV


def _tune_model(display_name, estimator, param_grid):
    """Grid-search `estimator` on the training data and print the best result.

    Args:
        display_name: Model name used in the printed summary.
        estimator: Unfitted scikit-learn classifier.
        param_grid: Dict, or list of dicts, of hyperparameters to search.

    Returns:
        The fitted GridSearchCV object (5-fold cross-validation, all CPU cores).
    """
    grid = GridSearchCV(estimator, param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    print(f"Best parameters for {display_name}:", grid.best_params_)
    print(f"Best score for {display_name}: {grid.best_score_:.4f}")
    return grid


# Logistic Regression hyperparameter tuning.
# The grid is split in two because the 'newton-cg' and 'lbfgs' solvers do not
# support the l1 penalty; listing every solver/penalty pair in one grid makes
# those fits fail.
lr_c_values = [0.001, 0.01, 0.1, 1, 10, 100]
lr_params = [
    {'C': lr_c_values, 'penalty': ['l2'], 'solver': ['newton-cg', 'lbfgs', 'liblinear', 'saga']},
    {'C': lr_c_values, 'penalty': ['l1'], 'solver': ['liblinear', 'saga']},
]
lr_grid = _tune_model("Logistic Regression",
                      LogisticRegression(max_iter=1000, random_state=42),
                      lr_params)

# Decision Tree hyperparameter tuning
dt_params = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
# A fixed random_state makes the selected hyperparameters reproducible.
dt_grid = _tune_model("Decision Tree", DecisionTreeClassifier(random_state=42), dt_params)

# Random Forest hyperparameter tuning
rf_params = {
    'n_estimators': [10, 50, 100, 200],
    'criterion': ['gini', 'entropy'],
    'max_depth': [None, 5, 10, 15, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid = _tune_model("Random Forest", RandomForestClassifier(random_state=42), rf_params)



In [None]:
# Evaluate each tuned model on the held-out test set and save it to disk.
models = {'Logistic Regression': lr_grid, 'Decision Tree': dt_grid, 'Random Forest': rf_grid}
# Predictions are derived from `models`, so the two dicts cannot drift apart.
models_pred = {model_name: model.predict(X_test) for model_name, model in models.items()}

# Create the output folder up front so saving works on a fresh checkout.
os.makedirs('models', exist_ok=True)

# Loop through each model: save it, then print the classification report and confusion matrix
for name, y_pred in models_pred.items():
    # File name is the model name with spaces replaced by underscores.
    with open('models/' + name.replace(' ', '_'), 'wb') as model_file:
        pickle.dump(models[name], model_file)

    print(f"{name}:\n")
    print(classification_report(y_test, y_pred))  # Print the classification report
    cm = confusion_matrix(y_test, y_pred)  # Get the confusion matrix
    print("Confusion Matrix:\n", cm)  # Print the confusion matrix

    # Plot the confusion matrix on its own figure
    fig, ax = plt.subplots(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
    ax.set_title(f"{name} Confusion Matrix")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    plt.show()
    plt.close(fig)  # Release the figure so repeated runs do not accumulate open figures
