In [1]:
import pandas as pd 
import numpy as np 
from sklearn.preprocessing import LabelEncoder,StandardScaler 
from sklearn.impute import KNNImputer
from imblearn.over_sampling import SMOTENC

In [3]:
# Read the raw resume-screening export; df1 is kept as the untouched reference frame.
df1 = pd.read_csv(filepath_or_buffer='../Dataset/raw/AI_Resume_Screening.csv')
# Work on an independent deep copy so df1 is never modified.
df = df1.copy(deep=True)

# FIXING NULL VALUES

In [4]:
# Missing certifications are replaced by the literal string 'None'
# (a real category value, not a Python None), leaving no NaN in the column.
certifications_filled = df['Certifications'].fillna(value='None')
df['Certifications'] = certifications_filled
remaining_nulls = df['Certifications'].isna().sum()
print("No of Null values:", remaining_nulls)

No of Null values: 0


# OUTLIER REMOVAL

In [5]:
# Overwrite the score with log(1/score). The column is replaced in place, so
# every later cell sees the transformed values, and re-running this cell on
# its own would apply the transform a second time.
score_col = 'AI Score (0-100)'
df[score_col] = np.log(1/df[score_col])

# Three-sigma band around the mean of the transformed score.
score_mean = df[score_col].mean()
score_std = df[score_col].std()
max_limit = score_mean + 3*score_std
min_limit = score_mean - 3*score_std

# Keep only rows strictly inside the band (both bounds exclusive).
inside_band = (df[score_col] > min_limit) & (df[score_col] < max_limit)
df = df[inside_band]


# FIXING CLASS IMBALANCE 

In [6]:
# Show the shape (rows, columns) of the subset belonging to each target class.
decision = df['Recruiter Decision']
hire_rows = df[decision == 'Hire']
reject_rows = df[decision == 'Reject']
print("No of entires with Decision as Hire:", hire_rows.shape)
print("No of entires with Decision as Reject:", reject_rows.shape)

No of entires with Decision as Hire: (812, 11)
No of entires with Decision as Reject: (167, 11)


In [7]:
# Separate the predictors from the target label.
x = df.drop(columns=['Recruiter Decision'])
y = df['Recruiter Decision']

# SMOTENC oversamples both classes up to 1000 rows each, treating the listed
# string columns as categorical.
# random_state is pinned so the synthetic rows (and therefore the saved
# dataset) are identical on every Restart & Run All.
# NOTE(review): x still contains every non-target column, including the
# identifier-like 'Resume_ID' (numeric) and 'Name' (categorical); consider
# dropping them before resampling - confirm they are not needed downstream.
over_sampler = SMOTENC(
    categorical_features=['Name', 'Skills', 'Education', 'Certifications', 'Job Role'],
    sampling_strategy={'Hire': 1000, 'Reject': 1000},
    random_state=42,
)
x_resampled, y_resampled = over_sampler.fit_resample(x, y)

In [8]:
# Put the resampled predictors and the resampled target side by side in one frame.
new_data = pd.concat(objs=[x_resampled, y_resampled], axis='columns')

In [9]:
# Print a summary of the resampled frame: row count, column names,
# non-null counts and dtypes.
new_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Resume_ID               2000 non-null   int64  
 1   Name                    2000 non-null   object 
 2   Skills                  2000 non-null   object 
 3   Experience (Years)      2000 non-null   int64  
 4   Education               2000 non-null   object 
 5   Certifications          2000 non-null   object 
 6   Job Role                2000 non-null   object 
 7   Salary Expectation ($)  2000 non-null   int64  
 8   Projects Count          2000 non-null   int64  
 9   AI Score (0-100)        2000 non-null   float64
 10  Recruiter Decision      2000 non-null   object 
dtypes: float64(1), int64(4), object(6)
memory usage: 172.0+ KB


# FEATURE SELECTION BASED ON ALREADY PERFORMED ANALYSIS

In [10]:
# Columns carried forward to the preprocessed dataset (three predictors plus the target).
required_features = [
    'Experience (Years)',
    'Projects Count',
    'Recruiter Decision',
    'AI Score (0-100)',
]

df_to_use = new_data.loc[:, required_features]
# Preview two randomly chosen rows of the selection.
df_to_use.sample(n=2)

Unnamed: 0,Experience (Years),Projects Count,Recruiter Decision,AI Score (0-100)
164,2,7,Hire,-4.382027
1514,1,2,Reject,-3.912023


# SCALING OF THE FEATURES IS NOT REQUIRED HERE AS THE RANGE OF THE NUMERICAL FEATURES IS SMALL

In [11]:
# Inspect the value ranges of the two count-like predictors.
# Series.max()/Series.min() are vectorised and skip NaN, whereas the Python
# builtins max()/min() loop over the elements one by one and give
# order-dependent results if a NaN is present.
experience_years = df_to_use['Experience (Years)']
projects_count = df_to_use['Projects Count']

max_years_of_experience = experience_years.max()
min_years_of_experience = experience_years.min()
max_no_of_projects = projects_count.max()
min_no_of_projects = projects_count.min()

print("Max number of Projects:", max_no_of_projects)
print("Min number of Projects:", min_no_of_projects)
print("Max years of experience:", max_years_of_experience)
print('Min years of experience', min_years_of_experience)

Max number of Projects: 10
Min number of Projects: 0
Max years of experience: 10
Min years of experience 0


# ENCODING ISN'T NECESSARY AS ONLY THE TARGET FEATURE IS CATEGORICAL

In [13]:
# Persist the selected columns without the index so the CSV holds only the features and target.
df_to_use.to_csv(path_or_buf='../Dataset/preprocessed/Preprocessed_dataset.csv', index=False)