## EDA

In [1]:
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler



In [2]:
import pandas as pd

# Replace 'filename.csv' with the path to your CSV file
df = pd.read_csv('function_assignment.csv')

# Display the first few rows of the DataFrame
print(df.head())


                                          function                 job_title  \
0  Sales/ Business Development/ Account Management  Head - Digital Portfolio   
1                       IT/ Information Technology             Software Head   
2                            Software Architecting         iPhone Developers   
3                            Software Architecting    Html/css Developer Job   
4                            Software Architecting     Asp.net Developer Job   

                                                 jds  
0                                                NaN  
1  10-15 years of experience in a VFX facility Ad...  
2  Candidate Should have Strong OO design and pro...  
3  Job Description Must have at -least . Years ex...  
4  Develop ASP.netweb applicationsPerform unit te...  


In [3]:

print('Shape:', df.shape)
print('Columns:', df.columns.tolist())


print(df.info())
print(df.describe())


print(df.isnull().sum())


Shape: (10145, 3)
Columns: ['function', 'job_title', 'jds']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10145 entries, 0 to 10144
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   function   10145 non-null  object
 1   job_title  10145 non-null  object
 2   jds        8349 non-null   object
dtypes: object(3)
memory usage: 237.9+ KB
None
                                               function  \
count                                             10145   
unique                                              351   
top     Sales/ Business Development/ Account Management   
freq                                               1245   

                           job_title  \
count                          10145   
unique                          7236   
top     Business Development Manager   
freq                              41   

                                                      jds  
count                            

In [4]:
# # Handling Missing Values
# most_frequent_value = df['jds'].mode()[0]  # Calculate the most frequent value
# df['jds_clean'] = df['jds'].fillna(most_frequent_value)


In [5]:
# Check Duplicates
df.duplicated().sum()


1505

In [6]:
# Check for duplicates in specific columns
duplicates_in_columns = df.duplicated(subset=['function', 'job_title', 'jds']).sum()
print("Number of duplicates in specified columns:", duplicates_in_columns)


Number of duplicates in specified columns: 1505


In [7]:
# Remove all duplicates from the DataFrame
df_cleaned = df.drop_duplicates()

# Check the shape of the cleaned DataFrame to verify duplicates were removed
print("Shape after removing duplicates:", df_cleaned.shape)

Shape after removing duplicates: (8640, 3)


## Data Preprocessing

In [8]:
# data preprocess

# Drop rows where 'jds' is missing
df = df.dropna(subset=['jds'])

# Initialize a TfidfVectorizer
tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

features = tfidf.fit_transform(df.jds).toarray()

features_df = pd.DataFrame(features, columns=tfidf.get_feature_names_out())
preprocessed_df = pd.concat([df.reset_index(drop=True), features_df], axis=1)


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier


X = features
y = df['job_title'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

## Model Training and selction


In [10]:
# Model Selection and Training
model = RandomForestClassifier(n_estimators=20, random_state=42)
model.fit(X_train, y_train)

In [14]:
# Model Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


Accuracy: 0.262874251497006


In [15]:
# Classification Report
print(classification_report(y_test, y_pred))

                                                                                        precision    recall  f1-score   support

                                           'Information Security & Compliance Manager'       0.00      0.00      0.00         0
                                                                        .NET Team Lead       0.00      0.00      0.00         0
                                                               .Net Developer - C#/asp       0.00      0.00      0.00         0
                                                               3D Animator/ 3D Modeler       0.00      0.00      0.00         0
                               A.con / Con / Sr, Con - Cybersecurity - Generic Profile       0.00      0.00      0.00         0
                                                                          AD - US Tax.       0.00      0.00      0.00         0
                                                              ADF Developer/ Team Lead       0.00      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [16]:
# Feature Importance
feature_importance = model.feature_importances_
feature_names = tfidf.get_feature_names_out()
important_features = sorted(zip(feature_names, feature_importance), key=lambda x: x[1], reverse=True)[:10]
print("Top 10 Important Features:")
for feature in important_features:
    print(feature)

Top 10 Important Features:
('experience', 0.0012972382240127895)
('project', 0.001141075182568446)
('business', 0.0010178691081956697)
('management', 0.0009628747954490444)
('team', 0.0009065787661781153)
('years', 0.0008796353590077545)
('skills', 0.000877598558309808)
('development', 0.0008724104751790544)
('knowledge', 0.0008300762447157508)
('sales', 0.0007779792838593991)


In [30]:
from sklearn.svm import SVC

# Model Selection and Training with Support Vector Machine (SVM)
svm_model = SVC(kernel='linear', random_state=42)
svm_model.fit(X_train, y_train)

# Model Evaluation for SVM
svm_y_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_y_pred)
print("SVM Accuracy:", svm_accuracy)

# Classification Report for SVM
print("SVM Classification Report:")
print(classification_report(y_test, svm_y_pred))


SVM Accuracy: 0.21377245508982035
SVM Classification Report:
                                                                                        precision    recall  f1-score   support

                                              AGM - Sales & Marketing (Cement Company)       0.00      0.00      0.00         1
                                                                        AGM - Treasury       0.00      0.00      0.00         1
                                      AGM / DGM - Production - Pharmaceutical Industry       0.00      0.00      0.00         1
                                                         AGM / GM Purchase & Contracts       0.00      0.00      0.00         1
                                                               AGM / Manager ( Legal )       0.00      0.00      0.00         1
                                                                    AGM /DGM - HR / IR       0.00      0.00      0.00         1
                                          

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
