<a href="https://colab.research.google.com/github/Ritkingdom/devtraining-needit-sandiego/blob/main/Untitled17.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
# Import necessary libraries
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK data
nltk.download('stopwords')
nltk.download('wordnet')

# Load the data
data = pd.read_csv('/content/sample_data/train_dataset.csv')

# Check the first few rows of the data
print(data.head())


                           candidateAId                          candidateBId  \
0  8ab47434-09a9-44e6-8c77-f9fd20c57765  d7cbd002-5423-4dae-82d9-3a629ec361bb   
1  53c11bf9-3ec7-4909-a9d1-487692e72415  e957aff1-583b-11ef-8a84-4201ac164110   
2  4617b14d-ca26-11ee-a4ba-42010a400021  a2d2933e-c5bb-11ee-a4ba-42010a400021   
3  c227ffa7-c459-11ee-a4ba-42010a400021  e0abf437-c7b8-11ee-a4ba-42010a400021   
4  fd4e9be6-c4f2-11ee-a4ba-42010a400021  264bd6d6-cca8-11ee-a4ba-42010a400021   

                               winnerId  \
0  8ab47434-09a9-44e6-8c77-f9fd20c57765   
1  e957aff1-583b-11ef-8a84-4201ac164110   
2  4617b14d-ca26-11ee-a4ba-42010a400021   
3  c227ffa7-c459-11ee-a4ba-42010a400021   
4  fd4e9be6-c4f2-11ee-a4ba-42010a400021   

                                candidateATranscript  \
0  {'pairs': [['Interviewer: Hello and welcome to...   
1  {'pairs': [['Interviewer: Hello and welcome to...   
2  {'pairs': [['Interviewer: Hello and welcome to...   
3  {'pairs': [['Interviewer: H

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [12]:
# Load training and testing data from CSV files
train_data = pd.read_csv('/content/sample_data/train_dataset.csv')
test_data = pd.read_csv('/content/sample_data/test_dataset.csv')

# Define text cleaning function
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    # Remove special characters and punctuation
    text = re.sub(r'\d+', '', text)  # Remove digits
    text = text.translate(str.maketrans('', '', string.punctuation))
    text = text.lower()  # Convert to lowercase
    tokens = text.split()  # Split into words
    tokens = [lemmatizer.lemmatize(word) for word in tokens if word not in stop_words]  # Lemmatize and remove stop words
    return ' '.join(tokens)

# Apply cleaning to the transcripts and resume data in training set
train_data['clean_candidateA_transcript'] = train_data['candidateATranscript'].apply(clean_text)
train_data['clean_candidateB_transcript'] = train_data['candidateBTranscript'].apply(clean_text)
train_data['clean_candidateA_resume'] = train_data['candidateAResume'].apply(clean_text)
train_data['clean_candidateB_resume'] = train_data['candidateBResume'].apply(clean_text)

# Combine transcript and resume data for each candidate in training set
train_data['candidateA_combined'] = train_data['clean_candidateA_transcript'] + ' ' + train_data['clean_candidateA_resume']
train_data['candidateB_combined'] = train_data['clean_candidateB_transcript'] + ' ' + train_data['clean_candidateB_resume']

# Apply cleaning to the transcripts and resume data in testing set
test_data['clean_candidateA_transcript'] = test_data['candidateATranscript'].apply(clean_text)
test_data['clean_candidateB_transcript'] = test_data['candidateBTranscript'].apply(clean_text)
test_data['clean_candidateA_resume'] = test_data['candidateAResume'].apply(clean_text)
test_data['clean_candidateB_resume'] = test_data['candidateBResume'].apply(clean_text)

# Combine transcript and resume data for each candidate in testing set
test_data['candidateA_combined'] = test_data['clean_candidateA_transcript'] + ' ' + test_data['clean_candidateA_resume']
test_data['candidateB_combined'] = test_data['clean_candidateB_transcript'] + ' ' + test_data['clean_candidateB_resume']


In [13]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer()

# Fit TF-IDF on the combined candidate data from the training set
all_combined_text_train = pd.concat([train_data['candidateA_combined'], train_data['candidateB_combined']])
tfidf.fit(all_combined_text_train)

# Transform candidate data using the fitted TF-IDF
candidateA_features_train = tfidf.transform(train_data['candidateA_combined'])
candidateB_features_train = tfidf.transform(train_data['candidateB_combined'])
candidateA_features_test = tfidf.transform(test_data['candidateA_combined'])
candidateB_features_test = tfidf.transform(test_data['candidateB_combined'])

# Calculate the difference between candidate A and candidate B features for training and testing
feature_difference_train = candidateA_features_train - candidateB_features_train
feature_difference_test = candidateA_features_test - candidateB_features_test


In [14]:
# Define the target variable for training (1 if candidate A is preferred, 0 otherwise)
y_train = (train_data['winnerId'] == train_data['candidateAId']).astype(int)

# Define the target variable for testing (1 if candidate A is preferred, 0 otherwise)
y_test = (test_data['winnerId'] == test_data['candidateAId']).astype(int)

# Initialize and train a logistic regression model
model = LogisticRegression()
model.fit(feature_difference_train, y_train)

# Make predictions on the test set
y_pred = model.predict(feature_difference_test)

# Evaluate model performance
accuracy = accuracy_score(y_test, y_pred)
print(f"Model Accuracy: {accuracy:.2f}")
print(classification_report(y_test, y_pred))


Model Accuracy: 0.65
              precision    recall  f1-score   support

           0       0.79      0.22      0.34        68
           1       0.63      0.96      0.76        96

    accuracy                           0.65       164
   macro avg       0.71      0.59      0.55       164
weighted avg       0.70      0.65      0.59       164



In [8]:
# Check for imbalance in the roles
role_counts = data['role'].value_counts()
print(role_counts)

# Check if there's a significant imbalance in transcript lengths
data['candidateA_transcript_length'] = data['candidateATranscript'].apply(len)
data['candidateB_transcript_length'] = data['candidateBTranscript'].apply(len)

transcript_length_diff = abs(data['candidateA_transcript_length'] - data['candidateB_transcript_length'])
print(transcript_length_diff.describe())


role
communications                                                  3
ops-or-gtm                                                      3
headhunter-or-recruiter                                         2
writer                                                          2
technical-project-manager-v3                                    2
has-scraping-experience-a                                       1
ml-engineer-v3                                                  1
full-stack-engineer-with-experience-in-next-and-typescript-a    1
backend-engineer-who-s-worked-at-a-startup-v3                   1
leetcode-expert-b                                               1
leetcode-expert-a                                               1
financial-advisor                                               1
marketer                                                        1
Name: count, dtype: int64
count      20.000000
mean     2672.200000
std      1922.317286
min       156.000000
25%       689.000000
50%   

In [9]:
# Example: Using role-specific weights for balanced learning
from sklearn.utils.class_weight import compute_class_weight

class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = {0: class_weights[0], 1: class_weights[1]}

# Retrain model with class weights
model_weighted = LogisticRegression(class_weight=class_weight_dict)
model_weighted.fit(X_train, y_train)

# Evaluate the weighted model
y_pred_weighted = model_weighted.predict(X_test)
accuracy_weighted = accuracy_score(y_test, y_pred_weighted)
print(f"Weighted Model Accuracy: {accuracy_weighted:.2f}")
print(classification_report(y_test, y_pred_weighted))


Weighted Model Accuracy: 0.75
              precision    recall  f1-score   support

           0       1.00      0.50      0.67         2
           1       0.67      1.00      0.80         2

    accuracy                           0.75         4
   macro avg       0.83      0.75      0.73         4
weighted avg       0.83      0.75      0.73         4

