In [7]:
# Import required libraries
import pandas as pd
import numpy as np
import sys
import pickle

# Add src directory to path
sys.path.append('../src')

# Import preprocessing module
from preprocess import clean_text, clean_resumes

# TensorFlow for tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Scikit-learn for label encoding
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

# Confirm that every import above resolved (emoji restored from mis-encoded text).
print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


## 1. Load Dataset

In [8]:
# Load the resume dataset from a path relative to this notebook's directory.
# Later cells read its 'Resume' column (text to clean) and 'Category' column
# (the label that gets encoded).
df = pd.read_csv('../dataset/resumes.csv')

# Report row/column counts and the number of distinct job categories.
print(f"Dataset loaded: {df.shape[0]} resumes, {df.shape[1]} columns")
print(f"Job categories: {df['Category'].nunique()}")
# Bare last expression so the notebook renders the first five rows as a table.
df.head()

Dataset loaded: 30 resumes, 2 columns
Job categories: 14


Unnamed: 0,Resume,Category
0,Experienced Python developer with 5+ years in ...,Data Scientist
1,Senior Java developer with expertise in Spring...,Java Developer
2,"Full-stack web developer skilled in React, Nod...",Web Developer
3,"Data scientist with expertise in Python, R, an...",Data Scientist
4,"DevOps engineer with experience in Docker, Kub...",DevOps Engineer


## 2. Text Cleaning

In [9]:
# Demonstrate clean_text on the first resume: raw text first, cleaned text second.
# Only the first 200 characters of each version are shown.
banner = "=" * 80
first_resume = df['Resume'].iloc[0]

print("\n" + banner)
print("BEFORE CLEANING:")
print(banner)
print(first_resume[:200])

cleaned_sample = clean_text(first_resume)

print("\n" + banner)
print("AFTER CLEANING:")
print(banner)
print(cleaned_sample[:200])


BEFORE CLEANING:
Experienced Python developer with 5+ years in machine learning and deep learning. Proficient in TensorFlow, PyTorch, scikit-learn, and NLP. Built recommendation systems and predictive models. Strong u

AFTER CLEANING:
experienced python developer with years in machine learning and deep learning proficient in tensorflow pytorch scikitlearn and nlp built recommendation systems and predictive models strong understandi


In [10]:
# Clean every resume and store the result next to the raw text.
# Stopwords are kept (remove_stopwords=False).
# NOTE(review): presumably clean_resumes returns one cleaned string per input
# resume, in order — confirm against src/preprocess.py.
print("\n🔄 Cleaning all resumes...")
df['Cleaned_Resume'] = clean_resumes(df['Resume'].tolist(), remove_stopwords=False)

print("✅ Text cleaning complete!")
print("\nSample cleaned resume:")
# Preview only the first 150 characters of the first cleaned resume.
print(df['Cleaned_Resume'].iloc[0][:150])


🔄 Cleaning all resumes...
✅ Text cleaning complete!

Sample cleaned resume:
experienced python developer with years in machine learning and deep learning proficient in tensorflow pytorch scikitlearn and nlp built recommendatio


## 3. Tokenization

In [11]:
# Initialize tokenizer
# num_words caps how many of the most frequent words are kept when texts are
# converted to sequences (helps reduce model complexity); anything else is
# replaced by the '<OOV>' token.
tokenizer = Tokenizer(num_words=5000, oov_token='<OOV>')

# Fit tokenizer on cleaned text
tokenizer.fit_on_texts(df['Cleaned_Resume'])

# Number of distinct ids the sequences can contain, including id 0 for padding.
# word_index lists every word seen during fitting, even those beyond the
# num_words cap, so the count is capped at num_words to stay correct once the
# corpus grows past that limit.
vocab_size = min(tokenizer.num_words, len(tokenizer.word_index) + 1)

print("\n✅ Tokenizer fitted!")
print(f"Vocabulary size: {vocab_size}")
print("\nMost common words:")
# word_index is ordered with the OOV token first, then by descending frequency.
print(list(tokenizer.word_index.items())[:20])


✅ Tokenizer fitted!
Vocabulary size: 286

Most common words:
[('<OOV>', 1), ('and', 2), ('with', 3), ('in', 4), ('experience', 5), ('proficient', 6), ('developer', 7), ('strong', 8), ('of', 9), ('expertise', 10), ('data', 11), ('knowledge', 12), ('python', 13), ('learning', 14), ('built', 15), ('react', 16), ('building', 17), ('web', 18), ('engineer', 19), ('testing', 20)]


In [12]:
# Convert each cleaned resume into a list of integer word ids using the
# fitted tokenizer.
sequences = tokenizer.texts_to_sequences(df['Cleaned_Resume'])

print("\n✅ Text converted to sequences!")
print("\nSample sequence (first 20 tokens):")
print(sequences[0][:20])
# Show the start of the matching cleaned text so ids can be compared to words.
print(f"\nOriginal text: {df['Cleaned_Resume'].iloc[0][:100]}")


✅ Text converted to sequences!

Sample sequence (first 20 tokens):
[109, 13, 7, 3, 21, 4, 22, 14, 2, 37, 14, 6, 4, 38, 110, 62, 2, 63, 15, 111]

Original text: experienced python developer with years in machine learning and deep learning proficient in tensorfl


## 4. Sequence Padding

In [13]:
# Token count of every tokenized resume; these statistics inform the choice
# of padding length.
sequence_lengths = list(map(len, sequences))

# Summarise the distribution: extremes first, then central tendency.
print("Sequence length statistics:")
print(f"Min length: {min(sequence_lengths)}")
print(f"Max length: {max(sequence_lengths)}")
print(f"Mean length: {np.mean(sequence_lengths):.2f}")
print(f"Median length: {np.median(sequence_lengths):.2f}")

Sequence length statistics:
Min length: 22
Max length: 33
Mean length: 25.30
Median length: 24.50


In [14]:
# Fixed length every sequence is padded or truncated to.
# 100 is well above the longest tokenized resume in the recorded run (33
# tokens per the statistics printed earlier), so nothing is truncated with this
# dataset and most of each row is zero padding. Lower it for a more compact
# input, or raise it if longer resumes are added.
max_length = 100  # You can adjust based on your data

# Pad sequences: zeros are appended at the end ('post'), and sequences longer
# than max_length are cut at the end ('post').
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post', truncating='post')

print("\n✅ Sequences padded!")
print(f"Max sequence length: {max_length}")
print(f"Padded sequences shape: {padded_sequences.shape}")
print("\nSample padded sequence:")
print(padded_sequences[0])


✅ Sequences padded!
Max sequence length: 100
Padded sequences shape: (30, 100)

Sample padded sequence:
[109  13   7   3  21   4  22  14   2  37  14   6   4  38 110  62   2  63
  15 111  64   2  65  39   8  40   9 112 113 114 115   2 116   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0]


## 5. Label Encoding

In [15]:
# Initialize label encoder
label_encoder = LabelEncoder()

# Encode categories: each distinct job-category string becomes an integer id.
encoded_labels = label_encoder.fit_transform(df['Category'])

print("\n✅ Labels encoded!")
print(f"Number of classes: {len(label_encoder.classes_)}")
print("\nClass mapping:")
# A class's position in classes_ is its encoded id.
for i, category in enumerate(label_encoder.classes_):
    print(f"{i}: {category}")


✅ Labels encoded!
Number of classes: 14

Class mapping:
0: Backend Developer
1: Business Analyst
2: Cloud Architect
3: Data Analyst
4: Data Engineer
5: Data Scientist
6: DevOps Engineer
7: Frontend Developer
8: Java Developer
9: Mobile Developer
10: Python Developer
11: QA Engineer
12: Security Analyst
13: Web Developer


In [16]:
# Display sample data
# Trace the first resume through each preprocessing stage: raw text, cleaned
# text, padded id sequence, category name, and encoded label.
print("\nSample data after preprocessing:")
# Text previews are limited to the first 100 characters.
print(f"Original text: {df['Resume'].iloc[0][:100]}...")
print(f"Cleaned text: {df['Cleaned_Resume'].iloc[0][:100]}...")
# Only the first 20 ids of the padded row are shown.
print(f"Sequence: {padded_sequences[0][:20]}...")
print(f"Category: {df['Category'].iloc[0]}")
print(f"Encoded label: {encoded_labels[0]}")


Sample data after preprocessing:
Original text: Experienced Python developer with 5+ years in machine learning and deep learning. Proficient in Tens...
Cleaned text: experienced python developer with years in machine learning and deep learning proficient in tensorfl...
Sequence: [109  13   7   3  21   4  22  14   2  37  14   6   4  38 110  62   2  63
  15 111]...
Category: Data Scientist
Encoded label: 5


## 6. Train-Test Split

In [17]:
# Split data into training and testing sets (without stratify due to small sample size)
# 20% of the rows go to the test set; random_state is fixed so the partition
# is the same on every run.
# NOTE(review): the tokenizer was fitted on all cleaned resumes before this
# split, so words that occur only in test rows are already in the vocabulary —
# confirm this is acceptable for the evaluation.
# NOTE(review): without stratify, a category may end up only in the training
# set or only in the test set — verify before trusting test metrics.
X_train, X_test, y_train, y_test = train_test_split(
    padded_sequences, 
    encoded_labels, 
    test_size=0.2, 
    random_state=42
)

# Report how many rows landed on each side and the resulting array shapes.
print(f"\n✅ Data split complete!")
print(f"Training samples: {len(X_train)}")
print(f"Testing samples: {len(X_test)}")
print(f"\nTraining set shape: {X_train.shape}")
print(f"Testing set shape: {X_test.shape}")


✅ Data split complete!
Training samples: 24
Testing samples: 6

Training set shape: (24, 100)
Testing set shape: (6, 100)


## 7. Save Preprocessing Artifacts

In [18]:
# Save tokenizer
# Serialize the fitted tokenizer to disk. pickle is already imported in the
# first cell, so it is not re-imported here.
with open('../models/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

print("✅ Tokenizer saved to models/tokenizer.pkl")

✅ Tokenizer saved to models/tokenizer.pkl


In [19]:
# Save label encoder
# Serialize the fitted encoder so the integer-id -> category-name mapping can
# be restored outside this notebook.
with open('../models/label_encoder.pkl', 'wb') as f:
    pickle.dump(label_encoder, f)

print("✅ Label encoder saved to models/label_encoder.pkl")

✅ Label encoder saved to models/label_encoder.pkl


In [20]:
# Save preprocessed data
# One mapping drives both the writes and the confirmation listing, so the
# printed file names cannot drift from the files actually written.
split_arrays = {
    'X_train': X_train,
    'X_test': X_test,
    'y_train': y_train,
    'y_test': y_test,
}

# Each array is written as ../models/<name>.npy.
for array_name, array in split_arrays.items():
    np.save(f'../models/{array_name}.npy', array)

print("✅ Preprocessed data saved:")
for array_name in split_arrays:
    print(f"   - models/{array_name}.npy")

✅ Preprocessed data saved:
   - models/X_train.npy
   - models/X_test.npy
   - models/y_train.npy
   - models/y_test.npy


In [21]:
# Save preprocessing info to text file
# Human-readable record of this preprocessing run. The tokenizer settings are
# read back from the fitted tokenizer rather than retyped, so the report
# cannot drift from the configuration actually used. The padding/truncating
# mode and the text-cleaning flags are still stated by hand and must be kept
# in sync with the cells that set them.
preprocessing_info = f"""PREPROCESSING INFORMATION - DAY 2
=====================================

Dataset:
- Total samples: {len(df)}
- Training samples: {len(X_train)}
- Testing samples: {len(X_test)}

Vocabulary:
- Vocabulary size: {vocab_size}
- Max words in tokenizer: {tokenizer.num_words}
- OOV token: {tokenizer.oov_token}

Sequences:
- Max sequence length: {max_length}
- Padding: post
- Truncating: post

Labels:
- Number of classes: {len(label_encoder.classes_)}
- Classes: {', '.join(label_encoder.classes_)}

Text Cleaning:
- Lowercasing: Yes
- Special characters removed: Yes
- Stopwords removed: No
- Extra spaces removed: Yes

Files Saved:
- models/tokenizer.pkl
- models/label_encoder.pkl
- models/X_train.npy
- models/X_test.npy
- models/y_train.npy
- models/y_test.npy

Ready for Day 3: Model Building!
"""

# Explicit UTF-8 so the file is written identically on every platform, even
# if a category name contains non-ASCII characters.
with open('../results/preprocessing_info.txt', 'w', encoding='utf-8') as f:
    f.write(preprocessing_info)

print("✅ Preprocessing info saved to results/preprocessing_info.txt")
print("\n" + preprocessing_info)

✅ Preprocessing info saved to results/preprocessing_info.txt

PREPROCESSING INFORMATION - DAY 2

Dataset:
- Total samples: 30
- Training samples: 24
- Testing samples: 6

Vocabulary:
- Vocabulary size: 286
- Max words in tokenizer: 5000
- OOV token: <OOV>

Sequences:
- Max sequence length: 100
- Padding: post
- Truncating: post

Labels:
- Number of classes: 14
- Classes: Backend Developer, Business Analyst, Cloud Architect, Data Analyst, Data Engineer, Data Scientist, DevOps Engineer, Frontend Developer, Java Developer, Mobile Developer, Python Developer, QA Engineer, Security Analyst, Web Developer

Text Cleaning:
- Lowercasing: Yes
- Special characters removed: Yes
- Stopwords removed: No
- Extra spaces removed: Yes

Files Saved:
- models/tokenizer.pkl
- models/label_encoder.pkl
- models/X_train.npy
- models/X_test.npy
- models/y_train.npy
- models/y_test.npy

Ready for Day 3: Model Building!



## 8. Summary

In [22]:
# Print final summary
# Recap of each preprocessing stage, using the values computed in earlier
# cells (vocab_size, max_length, label_encoder, X_train, X_test).
print("\n" + "="*80)
print("DAY 2 SUMMARY - TEXT PREPROCESSING & TOKENIZATION")
print("="*80)
print("\n✅ Text Cleaning: Complete")
print("   - Lowercasing applied")
print("   - Special characters removed")
print("   - Extra spaces normalized")

print("\n✅ Tokenization: Complete")
print(f"   - Vocabulary size: {vocab_size}")
print(f"   - Max sequence length: {max_length}")

print("\n✅ Label Encoding: Complete")
print(f"   - Number of classes: {len(label_encoder.classes_)}")

print("\n✅ Data Split: Complete")
print(f"   - Training: {len(X_train)} samples")
print(f"   - Testing: {len(X_test)} samples")

print("\n✅ All preprocessing artifacts saved!")
print("\n🚀 Ready for Day 3: LSTM Model Building!")
print("="*80)


DAY 2 SUMMARY - TEXT PREPROCESSING & TOKENIZATION

✅ Text Cleaning: Complete
   - Lowercasing applied
   - Special characters removed
   - Extra spaces normalized

✅ Tokenization: Complete
   - Vocabulary size: 286
   - Max sequence length: 100

✅ Label Encoding: Complete
   - Number of classes: 14

✅ Data Split: Complete
   - Training: 24 samples
   - Testing: 6 samples

✅ All preprocessing artifacts saved!

🚀 Ready for Day 3: LSTM Model Building!
