# Exploratory Data Analysis - Resume Screening Platform

**Objective:** Understand the data, identify patterns, and prepare for feature engineering

**Dataset Sources:**
- Kaggle Resume Dataset
- IT Job Description Dataset
- Synthetic Hiring Labels

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from the plotting and
# data libraries — confirm that is intended.
warnings.filterwarnings('ignore')
# Seaborn grid theme and a default figure size; cells that pass their own
# figsize override the default.
sns.set_style('whitegrid')
plt.rcParams['figure.figsize'] = (12, 6)

%matplotlib inline

## 1. Load Datasets

In [None]:
# Read the three processed CSV files into the frames used by every later cell.
resumes_df = pd.read_csv('../data/processed/all_resumes.csv')
jds_df = pd.read_csv('../data/processed/all_jds.csv')
training_df = pd.read_csv('../data/processed/training_data.csv')

# Row counts, one per line, as a quick sanity check on the loads.
print(
    f"Resumes: {len(resumes_df):,} rows",
    f"Job Descriptions: {len(jds_df):,} rows",
    f"Training Data: {len(training_df):,} rows",
    sep="\n",
)

## 2. Resume Dataset Analysis

In [None]:
# Structural overview of the resume table, followed by a short preview.
print("Resume Dataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appended a stray "None" line.
resumes_df.info()
print("\nFirst 3 rows:")
# Last expression of the cell, so the notebook renders it as a table.
resumes_df.head(3)

In [None]:
# Null count per column and the same figure as a percentage of all rows.
missing = resumes_df.isna().sum()
missing_pct = missing.div(len(resumes_df)).mul(100).round(2)

# One row per column, columns with the most nulls first.
missing_df = pd.DataFrame(
    {'Missing Count': missing, 'Percentage': missing_pct}
)
missing_df = missing_df.sort_values('Missing Count', ascending=False)

print("Missing Values in Resume Dataset:")
# Only columns that actually contain nulls are shown.
has_nulls = missing_df['Missing Count'] > 0
print(missing_df.loc[has_nulls])

In [None]:
if 'category' in resumes_df.columns:
    # Frequency of each category, most common first.
    category_counts = resumes_df['category'].value_counts()
    top_ten = category_counts.head(10)

    fig, (bar_ax, pie_ax) = plt.subplots(1, 2, figsize=(14, 6))

    # Left panel: horizontal bars for the 15 most frequent categories.
    category_counts.head(15).plot(kind='barh', ax=bar_ax)
    bar_ax.set_title('Top 15 Resume Categories')
    bar_ax.set_xlabel('Count')

    # Right panel: relative share among the 10 most frequent categories.
    pie_ax.pie(top_ten.values, labels=top_ten.index, autopct='%1.1f%%')
    pie_ax.set_title('Top 10 Categories Distribution')

    plt.tight_layout()
    plt.show()

In [None]:
if 'resume_text' in resumes_df.columns:
    # Character count and whitespace-delimited word count per resume; rows
    # with a missing text get NaN for both.
    resumes_df['text_length'] = resumes_df['resume_text'].str.len()
    resumes_df['word_count'] = resumes_df['resume_text'].str.split().str.len()

    print("Resume Text Statistics:")
    print(resumes_df[['text_length', 'word_count']].describe())

    # Drop NaN once so the histogram, the median marker and the box plot are
    # all drawn from the same values (previously only the box plot dropped
    # them, and the histogram was handed the raw column).
    word_counts = resumes_df['word_count'].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: word-count histogram with the median marked.
    axes[0].hist(word_counts, bins=50, edgecolor='black')
    axes[0].set_title('Distribution of Word Count')
    axes[0].set_xlabel('Word Count')
    axes[0].set_ylabel('Frequency')
    axes[0].axvline(word_counts.median(), color='red', linestyle='--', label='Median')
    axes[0].legend()

    # Right panel: box plot of the same values to expose outliers.
    axes[1].boxplot(word_counts)
    axes[1].set_title('Word Count Box Plot')
    axes[1].set_ylabel('Word Count')

    plt.tight_layout()
    plt.show()

## 3. Job Description Analysis

In [None]:
# Structural overview of the job-description table, followed by a preview.
print("Job Description Dataset Info:")
# DataFrame.info() prints its own report and returns None; calling it inside
# print() added a spurious "None" line to the output.
jds_df.info()
print("\nSample JDs:")
# Last expression of the cell, so the notebook renders it as a table.
jds_df.head(3)

In [None]:
if 'jd_text' in jds_df.columns:
    jd_text = jds_df['jd_text']

    # Character count and whitespace-delimited word count per description.
    jds_df['text_length'] = jd_text.str.len()
    jds_df['word_count'] = jd_text.str.split().str.len()

    print("JD Text Statistics:")
    length_summary = jds_df[['text_length', 'word_count']].describe()
    print(length_summary)

## 4. Training Data Analysis

In [None]:
# Structural overview of the training table, followed by a preview.
print("Training Dataset Info:")
# DataFrame.info() prints its own report and returns None; calling it inside
# print() added a spurious "None" line to the output.
training_df.info()
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table.
training_df.head()

In [None]:
if 'selected' in training_df.columns:
    # value_counts() orders by frequency, so position 0 is simply the majority
    # class. Re-index by label value so position 0 is always "rejected" and
    # position 1 always "selected", and a class with no rows still shows up
    # with a count of 0 instead of breaking the two-label bar chart.
    # NOTE(review): assumes 'selected' is encoded as 0/1 (or bool), which the
    # mean-as-selection-rate line below also relies on — confirm.
    selected_counts = (
        training_df['selected']
        .dropna()
        .astype(int)
        .value_counts()
        .reindex([0, 1], fill_value=0)
    )
    class_labels = ['Rejected', 'Selected']
    class_colors = ['red', 'green']

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: absolute counts, each bar annotated with its value.
    axes[0].bar(class_labels, selected_counts.values, color=class_colors)
    axes[0].set_title('Class Distribution')
    axes[0].set_ylabel('Count')
    for i, v in enumerate(selected_counts.values):
        axes[0].text(i, v + 10, str(v), ha='center', fontweight='bold')

    # Right panel: the same counts as shares of the whole.
    axes[1].pie(selected_counts.values, labels=class_labels,
                autopct='%1.1f%%', colors=class_colors)
    axes[1].set_title('Selection Rate')

    plt.tight_layout()
    plt.show()

    print(f"\nSelection Rate: {training_df['selected'].mean():.1%}")

In [None]:
# Score columns this notebook knows how to plot; only those actually present
# in the training table are used (also consumed by the next cell).
score_cols = ['skill_match_score', 'experience_score', 'education_score', 'final_score']
available_score_cols = [col for col in score_cols if col in training_df.columns]

if available_score_cols:
    # Fixed 2x2 grid: one panel per possible score column.
    fig, axes = plt.subplots(2, 2, figsize=(14, 10))
    axes = axes.ravel()

    for idx, col in enumerate(available_score_cols):
        # Plot only observed values; the mean marker ignores NaN as well.
        scores = training_df[col].dropna()
        axes[idx].hist(scores, bins=30, edgecolor='black', alpha=0.7)
        axes[idx].set_title(f'{col.replace("_", " ").title()} Distribution')
        axes[idx].set_xlabel('Score')
        axes[idx].set_ylabel('Frequency')
        axes[idx].axvline(scores.mean(), color='red', linestyle='--', label='Mean')
        axes[idx].legend()

    # When fewer than four score columns exist, hide the leftover panels so
    # the figure does not show empty frames.
    for unused_ax in axes[len(available_score_cols):]:
        unused_ax.set_visible(False)

    plt.tight_layout()
    plt.show()

In [None]:
if 'selected' in training_df.columns and available_score_cols:
    print("Score Statistics by Selection Status:\n")

    # Group once, then describe each available score column per outcome.
    by_outcome = training_df.groupby('selected')
    for col in available_score_cols:
        heading = col.replace('_', ' ').title()
        print(f"\n{heading}:")
        print(by_outcome[col].describe())

In [None]:
if 'experience_years' in training_df.columns:
    fig, (hist_ax, box_ax) = plt.subplots(1, 2, figsize=(14, 5))

    # Left panel: overall distribution of experience.
    training_df['experience_years'].hist(bins=20, edgecolor='black', ax=hist_ax)
    hist_ax.set_title('Experience Distribution')
    hist_ax.set_xlabel('Years of Experience')
    hist_ax.set_ylabel('Frequency')

    # Right panel. The target axes is passed explicitly: without ax=,
    # DataFrame.boxplot(by=...) opens a brand-new figure, which left this
    # panel empty and put the box plot in a separate figure.
    if 'selected' in training_df.columns:
        training_df.boxplot(column='experience_years', by='selected', ax=box_ax)
        box_ax.set_title('Experience by Selection Status')
        # pandas adds a "Boxplot grouped by ..." super-title; clear it.
        fig.suptitle('')
        box_ax.set_xlabel('Selected')
    else:
        # No outcome column to group by: fall back to a single box plot
        # instead of raising KeyError.
        training_df.boxplot(column='experience_years', ax=box_ax)
        box_ax.set_title('Experience Box Plot')
    box_ax.set_ylabel('Years of Experience')

    plt.tight_layout()
    plt.show()

In [None]:
if 'education_level' in training_df.columns:
    # Number of rows per education level, most common first.
    edu_counts = training_df['education_level'].value_counts()

    plt.figure(figsize=(12, 5))
    edu_counts.plot(kind='bar', color='skyblue', edgecolor='black')
    plt.title('Education Level Distribution')
    plt.xlabel('Education Level')
    plt.ylabel('Count')
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

    # The per-level selection rate needs the outcome column; guard it like
    # the other cells do so a table without 'selected' does not raise
    # KeyError after the chart has already been drawn.
    if 'selected' in training_df.columns:
        # normalize='index' makes each education level's row sum to 1.
        edu_selected = pd.crosstab(
            training_df['education_level'],
            training_df['selected'],
            normalize='index',
        )
        print("\nSelection Rate by Education Level:")
        print(edu_selected)

## 5. Correlation Analysis

In [None]:
# Names of every numeric column in the training table.
numeric_cols = training_df.select_dtypes(include=[np.number]).columns.tolist()

# A correlation matrix needs at least two numeric columns to be informative.
if len(numeric_cols) > 1:
    correlation_matrix = training_df[numeric_cols].corr()

    # Diverging palette centred on zero, with each cell annotated.
    heatmap_options = dict(
        annot=True,
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )

    plt.figure(figsize=(10, 8))
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Feature Correlation Heatmap')
    plt.tight_layout()
    plt.show()

## 6. Summary Statistics

In [None]:
# Collect the summary lines first, then print them in one go.
rule = "=" * 60
summary_lines = [
    rule,
    "DATASET SUMMARY",
    rule,
    f"Total Resumes: {len(resumes_df):,}",
    f"Total JDs: {len(jds_df):,}",
    f"Training Samples: {len(training_df):,}",
]

# Optional lines, included only when the corresponding column exists.
if 'selected' in training_df.columns:
    summary_lines.append(f"Selection Rate: {training_df['selected'].mean():.1%}")
if 'final_score' in training_df.columns:
    summary_lines.append(f"Average Final Score: {training_df['final_score'].mean():.3f}")

summary_lines.append(rule)
print("\n".join(summary_lines))