# Job Recommendation System - Exploratory Data Analysis

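This notebook explores the job-posting and candidate datasets: it loads both through `CSVProcessor`, inspects their structure, plots key distributions (companies, job types, locations, CGPA, majors), and prints summary statistics.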

In [None]:
import sys
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json

# Add src directory to path: the entry appended is the "src" folder that sits
# next to the current working directory (i.e. <cwd>/../src).
# NOTE(review): assumes the notebook is launched from a directory that is a
# sibling of "src" — confirm against the repository layout.
sys.path.append(str(Path().absolute().parent / "src"))

# Project-local imports (presumably resolved through the path entry added
# above — verify if the package is installed some other way).
from data_processing import CSVProcessor
from config.settings import RAW_DATA_DIR, PROCESSED_DATA_DIR

# Set up plotting: apply the 'seaborn-v0_8' matplotlib style sheet and make
# seaborn's "husl" palette the default colour cycle.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline

## Load Data

In [None]:
# Create the processor object that the cells below use to load and
# summarise the data.
processor = CSVProcessor()

# Job postings: load the table, then report how many records came back.
jobs_df = processor.load_job_details()
job_count = len(jobs_df)
print(f"Loaded {job_count} job records")

# Candidates: same load-and-report pattern.
candidates_df = processor.load_candidate_data()
candidate_count = len(candidates_df)
print(f"Loaded {candidate_count} candidate records")

## Job Data Analysis

In [None]:
# Basic info about jobs
print("Job Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
jobs_df.info()
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table.
jobs_df.head()

In [None]:
# Two panels side by side: busiest companies (left) and job-type mix (right).
fig, (company_ax, job_type_ax) = plt.subplots(1, 2, figsize=(12, 6))

# Left panel: the 15 companies with the most postings, as a bar chart.
top_companies = jobs_df['company_name'].value_counts().head(15)
top_companies.plot(kind='bar', ax=company_ax)
company_ax.set_title('Top 15 Companies by Job Postings')
# Tilt the company names so long labels do not collide.
plt.setp(company_ax.get_xticklabels(), rotation=45)

# Right panel: share of each job type, as a pie chart with percentage labels.
job_type_counts = jobs_df['job_type'].value_counts()
job_type_counts.plot(kind='pie', autopct='%1.1f%%', ax=job_type_ax)
job_type_ax.set_title('Job Type Distribution')

fig.tight_layout()
plt.show()

In [None]:
# Bar chart of the ten locations that appear most often in the job data.
fig, location_ax = plt.subplots(figsize=(12, 6))

location_counts = jobs_df['location'].value_counts().head(10)
location_counts.plot(kind='bar', ax=location_ax)
location_ax.set_title('Top 10 Job Locations')
# Tilt the location names so long labels do not collide.
plt.setp(location_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

## Candidate Data Analysis

In [None]:
# Basic info about candidates
print("Candidate Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would emit a stray "None" line.
candidates_df.info()
print("\nFirst few rows:")
# Last expression of the cell, so the notebook renders it as a table.
candidates_df.head()

In [None]:
# Histogram of candidate CGPA values.
fig, cgpa_ax = plt.subplots(figsize=(10, 6))

# Coerce the column to numeric in place; entries that cannot be parsed
# become NaN (errors='coerce') and are dropped before plotting.
candidates_df['CGPA'] = pd.to_numeric(candidates_df['CGPA'], errors='coerce')
valid_cgpa = candidates_df['CGPA'].dropna()

valid_cgpa.hist(bins=20, alpha=0.7, ax=cgpa_ax)
cgpa_ax.set_title('CGPA Distribution of Candidates')
cgpa_ax.set_xlabel('CGPA')
cgpa_ax.set_ylabel('Frequency')
plt.show()

In [None]:
# Bar chart of how many candidates fall under each value of 'Major'.
fig, major_ax = plt.subplots(figsize=(10, 6))

major_counts = candidates_df['Major'].value_counts()
major_counts.plot(kind='bar', ax=major_ax)
major_ax.set_title('Distribution of Candidate Majors')
# Tilt the major names so long labels do not collide.
plt.setp(major_ax.get_xticklabels(), rotation=45)

fig.tight_layout()
plt.show()

## Summary Statistics

In [None]:
# Summary keys left out of the printed output.
# NOTE(review): presumably these are the bulky per-column entries — confirm
# against CSVProcessor.get_data_summary.
_SKIPPED_SUMMARY_KEYS = ('columns', 'missing_values', 'data_types')


def _print_summary(heading, summary):
    """Print *heading*, then one ``key: value`` line per non-skipped entry."""
    print(heading)
    for name, val in summary.items():
        if name in _SKIPPED_SUMMARY_KEYS:
            continue
        print(f"{name}: {val}")


# Build both summaries first, then print them one after the other.
job_summary = processor.get_data_summary(jobs_df)
candidate_summary = processor.get_data_summary(candidates_df)

_print_summary("Job Data Summary:", job_summary)
_print_summary("\nCandidate Data Summary:", candidate_summary)