## 1. Import Required Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Notebook-wide display configuration.
# pandas: no cap on the number of columns shown, at most 100 rows per frame.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100

# Plotting: seaborn's white grid theme and a 10x6 default figure size.
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (10, 6)})

## 2. Load Dataset

In [None]:
# Read the skills CSV (path is relative to the working directory) into `df`
dataset_path = '../dataset/skills_dataset.csv'
df = pd.read_csv(dataset_path)
print("Dataset loaded successfully!")
df.head(10)  # trailing expression: the notebook renders the first 10 rows

## 3. Basic Dataset Information

In [None]:
# Report the dataset's dimensions: the raw shape tuple, then rows and columns
n_records, n_columns = df.shape
print(f"Dataset Shape: {df.shape}")
print(f"Number of records: {n_records}")
print(f"Number of columns: {n_columns}")

In [None]:
# Print a header, then the frame's summary (df.info() emits its own output)
info_header = "\nDataset Info:"
print(info_header)
df.info()

In [None]:
# List every column label in the dataset as a plain Python list
print("\nColumn Names:")
column_names = df.columns.tolist()
print(column_names)

## 4. Check for Missing Values

In [None]:
# Tally null entries per column, then report the grand total
print("Missing Values:")
missing_values = df.isna().sum()
total_missing = missing_values.sum()
print(missing_values)
print(f"\nTotal missing values: {total_missing}")

## 5. Job Role Distribution

In [None]:
# Frequency table of job roles; kept in `job_role_counts` for the chart below
print("Job Role Distribution:")
role_column = df['job_role']
job_role_counts = role_column.value_counts()
print(job_role_counts)

In [None]:
# Bar chart of job role frequencies, drawn through the explicit Axes interface
fig, ax = plt.subplots(figsize=(12, 6))
job_role_counts.plot.bar(ax=ax, color='skyblue', edgecolor='black')
ax.set_title('Distribution of Job Roles', fontsize=16, fontweight='bold')
ax.set_xlabel('Job Role', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
# Slant the role names and right-align them at their ticks
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(45)
    tick_label.set_horizontalalignment('right')
fig.tight_layout()
plt.show()

## 6. Skills Analysis

In [None]:
# Preview the first 15 entries of the skills column
print("Sample Skills:")
sample_skills = df['skills'].head(15)
print(sample_skills)

In [None]:
# Report how many distinct job roles exist, then list them
job_roles = df['job_role']
print(f"\nNumber of unique job roles: {job_roles.nunique()}")
print("\nUnique job roles:")
print(job_roles.unique())

## 7. Summary Statistics

In [None]:
# Display basic statistics
# Summarise the dataset size and its dominant job role. The role column is
# selected once and its frequency table reused. The "most common" lines fall
# back to placeholders when the column has no non-null values, a case where
# `mode()[0]` and `value_counts().iloc[0]` would otherwise raise.
job_roles = df['job_role']
role_frequencies = job_roles.value_counts()

print("Dataset Summary:")
print(f"Total entries: {len(df)}")
print(f"Unique job roles: {job_roles.nunique()}")
if role_frequencies.empty:
    print("Most common job role: N/A")
    print("Count of most common role: 0")
else:
    # mode() may report several tied roles; [0] takes the first one it lists.
    print(f"Most common job role: {job_roles.mode()[0]}")
    print(f"Count of most common role: {role_frequencies.iloc[0]}")

## 8. Key Observations

**Dataset Overview:**
- The dataset contains skill sets mapped to job roles
- Each record represents a combination of skills and the corresponding job role
- No missing values detected in the initial dataset

**Next Steps:**
1. Perform text preprocessing on the `skills` column
2. Engineer features by extracting individual skills
3. Build classification models for career recommendation
4. Analyze skill gaps for career transitions