In [None]:
# Course Knowledge Graph - Data Exploration

This notebook demonstrates how to explore the Coursera dataset and test the Neo4j knowledge graph.

2


In [None]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import warnings
# Suppress every warning category so library chatter stays out of cell output
warnings.simplefilter('ignore')

# Plot appearance: seaborn "whitegrid" theme, 12x6 inch default figure size
sns.set_style('whitegrid')
plt.rc('figure', figsize=(12, 6))

## 1. Load and Explore the Dataset

In [None]:
# Read the Coursera CSV (relative path, resolved against the kernel's working directory)
df = pd.read_csv('data/Coursera.csv')

# Quick orientation: dimensions, then the dtype of every column
dataset_shape = df.shape
print(f"Dataset shape: {dataset_shape}")
print("\nColumn names and types:", df.dtypes, sep="\n")
print("\nFirst few rows:")
# Kept as the last expression so the notebook renders the preview as a table
df.head()

In [None]:
# Tally missing cells column by column, then add the tallies up for the grand total
missing_per_column = df.isna().sum()
print("Missing values per column:")
print(missing_per_column)
print(f"\nTotal missing values: {missing_per_column.sum()}")

# Rows that exactly repeat an earlier row
duplicate_row_count = df.duplicated().sum()
print(f"\nDuplicate rows: {duplicate_row_count}")

## 2. Analyze Course Ratings

In [None]:
# Convert rating to numeric; any value that cannot be parsed as a number becomes NaN
df['Course Rating'] = pd.to_numeric(df['Course Rating'], errors='coerce')

# Derive the plotted inputs once: the non-missing ratings and their mean.
# (Series.mean() skips NaN, so this equals the mean of the full column.)
ratings = df['Course Rating'].dropna()
mean_rating = ratings.mean()

# Rating statistics
print("Course Rating Statistics:")
print(df['Course Rating'].describe())

# Visualize rating distribution: histogram on the left, box plot on the right
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
ax_hist, ax_box = axes

# Histogram with a dashed marker at the mean
ax_hist.hist(ratings, bins=20, edgecolor='black', alpha=0.7)
ax_hist.set_xlabel('Course Rating')
ax_hist.set_ylabel('Frequency')
ax_hist.set_title('Distribution of Course Ratings')
ax_hist.axvline(mean_rating, color='red', linestyle='--', label=f'Mean: {mean_rating:.2f}')
ax_hist.legend()

# Box plot. Vertical orientation is the default, so the `vert` keyword
# (pending deprecation in recent Matplotlib releases) is not passed.
ax_box.boxplot(ratings)
ax_box.set_ylabel('Course Rating')
ax_box.set_title('Box Plot of Course Ratings')

plt.tight_layout()
plt.show()

## 3. Analyze Difficulty Levels

In [None]:
# How many courses carry each difficulty label
difficulty_counts = df['Difficulty Level'].value_counts()
print("Courses by Difficulty Level:")
print(difficulty_counts)

# Two panels: share of courses (left) and mean rating (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))
share_ax, rating_ax = axes

# Left panel: pie chart of the label counts
share_ax.pie(
    difficulty_counts.values,
    labels=difficulty_counts.index,
    autopct='%1.1f%%',
    startangle=90,
)
share_ax.set_title('Course Distribution by Difficulty Level')

# Right panel: mean rating per difficulty label, highest first
difficulty_ratings = (
    df.groupby('Difficulty Level')['Course Rating']
    .mean()
    .sort_values(ascending=False)
)
difficulty_ratings.plot(kind='bar', ax=rating_ax, color='steelblue')
rating_ax.set(
    xlabel='Difficulty Level',
    ylabel='Average Rating',
    title='Average Rating by Difficulty Level',
)
rating_ax.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

## 4. Analyze Universities

In [None]:
# Rank universities by number of rows (courses) and keep the 15 largest
top_universities = df['University'].value_counts().head(15)
print("Top 15 Universities by Course Count:")
print(top_universities)

# Horizontal bars on a fresh 12x6 figure; the y-axis is flipped so the
# university with the most courses is drawn at the top
plt.figure(figsize=(12, 6))
ax = top_universities.plot(kind='barh', color='coral')
ax.set_xlabel('Number of Courses')
ax.set_ylabel('University')
ax.set_title('Top 15 Universities by Course Count')
ax.invert_yaxis()
plt.tight_layout()
plt.show()

In [None]:
# Per-university mean rating and number of non-missing ratings, rounded to 2 decimals
university_stats = (
    df.groupby('University')['Course Rating']
    .agg(Avg_Rating='mean', Course_Count='count')
    .round(2)
)

# Only universities with at least 5 counted courses are ranked
has_enough_courses = university_stats['Course_Count'] >= 5
university_stats = university_stats[has_enough_courses]

# Ten best averages among the remaining universities
top_quality = university_stats.sort_values('Avg_Rating', ascending=False).head(10)

print("Top 10 Universities by Average Rating (with 5+ courses):")
print(top_quality)

## 5. Analyze Skills

In [None]:
# Parse skills
# Each non-missing 'Skills' value is split on double spaces into individual skill names.
all_skills = []
for skills_str in df['Skills'].dropna():
    tokens = (s.strip() for s in str(skills_str).split('  '))
    # Drop empty and single-character fragments left behind by the split.
    # NOTE(review): this also discards one-letter skill names (e.g. "R", "C") - confirm intended.
    skills = [s for s in tokens if len(s) > 1]
    # Count each skill at most once per course so the totals really are
    # "number of courses", as the printout and the chart label present them.
    # dict.fromkeys de-duplicates while keeping first-seen order, which keeps
    # tie ordering in Counter.most_common deterministic.
    all_skills.extend(dict.fromkeys(skills))

# Count skill frequency (one increment per course that lists the skill)
skill_counts = Counter(all_skills)
top_skills = skill_counts.most_common(20)

print(f"Total unique skills: {len(skill_counts)}")
print(f"\nTop 20 Most Common Skills:")
for skill, count in top_skills:
    print(f"{skill}: {count}")

In [None]:
# Put the (skill, count) pairs into a two-column frame for plotting
skills_df = pd.DataFrame(top_skills, columns=['Skill', 'Count'])

# Horizontal bars on a fresh 12x8 figure; the y-axis is flipped so the
# first (most common) skill is drawn at the top
plt.figure(figsize=(12, 8))
ax = plt.gca()
ax.barh(skills_df['Skill'], skills_df['Count'], color='skyblue')
ax.set_xlabel('Number of Courses')
ax.set_ylabel('Skill')
ax.set_title('Top 20 Most Common Skills in Courses')
ax.invert_yaxis()
plt.tight_layout()
plt.show()

## 6. Test Neo4j Connection and Queries

Before running this section, make sure you have:
1. Installed Neo4j (Desktop or Server)
2. Created a database with credentials matching your .env file
3. Installed required packages: `pip install -r requirements.txt`
4. Run the data import: `python data_loader.py`

In [None]:
# Test Neo4j connection (uncomment after setting up Neo4j)
# The code below is wrapped in a triple-quoted string, so it does not execute;
# delete the opening and closing """ lines to enable it.
# When enabled it imports `neo4j_conn` from the project's `database` module,
# runs a trivial RETURN query, and prints either the returned message or,
# if anything raises, the error text.
"""
from database import neo4j_conn

# Test connection
try:
    result = neo4j_conn.execute_query("RETURN 'Connection successful!' as message")
    print(result[0]['message'])
except Exception as e:
    print(f"Connection failed: {e}")
"""

In [None]:
# Example: Query courses from Neo4j
# Disabled by the surrounding triple-quoted string; delete the """ lines to run it.
# When enabled it imports CourseService, RecommendationService and StatsService
# from the project's `services` module, prints every entry of
# StatsService.get_database_stats() except 'top_skills' and 'top_universities',
# then lists the skills returned by StatsService.get_all_skills(limit=10).
# The later example cells use CourseService / RecommendationService without
# importing them, so enable this cell first.
# NOTE(review): enabling this rebinds `top_skills`, the same name the skills
# analysis section uses for its plot input - re-running that plot cell
# afterwards would pick up this value instead.
"""
from services import CourseService, RecommendationService, StatsService

# Get statistics
stats = StatsService.get_database_stats()
print("Database Statistics:")
for key, value in stats.items():
    if key not in ['top_skills', 'top_universities']:
        print(f"  {key}: {value}")

# Get top skills
top_skills = StatsService.get_all_skills(limit=10)
print("\nTop 10 Skills:")
for skill in top_skills:
    print(f"  {skill.name}: {skill.course_count} courses")
"""

In [None]:
# Example: Search for Python courses
# Disabled by the surrounding triple-quoted string; delete the """ lines to run it.
# Uses `CourseService` without importing it, so the cell that imports it from
# `services` must be enabled and run first.
# When enabled it requests up to 5 courses matching "Python" with min_rating=4.5
# and prints each result's name, university, rating, difficulty, and at most
# its first five skills.
"""
courses = CourseService.search_courses(
    query="Python",
    min_rating=4.5,
    limit=5
)

print("Top Python Courses (Rating >= 4.5):")
for course in courses:
    print(f"\n{course.name}")
    print(f"  University: {course.university}")
    print(f"  Rating: {course.rating}")
    print(f"  Difficulty: {course.difficulty}")
    print(f"  Skills: {', '.join(course.skills[:5])}")
"""

In [None]:
# Example: Get course recommendations
# Disabled by the surrounding triple-quoted string; delete the """ lines to run it.
# Uses `CourseService` and `RecommendationService` without importing them, so the
# cell that imports them from `services` must be enabled and run first.
# When enabled it searches for a single "Machine Learning" course and, only if
# the search returned something, prints that course's name followed by up to 5
# courses from RecommendationService.get_similar_courses for its id.
"""
# Get similar courses to a specific course
# First, let's find a course ID
search_result = CourseService.search_courses(query="Machine Learning", limit=1)
if search_result:
    course_id = search_result[0].id
    print(f"Base course: {search_result[0].name}")
    
    # Get similar courses
    similar = RecommendationService.get_similar_courses(course_id, limit=5)
    print("\nSimilar Courses:")
    for course in similar:
        print(f"  - {course.name} ({course.rating}⭐)")
"""

In [None]:
# Example: Get a learning path
# Disabled by the surrounding triple-quoted string; delete the """ lines to run it.
# Uses `RecommendationService` without importing it, so the cell that imports it
# from `services` must be enabled and run first.
# When enabled it calls RecommendationService.get_learning_path for the target
# skill "Machine Learning" with max_courses=5 and prints the returned courses
# numbered from 1, each with its difficulty, rating and university.
"""
learning_path = RecommendationService.get_learning_path(
    target_skill="Machine Learning",
    max_courses=5
)

print("Learning Path to Machine Learning:")
for i, course in enumerate(learning_path, 1):
    print(f"\n{i}. {course.name}")
    print(f"   Difficulty: {course.difficulty}")
    print(f"   Rating: {course.rating}⭐")
    print(f"   University: {course.university}")
"""

## Summary

This notebook demonstrates:
1. ✅ Dataset exploration and statistics
2. ✅ Visualization of course ratings, difficulty levels, universities, and skills
3. ✅ Data quality analysis
4. ✅ Examples for testing the Neo4j knowledge graph
5. ✅ API usage examples for searching, recommendations, and learning paths

**Next Steps:**
- Set up Neo4j database
- Run `python data_loader.py` to import data
- Enable the Neo4j query examples above (remove the triple quotes wrapping each one) and run them in order
- Start the FastAPI server: `python main.py`
- Test the API at http://localhost:8000/docs