In [1]:
!pip install -q huggingface_hub llama-cpp-python sentence-transformers pandas faiss-cpu gradio torch


[notice] A new release of pip is available: 23.0.1 -> 25.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [15]:
import html
import os
import re

import faiss
import gradio as gr
import numpy as np
import pandas as pd
from huggingface_hub import hf_hub_download
from llama_cpp import Llama
from sentence_transformers import SentenceTransformer

In [None]:
# Hugging Face Hub repository and the GGUF file to fetch from it.
model_name = "TheBloke/Mistral-7B-Instruct-v0.1-GGUF"
model_file = "mistral-7b-instruct-v0.1.Q4_K_M.gguf"

# Only download when the file is not already in the working directory.
if not os.path.exists(model_file):
    print("Downloading model...")
    # `resume_download` is deprecated in recent huggingface_hub releases
    # (interrupted downloads are resumed automatically), so it is not passed.
    hf_hub_download(
        repo_id=model_name,
        filename=model_file,
        local_dir=".",
    )
else:
    print("Model already exists")


print("\nModel status:", "Found" if os.path.exists(model_file) else "Missing")

Model already exists

Model status: Found


In [17]:
# Load the Coursera catalogue and reshape it into the columns used by the
# rest of the notebook.
try:
    df = pd.read_csv("Coursera.csv")
    print("Dataset loaded successfully!")
    print("Original columns:", df.columns.tolist())

    # Fail with a clear message if the CSV schema is not the expected one.
    required_columns = ['Title', 'Skills', 'Ratings', 'Review counts', 'Organization']
    missing_columns = [col for col in required_columns if col not in df.columns]
    if missing_columns:
        raise KeyError(f"Coursera.csv is missing expected columns: {missing_columns}")

    df = (
        df[required_columns]
        .dropna()
        .rename(columns={
            'Skills': 'Course Description',
            'Ratings': 'Rating'
        })
        # dropna leaves gaps in the index; renumber so row labels equal row
        # positions (the positions are what the vector index refers to).
        .reset_index(drop=True)
    )

    # Append the provider to the skills text, then cap the length at 500 chars.
    df['Course Description'] = (
        df['Course Description'] + " | Offered by: " + df['Organization']
    ).str[:500]

    # Placeholder values: the CSV columns read above do not include a URL or
    # a difficulty level.
    df['Course URL'] = "#"
    df['Difficulty Level'] = "Not Specified"

    print("\nProcessed columns:", df.columns.tolist())
    display(df.head(2))

except Exception as e:
    print(f"Data loading error: {str(e)}")
    # Re-raise so later cells do not run against a missing or stale `df`.
    raise


Dataset loaded successfully!
Original columns: ['Unnamed: 0', 'Title', 'Organization', 'Skills', 'Ratings', 'Review counts', 'Metadata']

Processed columns: ['Title', 'Course Description', 'Rating', 'Review counts', 'Organization', 'Course URL', 'Difficulty Level']


Unnamed: 0,Title,Course Description,Rating,Review counts,Organization,Course URL,Difficulty Level
0,Google Cybersecurity,"Network Security, Python Programming, Linux, ...",4.8,4.8(20K reviews),Google,#,Not Specified
1,Google Data Analytics,"Data Analysis, R Programming, SQL, Business C...",4.8,4.8(137K reviews),Google,#,Not Specified


In [None]:
# Embed every course description and store the vectors in a flat L2 FAISS index.
try:
    if df.empty:
        raise ValueError("No course descriptions to embed")

    embedder = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    embeddings = embedder.encode(df['Course Description'].tolist(), show_progress_bar=True)

    # FAISS expects a C-contiguous float32 matrix of shape (n_rows, dim).
    embeddings = np.ascontiguousarray(embeddings, dtype='float32')

    index = faiss.IndexFlatL2(embeddings.shape[1])
    index.add(embeddings)

    # Row i of `df` must correspond to vector i of the index.
    if index.ntotal != len(df):
        raise RuntimeError(
            f"Index holds {index.ntotal} vectors but the dataset has {len(df)} rows"
        )
    print(f"\nFAISS index created with {index.ntotal} entries")

except Exception as e:
    print(f"Embedding error: {str(e)}")
    # Re-raise so later cells do not run with a missing or stale index.
    raise

Batches: 100%|██████████| 20/20 [00:11<00:00,  1.77it/s]


FAISS index created with 623 entries





In [None]:
# Load the local GGUF model and run a one-question smoke test.
try:
    llm = Llama(
        model_path=model_file,
        n_ctx=2048,
        n_gpu_layers=20,
        verbose=False
    )
    print("LLM initialized successfully!")

    # llama-cpp-python prepends the BOS token itself when it tokenizes the
    # prompt, so the prompt must not start with a literal "<s>" (that would
    # produce a duplicated BOS token).
    test_prompt = "[INST] What is 2+2? Answer concisely. [/INST]"
    test_output = llm(
        test_prompt,
        max_tokens=20,
        temperature=0
    )
    print("Test response:", test_output['choices'][0]['text'].strip())

except Exception as e:
    print(f"LLM initialization failed: {str(e)}")
    # Re-raise so later cells do not run with a missing or stale `llm`.
    raise

llama_init_from_model: n_ctx_per_seq (2048) < n_ctx_train (32768) -- the full capacity of the model will not be utilized


LLM initialized successfully!
Test response: 4


In [21]:
def recommend_courses(query, top_k=3):
    """Return the courses whose descriptions are closest to `query`.

    The query is embedded with the notebook-level `embedder`, looked up in the
    notebook-level FAISS `index`, and the matching rows are taken by position
    from `df`.

    Args:
        query: Free-text description of what the user wants to learn.
        top_k: Maximum number of courses to return.

    Returns:
        A copy of the matching `df` rows with an extra `similarity` column
        (1 minus the distance reported by the index, so higher means closer;
        the value can be negative), sorted best match first. An empty
        DataFrame is returned for a blank query, a non-positive `top_k`, or
        when the lookup fails (the error is printed).
    """
    try:
        if query is None or not str(query).strip():
            return pd.DataFrame()

        # Never request more neighbours than the index holds.
        top_k = min(int(top_k), index.ntotal)
        if top_k <= 0:
            return pd.DataFrame()

        # FAISS only accepts float32 query vectors.
        query_embed = np.asarray(embedder.encode([query]), dtype='float32')
        distances, indices = index.search(query_embed, top_k)

        # FAISS fills slots it could not match with id -1; `iloc[-1]` would
        # silently return the last row, so drop those slots.
        found = indices[0] >= 0
        results = df.iloc[indices[0][found]].copy()
        results['similarity'] = 1 - distances[0][found]
        return results.sort_values('similarity', ascending=False)
    except Exception as e:
        print(f"Recommendation error: {str(e)}")
        return pd.DataFrame()

# Smoke test: run one sample query and preview the two best matches.
test_query = "data science"
print(f"\nTesting recommendation for: '{test_query}'")
test_results = recommend_courses(test_query)
preview_columns = ['Title', 'Organization', 'similarity']
display(test_results.loc[:, preview_columns].head(2))


Testing recommendation for: 'data science'


Unnamed: 0,Title,Organization,similarity
115,Learn SQL Basics for Data Science,"University of California, Davis",0.291803
358,Data Science Coding Challenge: Loan Default Pr...,Coursera Project Network,0.260091


In [22]:
def generate_explanation(query, courses):
    """Ask the LLM why the given courses fit the user's query.

    Args:
        query: The user's free-text learning goal.
        courses: DataFrame of recommended courses; must provide `Title` and
            `Rating` columns.

    Returns:
        The model's answer with surrounding whitespace stripped, or
        "Could not generate explanation" when there are no courses to explain
        or the model call fails (the error is printed).
    """
    try:
        # Nothing to explain: do not spend a model call on an empty list.
        if courses is None or len(courses) == 0:
            return "Could not generate explanation"

        course_list = "\n".join(
            f"{position}. {title} ({rating}⭐)"
            for position, (title, rating) in enumerate(
                zip(courses['Title'], courses['Rating']), start=1
            )
        )

        # No literal "<s>" here: llama-cpp-python adds the BOS token itself
        # when it tokenizes the prompt. The prompt is assembled line by line
        # so source-code indentation does not end up in the text sent to the
        # model.
        prompt = (
            "[INST] You are a course recommendation assistant. Given these courses:\n"
            f"{course_list}\n"
            f'Explain why these courses match: "{query}". '
            "Keep response under 100 words. [/INST]"
        )

        output = llm(
            prompt,
            max_tokens=150,
            temperature=0.7,
            echo=False
        )
        return output['choices'][0]['text'].strip()

    except Exception as e:
        print(f"Explanation error: {str(e)}")
        return "Could not generate explanation"

# Smoke test: explain the two best matches from the sample query above.
print("\nTesting explanation generation...")
sample_explanation = generate_explanation(test_query, test_results.head(2))
print(sample_explanation)


Testing explanation generation...




Both courses are related to data science because they involve working with data and using various data science techniques. The first course teaches SQL, a commonly used tool for data management and analysis in data science. The second course focuses on a specific application of data science, loan default prediction, which is a crucial problem in the financial industry and often involves working with large datasets and machine learning algorithms. Therefore, both courses are relevant to the field of data science.


In [None]:
def _review_count_label(raw_value):
    """Return only the review-count part of a value such as '4.8(20K reviews)'.

    Values without a parenthesised part are returned with " reviews" appended.
    """
    text = str(raw_value)
    match = re.search(r"\(([^()]*)\)", text)
    if match:
        return match.group(1).strip()
    return f"{text} reviews"


def respond(query):
    """Build the HTML shown in the UI for a user's query.

    Looks up matching courses, asks the LLM for a short explanation of the top
    two, and renders one HTML card per course.

    Args:
        query: Text typed by the user.

    Returns:
        An HTML string: the explanation followed by the course cards, or a
        short plain message when the query is blank, nothing matches, or an
        error occurs.
    """
    try:
        if query is None or not str(query).strip():
            return "Please enter what you want to learn."

        courses = recommend_courses(query)
        if courses.empty:
            return "No courses found. Try different keywords."

        top_courses = courses.head(2)
        explanation = generate_explanation(query, top_courses)

        cards = []
        for _, row in top_courses.iterrows():
            # Everything interpolated below is data (CSV fields), so it is
            # escaped before being placed into markup.
            title = html.escape(str(row['Title']))
            organization = html.escape(str(row['Organization']))
            rating = html.escape(str(row['Rating']))
            # The raw column repeats the rating, e.g. "4.8(20K reviews)";
            # show only the count.
            reviews = html.escape(_review_count_label(row['Review counts']))
            # The description is "<skills> | Offered by: <org>"; keep the skills.
            skills = html.escape(str(row['Course Description']).split('|')[0])
            card = f"""
            <div style='border:1px solid #ddd; padding:15px; margin:10px; border-radius:8px;'>
                <h4 style='margin:0 0 10px 0;'>{title}</h4>
                <div><b>Organization:</b> {organization}</div>
                <div><b>Rating:</b> {rating}⭐ ({reviews})</div>
                <div><b>Skills:</b> {skills}</div>
            </div>
            """
            cards.append(card)

        # The explanation is model-generated text, so it is escaped as well.
        return f"{html.escape(str(explanation))}\n" + "\n".join(cards)

    except Exception as e:
        return f"Error: {html.escape(str(e))}"


# Assemble the page: title, query box, submit button, collapsible examples and
# an HTML area that receives whatever `respond` returns.
example_queries = [["Machine learning"], ["Business analytics"]]

with gr.Blocks(theme=gr.themes.Soft()) as demo:
    gr.Markdown("# 🎓 Course Recommender")

    with gr.Row():
        query_input = gr.Textbox(
            lines=2,
            placeholder="e.g., 'Python programming'",
            label="What do you want to learn?",
        )

    submit_btn = gr.Button("Find Courses", variant="primary")

    with gr.Accordion("Examples", open=False):
        gr.Examples(examples=example_queries, inputs=[query_input])

    output = gr.HTML()

    # Clicking the button sends the textbox content through `respond`.
    submit_btn.click(respond, inputs=[query_input], outputs=output)


# Launch settings, passed unchanged to `demo.launch`.
launch_options = {"share": True, "inline": False, "debug": True}

print("\nLaunching interface...")
demo.launch(**launch_options)


Launching interface...
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://5e7b48c239e0448507.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


