<a href="https://colab.research.google.com/github/Rahul23100/RepoMindd/blob/main/RePoMind_ollama.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!curl -fsSL https://ollama.com/install.sh | sh

>>> Installing ollama to /usr/local
>>> Downloading Linux amd64 bundle
######################################################################## 100.0%
>>> Creating ollama user...
>>> Adding ollama user to video group...
>>> Adding current user to ollama group...
>>> Creating ollama systemd service...
>>> The Ollama API is now available at 127.0.0.1:11434.
>>> Install complete. Run "ollama" from the command line.


In [2]:
%%writefile app.py

import os
import re
import git
import json
import time
import base64
import requests # Used for Ollama API calls
import pandas as pd
import streamlit as st
from datetime import datetime, timedelta
from collections import Counter, defaultdict
from git.exc import GitCommandError
from pathlib import Path
import plotly.express as px
import plotly.graph_objects as go
from typing import Dict, List, Tuple, Optional

# ===============================================
# PAGE CONFIGURATION
# ===============================================

st.set_page_config(
    page_title="RepoMind AI - Advanced Repository Analyzer",
    page_icon="🧠",
    layout="wide",
    initial_sidebar_state="expanded",
    menu_items={
        'Get Help': 'https://github.com/yourusername/repomind',
        'Report a bug': "https://github.com/yourusername/repomind/issues",
        'About': "# RepoMind AI\nAdvanced AI-powered repository analyzer for understanding GitHub projects instantly."
    }
)

# ===============================================
# CUSTOM CSS STYLING
# ===============================================

st.markdown("""
<style>
    @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700&display=swap');
    .stApp { font-family: 'Inter', sans-serif; }
    .main-header {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        -webkit-background-clip: text;
        -webkit-text-fill-color: transparent;
        background-clip: text;
        font-size: 3.5em;
        font-weight: 800;
        text-align: center;
        margin-bottom: 10px;
    }
    .stButton > button {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        color: white; border: none; padding: 12px 30px; border-radius: 10px; font-weight: 600;
    }
</style>
""", unsafe_allow_html=True)

# ===============================================
# UTILITY AND ANALYSIS FUNCTIONS
# ===============================================

def get_file_list(repo_path: str) -> List[str]:
    """Get a list of all file paths in the repository for chatbot context."""
    file_list = []
    for root, dirs, files in os.walk(repo_path):
        # Skip hidden directories
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for file in files:
            if not file.startswith('.'):
                full_path = os.path.join(root, file)
                file_list.append(os.path.relpath(full_path, repo_path))
    return file_list

def get_file_extension_stats(repo_path: str) -> Dict[str, int]:
    extensions = defaultdict(int)
    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for file in files:
            if not file.startswith('.'):
                ext = Path(file).suffix.lower()
                if ext: extensions[ext] += 1
    return dict(extensions)

def detect_programming_languages(extensions: Dict[str, int]) -> Dict[str, int]:
    language_map = {'.py': 'Python', '.js': 'JavaScript', '.ts': 'TypeScript', '.java': 'Java', '.cpp': 'C++', '.c': 'C', '.cs': 'C#', '.rb': 'Ruby', '.go': 'Go', '.rs': 'Rust', '.swift': 'Swift', '.kt': 'Kotlin', '.php': 'PHP', '.r': 'R', '.m': 'MATLAB', '.scala': 'Scala', '.sh': 'Shell', '.html': 'HTML', '.css': 'CSS', '.vue': 'Vue', '.jsx': 'React', '.tsx': 'TypeScript React'}
    languages = defaultdict(int)
    for ext, count in extensions.items():
        if ext in language_map: languages[language_map[ext]] += count
    return dict(languages)

def analyze_commit_patterns(repo) -> Dict:
    commits = list(repo.iter_commits(max_count=500))
    commit_hours, commit_days, contributors = Counter(), Counter(), Counter()
    commit_months = defaultdict(int)
    for commit in commits:
        dt = datetime.fromtimestamp(commit.committed_date)
        commit_hours[dt.hour] += 1
        commit_days[dt.strftime('%A')] += 1
        commit_months[dt.strftime('%Y-%m')] += 1
        contributors[commit.author.name if commit.author else "Unknown"] += 1
    recent_commits = [c for c in commits if datetime.fromtimestamp(c.committed_date) > datetime.now() - timedelta(days=30)]
    velocity = len(recent_commits) / 30 if recent_commits else 0
    return {'total_commits': len(commits), 'commit_hours': dict(commit_hours), 'commit_days': dict(commit_days), 'commit_months': dict(sorted(commit_months.items())[-12:]), 'contributors': dict(contributors.most_common(10)), 'velocity': round(velocity, 2), 'most_active_hour': max(commit_hours, key=commit_hours.get) if commit_hours else None, 'most_active_day': max(commit_days, key=commit_days.get) if commit_days else None}

def extract_readme_content(repo_path: str) -> Optional[str]:
    for readme in ['README.md', 'readme.md']:
        readme_path = os.path.join(repo_path, readme)
        if os.path.exists(readme_path):
            try:
                with open(readme_path, 'r', encoding='utf-8') as f: return f.read()[:5000]
            except: continue
    return None

def calculate_code_metrics(repo_path: str) -> Dict:
    total_lines, total_files = 0, 0
    largest_file = {"name": "", "lines": 0}
    code_extensions = {'.py', '.js', '.java', '.cpp', '.c', '.go', '.rs', '.ts', '.rb', '.php'}
    for root, dirs, files in os.walk(repo_path):
        dirs[:] = [d for d in dirs if not d.startswith('.')]
        for file in files:
            if Path(file).suffix.lower() in code_extensions:
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8', errors='ignore') as f:
                        lines = len(f.readlines())
                        total_lines += lines; total_files += 1
                        if lines > largest_file["lines"]: largest_file = {"name": os.path.relpath(file_path, repo_path), "lines": lines}
                except: continue
    return {'total_lines': total_lines, 'total_files': total_files, 'avg_lines_per_file': round(total_lines / total_files) if total_files > 0 else 0, 'largest_file': largest_file}

@st.cache_data(show_spinner=False, ttl=3600)
def clone_and_analyze_repository(repo_url: str, max_commits: int = 100) -> Tuple[Dict, Optional[str]]:
    repo_url = repo_url.strip().rstrip('/') + ('' if repo_url.endswith('.git') else '.git')
    repo_name = repo_url.split('/')[-1].replace('.git', '')
    local_path = os.path.join("/tmp", f"repomind_{repo_name}")
    try:
        if os.path.exists(local_path):
             # Simple pull, for a more robust solution, you might handle conflicts
            git.Repo(local_path).remotes.origin.pull()
        else:
            git.Repo.clone_from(repo_url, local_path, depth=max_commits)

        repo = git.Repo(local_path)
        # We fetch all commits up to the max_commits depth
        commits = list(repo.iter_commits(max_count=max_commits))
        commit_data = [{'hash': c.hexsha[:7], 'author': c.author.name if c.author else "Unknown", 'date': datetime.fromtimestamp(c.committed_date), 'message': c.message.strip()[:100]} for c in commits]
        file_stats = get_file_extension_stats(local_path)
        repo_size = sum(os.path.getsize(os.path.join(dirpath, f)) for dirpath, _, filenames in os.walk(local_path) for f in filenames) / (1024*1024)

        # NEW: Get file list for chatbot context
        file_list = get_file_list(local_path)

        return {
            'repo_name': repo_name,
            'repo_url': repo_url.replace('.git', ''),
            'commits': [c.message.strip() for c in commits],
            'commit_data': commit_data,
            'file_stats': file_stats,
            'languages': detect_programming_languages(file_stats),
            'commit_patterns': analyze_commit_patterns(repo),
            'readme': extract_readme_content(local_path),
            'code_metrics': calculate_code_metrics(local_path),
            'repo_size': round(repo_size, 2),
            'total_files': sum(file_stats.values()),
            'branch_count': len(repo.branches),
            'last_update': commit_data[0]['date'].strftime('%Y-%m-%d') if commit_data else "Unknown",
            'file_list': file_list # Added for chatbot
        }, None
    except Exception as e:
        return None, f"❌ Error: {e}. Check if the URL is correct and the repository is public."


# ===============================================
# AI AND VISUALIZATION FUNCTIONS
# ===============================================

@st.cache_data(ttl=3600)
def generate_ai_summary_ollama(analysis: Dict, ollama_endpoint: str, ollama_model: str) -> Tuple[Optional[str], Optional[str]]:
    prompt = f"You are RepoMind AI, an expert software repository analyst. Analyze this GitHub repository comprehensively.\n\nREPOSITORY: {analysis['repo_name']}\nSTATISTICS:\n- Total Files: {analysis['total_files']}\n- Repository Size: {analysis['repo_size']} MB\n- Commits: {len(analysis['commits'])}\n- Languages: {', '.join(analysis['languages'].keys())}\n\nREADME EXCERPT:\n{analysis['readme'][:1000] if analysis['readme'] else 'N/A'}\n\nBased on this, provide:\n1. **Project Purpose & Overview**\n2. **Key Features**\n3. **Technical Architecture**\n4. **Development Activity**\n5. **Recommendations**\n\nFormat in clear markdown with emojis."
    try:
        response = requests.post(f"{ollama_endpoint}/api/generate", json={"model": ollama_model, "prompt": prompt, "stream": False}, timeout=120)
        response.raise_for_status()
        return response.json().get('response'), None
    except Exception as e: return None, f"❌ AI Generation Error: {e}"

@st.cache_data(ttl=3600)
def generate_code_quality_report_ollama(analysis: Dict, ollama_endpoint: str, ollama_model: str) -> Tuple[Optional[str], Optional[str]]:
    prompt = f"As a senior software architect, analyze the code quality for the '{analysis['repo_name']}' repository.\n\nMETRICS:\n- Total Code Lines: {analysis['code_metrics']['total_lines']}\n- Avg Lines/File: {analysis['code_metrics']['avg_lines_per_file']}\n- Largest File: {analysis['code_metrics']['largest_file']['name']} ({analysis['code_metrics']['largest_file']['lines']} lines)\n\nProvide a brief code quality assessment covering:\n1. **Code Organization**\n2. **Commit Quality**\n3. **Recommendations**"
    try:
        response = requests.post(f"{ollama_endpoint}/api/generate", json={"model": ollama_model, "prompt": prompt, "stream": False}, timeout=90)
        response.raise_for_status()
        return response.json().get('response'), None
    except Exception as e: return None, f"❌ Code quality analysis error: {e}"

# NEW: Chatbot AI function
def generate_chatbot_response_ollama(question: str, history: List[Dict], analysis: Dict, ollama_endpoint: str, ollama_model: str) -> str:
    """Generates a response from the chatbot LLM."""
    context = f"""
    You are a helpful AI assistant specializing in analyzing the GitHub repository: '{analysis['repo_name']}'.
    Use the following context to answer the user's question.

    **README Summary:**
    {analysis.get('readme', 'Not available.')[:2000]}

    **File Structure (sample of files):**
    {', '.join(analysis.get('file_list', [])[:50])}

    **Main Languages:**
    {', '.join(analysis.get('languages', {}).keys())}
    """

    # Simple history formatting
    history_str = "\n".join([f"{msg['role']}: {msg['content']}" for msg in history])

    prompt = f"{context}\n\n--- Conversation History ---\n{history_str}\n\n--- New Question ---\nuser: {question}\nassistant:"

    try:
        response = requests.post(f"{ollama_endpoint}/api/generate", json={"model": ollama_model, "prompt": prompt, "stream": False}, timeout=120)
        response.raise_for_status()
        return response.json().get('response', "Sorry, I couldn't generate a response.")
    except Exception as e:
        return f"Sorry, an error occurred: {e}"

def create_language_chart(languages: Dict) -> go.Figure:
    if not languages: return None
    fig = go.Figure(data=[go.Pie(labels=list(languages.keys()), values=list(languages.values()), hole=0.3)])
    fig.update_layout(title="Languages", showlegend=True, height=400)
    return fig

def create_commit_timeline(commit_months: Dict) -> go.Figure:
    if not commit_months: return None
    fig = go.Figure(data=[go.Scatter(x=list(commit_months.keys()), y=list(commit_months.values()), mode='lines+markers')])
    fig.update_layout(title="Commit Activity Timeline", height=350)
    return fig

# ===============================================
# MAIN STREAMLIT APPLICATION
# ===============================================

def main():
    display_header()
    if 'repo_url' not in st.session_state: st.session_state.repo_url = ""
    if 'analysis' not in st.session_state: st.session_state.analysis = None

    with st.sidebar:
        st.markdown("## ⚙️ Configuration")
        ollama_endpoint = "http://localhost:11434"
        ollama_model = st.text_input("Ollama Model Name", value="llama3", help="Ensure this model is pulled in the Colab setup.")
        st.info(f"Ollama Endpoint is fixed to `{ollama_endpoint}` for Colab.")
        st.markdown("---")
        st.markdown("## 🎛️ Analysis Settings")
        max_commits = st.slider("Commits to Analyze", 50, 500, 100, 50, help="Number of recent commits to fetch and analyze.")
        enable_visualizations = st.checkbox("Enable Visualizations", value=True)
        enable_code_quality = st.checkbox("Code Quality Report", value=True)

    st.markdown("## 🔍 Analyze Repository")
    st.text_input("GitHub Repository URL", key="url_input_widget", value=st.session_state.repo_url, on_change=lambda: setattr(st.session_state, 'repo_url', st.session_state.url_input_widget))
    analyze_button = st.button("🚀 Analyze", type="primary")

    if analyze_button:
        if not st.session_state.repo_url: st.warning("Please enter a repository URL.")
        else:
            with st.spinner("Cloning and analyzing repository... This may take a moment."):
                analysis, error = clone_and_analyze_repository(st.session_state.repo_url, max_commits)
            if error: st.error(error); st.session_state.analysis = None
            else: st.session_state.analysis = analysis; st.session_state.messages = [] # Reset chat on new analysis

    if st.session_state.analysis:
        analysis = st.session_state.analysis
        st.markdown(f"## 📊 Analysis for `{analysis['repo_name']}`")

        # UPDATED: Added a new Chatbot tab
        tab1, tab2, tab3, tab4 = st.tabs(["🤖 AI Summary", "📈 Code & Commits", "💬 Chat with Repo", "📖 README"])

        with tab1:
            with st.spinner(f"🧠 Generating AI summary with {ollama_model}..."):
                summary, error = generate_ai_summary_ollama(analysis, ollama_endpoint, ollama_model)
                if error: st.error(error)
                else: st.markdown(summary)
            if enable_code_quality:
                with st.spinner("🔬 Assessing code quality..."):
                    report, error = generate_code_quality_report_ollama(analysis, ollama_endpoint, ollama_model)
                    if error: st.error(error)
                    else: st.markdown("---"); st.markdown(report)

        with tab2:
            st.markdown(f"### 📜 Full Commit History (Last {len(analysis['commit_data'])} Commits)")
            if analysis['commit_data']:
                # UPDATED: Displaying the full commit dataframe, not just top 20
                df = pd.DataFrame(analysis['commit_data'])
                df['date'] = pd.to_datetime(df['date']).dt.strftime('%Y-%m-%d %H:%M')
                st.dataframe(df, use_container_width=True, hide_index=True)
            if enable_visualizations:
                st.plotly_chart(create_commit_timeline(analysis['commit_patterns']['commit_months']), use_container_width=True)
                st.plotly_chart(create_language_chart(analysis['languages']), use_container_width=True)

        # NEW: Chatbot tab implementation
        with tab3:
            st.markdown("### 💬 Ask Questions About This Repository")
            if "messages" not in st.session_state:
                st.session_state.messages = []

            for message in st.session_state.messages:
                with st.chat_message(message["role"]):
                    st.markdown(message["content"])

            if prompt := st.chat_input("What is the main purpose of this project?"):
                st.session_state.messages.append({"role": "user", "content": prompt})
                with st.chat_message("user"):
                    st.markdown(prompt)

                with st.chat_message("assistant"):
                    with st.spinner("Thinking..."):
                        response = generate_chatbot_response_ollama(prompt, st.session_state.messages, analysis, ollama_endpoint, ollama_model)
                        st.markdown(response)
                st.session_state.messages.append({"role": "assistant", "content": response})

        with tab4:
            st.markdown(analysis['readme'] or "No README found.")

def display_header():
    st.markdown('<h1 class="main-header">🧠 RepoMind AI on Colab</h1>', unsafe_allow_html=True)

if __name__ == "__main__":
    main()

Writing app.py


In [3]:
!pip install streamlit pyngrok pandas gitpython plotly requests -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.1/10.1 MB[0m [31m59.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m91.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [9]:
import os
import asyncio

# Start Ollama server as a background process
async def run_ollama_serve():
    process = await asyncio.create_subprocess_shell(
        "ollama serve",
        stdout=asyncio.subprocess.PIPE,
        stderr=asyncio.subprocess.PIPE
    )
    print("Ollama server started.")
    # The server will run in the background
    # We don't await process.wait() because we want it to run indefinitely

# Run the async function
try:
    loop = asyncio.get_running_loop()
except RuntimeError:  # 'RuntimeError: There is no current event loop...'
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)

loop.create_task(run_ollama_serve())

# Give the server a moment to start up
print("Waiting for Ollama server to start...")
!sleep 5

# Pull the model
print("Pulling the Llama3 model. This will take a few minutes...")
!ollama pull llama3
print("Model pull complete!")

Waiting for Ollama server to start...
Pulling the Llama3 model. This will take a few minutes...
Error: ollama server not responding - could not connect to ollama server, run 'ollama serve' to start it
Model pull complete!


In [5]:
!pip install pyngrok



In [11]:
pip install streamlit



In [None]:
!ngrok http 8501 --region in

Flag --region has been deprecated, ngrok automatically chooses the region with lowest latency


In [None]:
# Replace with your actual ngrok authtoken
NGROK_AUTHTOKEN = "33Yilzhz7P2407iVh5u1WuBHuFx_7yrwB8zBDurH75eqWtkZm"

from pyngrok import ngrok, conf
import os

# Set ngrok authtoken
if NGROK_AUTHTOKEN:
    ngrok.set_auth_token(NGROK_AUTHTOKEN)
    print("Ngrok authtoken set.")
else:
    print("WARNING: Ngrok authtoken not set. Public URL will be temporary.")

# Get the public URL from ngrok
public_url = ngrok.connect(8501)
print(f"🚀 Your Streamlit app is live at: {public_url}")

# Run the Streamlit app
# The '!' runs this as a shell command.
# The '&' runs it in the background.
!streamlit run app.py &