In [1]:
import pandas as pd

In [14]:
%autosave 60

Autosaving every 60 seconds


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Load dataset
df = pd.read_csv("all_merged(no dups).csv")

X = df["user_query"]
y = df["command"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)

# Pipeline: TF-IDF + Logistic Regression
model = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("clf", LogisticRegression(max_iter=3000))
])

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))


                                                                                                            precision    recall  f1-score   support

                                                                aa-complain /etc/apparmor.d/usr.sbin.nginx       1.00      1.00      1.00         2
                                                                 aa-enforce /etc/apparmor.d/usr.sbin.nginx       0.60      1.00      0.75         3
                                                                                                 aa-status       0.50      1.00      0.67         2
                                                                              addgroup --gid 2000 newgroup       1.00      1.00      1.00         3
                                                                    adduser --home /home/testuser testuser       1.00      1.00      1.00         3
                                                                                           adduser newuser     

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [15]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# ----------------------------
# 1. Load your dataset
# ----------------------------
# Example CSV structure: user_query,command,description
df = pd.read_csv("all_merged(no dups).csv")

# Unique command set
commands = df['command'].unique().tolist()

# ----------------------------
# 2. Load a Sentence-BERT model
# ----------------------------
# all-MiniLM-L6-v2 is lightweight and works well for semantic similarity
model = SentenceTransformer('all-MiniLM-L6-v2')

# ----------------------------
# 3. Encode all commands once
# ----------------------------
command_embeddings = model.encode(commands, convert_to_tensor=True)

# ----------------------------
# 4. Function: get best match
# ----------------------------
def get_best_command(query, top_k=3):
    query_embedding = model.encode(query, convert_to_tensor=True)
    
    # Compute cosine similarity
    cosine_scores = util.pytorch_cos_sim(query_embedding, command_embeddings)[0]
    
    # Get top_k results
    top_results = torch.topk(cosine_scores, k=top_k)
    
    matches = []
    for score, idx in zip(top_results[0], top_results[1]):
        matches.append({
            "command": commands[idx],
            "score": float(score)
        })
    return matches

# ----------------------------
# 5. Test with new queries
# ----------------------------
test_queries = [
    "show me all files in this folder with details",
    "open file.txt in read only mode using vim",
    "how to refresh htop every half second",
    "remove unused dependencies in fedora"
]

for q in test_queries:
    print(f"\nQuery: {q}")
    results = get_best_command(q)
    for r in results:
        print(f"  → {r['command']} (score={r['score']:.4f})")



Query: show me all files in this folder with details
  → dirs -c (score=0.4365)
  → dirs -p (score=0.4249)
  → dirs -v (score=0.3922)

Query: open file.txt in read only mode using vim
  → vim file.txt (score=0.7027)
  → vim -u NONE file.txt (score=0.6749)
  → vim -R file.txt (score=0.6727)

Query: how to refresh htop every half second
  → htop (score=0.5557)
  → htop -u user (score=0.5180)
  → htop -t (score=0.5085)

Query: remove unused dependencies in fedora
  → apt autoremove (score=0.4001)
  → systemctl list-dependencies nginx (score=0.3789)
  → dnf autoremove (score=0.3586)


In [25]:
# import pandas as pd
# import torch
# from sentence_transformers import SentenceTransformer, util

# # Load dataset
# df = pd.read_csv("all_merged(no_dups).csv")

# # Instead of just commands, combine command + description
# df['cmd_text'] = df['command'] + " : " + df['description']

# # Unique targets
# cmd_texts = df['cmd_text'].unique().tolist()

# # Load a better semantic model (optimized for Q&A/retrieval)
# model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# # Encode commands+descriptions
# cmd_embeddings = model.encode(cmd_texts, convert_to_tensor=True)

# def get_best_command(query, top_k=3):
#     query_embedding = model.encode(query, convert_to_tensor=True)
#     scores = util.cos_sim(query_embedding, cmd_embeddings)[0]
#     top_results = torch.topk(scores, k=top_k)
    
#     matches = []
#     for score, idx in zip(top_results[0], top_results[1]):
#         matches.append({
#             "cmd_text": cmd_texts[idx],
#             "score": float(score)
#         })
#     return matches

# # Test
# queries = [
#     "show me all files in this folder with details",
#     "open file.txt in read only mode using vim",
#     "how to refresh htop every half second",
#     "remove unused dependencies in fedora"
# ]

# for q in queries:
#     print(f"\nQuery: {q}")
#     results = get_best_command(q)
#     for r in results:
#         print(f"  → {r['cmd_text']} (score={r['score']:.4f})")


In [22]:
import pandas as pd
import torch
from sentence_transformers import SentenceTransformer, util

# Load dataset
df = pd.read_csv("commands.csv")

# Use user queries directly
queries_list = df['user_query'].tolist()
commands_list = df['command'].tolist()

# Load semantic model
model = SentenceTransformer('multi-qa-mpnet-base-dot-v1')

# Encode all queries
query_embeddings = model.encode(queries_list, convert_to_tensor=True)

def get_best_command(user_input, top_k=3):
    input_emb = model.encode(user_input, convert_to_tensor=True)
    scores = util.cos_sim(input_emb, query_embeddings)[0]
    top_results = torch.topk(scores, k=top_k)
    
    matches = []
    for score, idx in zip(top_results[0], top_results[1]):
        matches.append({
            "user_query": queries_list[idx],
            "command": commands_list[idx],
            "score": float(score)
        })
    return matches

# Test
test_queries = [
    "show me all files in this folder with details",
    "open file.txt in read only mode using vim",
    "how to refresh htop every half second",
    "remove unused dependencies in fedora"
]

for q in test_queries:
    print(f"\nQuery: {q}")
    results = get_best_command(q)
    for r in results:
        print(f"  → {r['command']} (matched '{r['user_query']}') score={r['score']:.4f}")



Query: show me all files in this folder with details
  → ls -d */ (matched 'Show me only the directories in this folder') score=0.7997
  → ls -R (matched 'Show me all files and directories, including those nested within others') score=0.7784
  → ls -la (matched 'I need to see everything in this folder, including dot files, with their full attributes') score=0.7776

Query: open file.txt in read only mode using vim
  → vim -R file.txt (matched 'Open file.txt in Vim in read-only mode') score=0.9893
  → vim -R file.txt (matched 'Please open file.txt using Vim in read-only mode') score=0.9741
  → vim -R file.txt (matched 'How do I open `file.txt` with Vim in read-only mode?') score=0.9665

Query: how to refresh htop every half second
  → htop -d 5 (matched 'How do I run `htop` so it refreshes every 0.5 seconds?') score=0.9170
  → htop -d 5 (matched 'How do I launch htop so it updates every half second?') score=0.8931
  → htop -d 5 (matched 'How can I get `htop` to refresh twice a second?')

In [23]:
def get_best_command(user_input, top_k=3):
    input_emb = model.encode(user_input, convert_to_tensor=True)
    scores = util.cos_sim(input_emb, query_embeddings)[0]
    top_results = torch.topk(scores, k=len(scores))  # check all scores

    seen_commands = set()
    matches = []

    for score, idx in zip(top_results[0], top_results[1]):
        cmd = commands_list[idx]
        if cmd not in seen_commands:
            matches.append({
                "user_query": queries_list[idx],
                "command": cmd,
                "score": float(score)
            })
            seen_commands.add(cmd)
        if len(matches) >= top_k:
            break

    return matches


In [24]:
# Test
test_queries = [
    "show me all files in this folder with details",
    "open file.txt in read only mode using vim",
    "how to refresh htop every half second",
    "remove unused dependencies in fedora"
]

for q in test_queries:
    print(f"\nQuery: {q}")
    results = get_best_command(q)
    for r in results:
        print(f"  → {r['command']} (matched '{r['user_query']}') score={r['score']:.4f}")



Query: show me all files in this folder with details
  → ls -d */ (matched 'Show me only the directories in this folder') score=0.7997
  → ls -R (matched 'Show me all files and directories, including those nested within others') score=0.7784
  → ls -la (matched 'I need to see everything in this folder, including dot files, with their full attributes') score=0.7776

Query: open file.txt in read only mode using vim
  → vim -R file.txt (matched 'Open file.txt in Vim in read-only mode') score=0.9893
  → vim file.txt (matched 'Please open file.txt using Vim') score=0.8068
  → vim -u NONE file.txt (matched 'Run Vim on `file.txt` in a clean mode, ignoring `.vimrc`') score=0.7464

Query: how to refresh htop every half second
  → htop -d 5 (matched 'How do I run `htop` so it refreshes every 0.5 seconds?') score=0.9170
  → htop (matched 'Execute the htop command') score=0.6853
  → htop -u user (matched 'Monitor processes for 'user' using `htop`') score=0.6008

Query: remove unused dependencies 

In [26]:
model.save("saved_model")        # save


In [27]:
torch.save(query_embeddings, "query_embeddings.pt")  # save



In [28]:

torch.save(commands_list, "commands_list.pt")   # save the list too