In [29]:
import pandas as pd

df = pd.read_csv("datasets/study_data.csv")
df.head()

Unnamed: 0,subject,topic,text,difficulty
0,math,sets,Sets are well-defined collections of objects. ...,basic
1,math,relations and functions,A relation connects elements of one set to ano...,basic
2,math,trigonometric functions,Trigonometric functions link angles with ratio...,basic
3,math,principle of mathematical induction,Mathematical induction proves statements for a...,advanced
4,math,complex numbers and quadratic equations,Complex numbers extend real numbers using i wh...,advanced


In [None]:
df = df.dropna()

df["subject"] = df["subject"].str.strip()
df["topic"] = df["topic"].str.strip()

df["difficulty"] = df["difficulty"].str.strip().str.lower()
df["difficulty"] = df["difficulty"].replace({
    "easy": "basic",
    "medium": "advanced",
    "hard": "advanced"
})

df["text"] = df["text"].str.lower()
df["text"] = df["text"].str.replace(r'[^a-zA-Z\s]', '', regex=True)

df.head()

Unnamed: 0,subject,topic,text,difficulty
0,math,sets,sets are welldefined collections of objects no...,basic
1,math,relations and functions,a relation connects elements of one set to ano...,basic
2,math,trigonometric functions,trigonometric functions link angles with ratio...,basic
3,math,principle of mathematical induction,mathematical induction proves statements for a...,advanced
4,math,complex numbers and quadratic equations,complex numbers extend real numbers using i wh...,advanced


In [31]:
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
df["difficulty_label"] = le.fit_transform(df["difficulty"])

df[["difficulty", "difficulty_label"]].head()

Unnamed: 0,difficulty,difficulty_label
0,basic,1
1,basic,1
2,basic,1
3,advanced,0
4,advanced,0


In [32]:
from sklearn.model_selection import train_test_split

X_text = df["text"]
y = df["difficulty_label"]

X_train_text, X_test_text, y_train, y_test = train_test_split(
    X_text,
    y,
    test_size=0.3,
    random_state=42
)

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

vectorizer = CountVectorizer(stop_words="english")

X_train = vectorizer.fit_transform(X_train_text)
X_test = vectorizer.transform(X_test_text)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

y_pred = model.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.59      0.89      0.71        18
           1       0.33      0.08      0.13        12

    accuracy                           0.57        30
   macro avg       0.46      0.49      0.42        30
weighted avg       0.49      0.57      0.48        30



In [34]:
import pickle

pickle.dump(model, open("model.pkl", "wb"))

pickle.dump(vectorizer, open("vectorizer.pkl", "wb"))

print("Saved successfully")

Saved successfully


In [35]:
loaded_model = pickle.load(open("model.pkl", "rb"))
loaded_vectorizer = pickle.load(open("vectorizer.pkl", "rb"))

print("Loaded successfully")

Loaded successfully


In [36]:
from sklearn.feature_extraction.text import TfidfVectorizer

df["combined_text"] = (
    df["subject"] + " " +
    df["topic"] + " " +
    df["text"]
)

tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    ngram_range=(1,2),
    max_features=2000
)

X_tfidf = tfidf_vectorizer.fit_transform(df["combined_text"])

X_dense = X_tfidf.toarray()
y_dense = df["difficulty_label"].values

In [37]:
from sklearn.cluster import KMeans

kmeans = KMeans(
    n_clusters=7,
    random_state=42,
    n_init=20
)

kmeans.fit(X_tfidf) 

df["cluster"] = kmeans.labels_

In [None]:
def get_related_topics(topic_name):
    topic_row = df[df["topic"].str.lower() == topic_name.lower()]
    
    if topic_row.empty:
        return []
    
    cluster_id = topic_row.iloc[0]["cluster"]
    
    related = df[df["cluster"] == cluster_id]["topic"].tolist()
    
    related = [t for t in related if t.lower() != topic_name.lower()]
    
    related = list(dict.fromkeys(related))
    
    return related[:3]

In [39]:
get_related_topics("units and measurements")


['system of particles and rotational motion', 'oscillations', 'waves']

In [40]:
import numpy as np

feature_names = tfidf_vectorizer.get_feature_names_out()

def extract_keywords(doc_index, top_n=5):
    row = X_tfidf[doc_index].toarray().flatten()
    top_indices = np.argsort(row)[-top_n:]
    return [feature_names[i] for i in top_indices]

print(extract_keywords(0))

['objects notation', 'intersection complement', 'intersection', 'math sets', 'sets']


In [41]:
def extract_keywords_from_text(text, top_n=5):
    text_vector = tfidf_vectorizer.transform([text])
    row = text_vector.toarray().flatten()
    feature_names = tfidf_vectorizer.get_feature_names_out()
    
    top_indices = np.argsort(row)[::-1][:top_n]
    return [feature_names[i] for i in top_indices]

In [42]:
sample_text = df[df["topic"] == "thermodynamics"]["text"].iloc[0]
print(extract_keywords_from_text(sample_text))

['lawbased', 'internal energy', 'paths require', 'process paths', 'lawbased constraints']


In [None]:
import re

IGNORE_WORDS = {
    "concept", "terms", "topic", "theory",
    "reduce", "physical", "physics",
    "chemistry", "biology", "computer",
    "science", "begins", "begin",
    "quantities", "quantity",
    "calculation", "calculations"
}

def clean_keyword(word):
    word = word.lower().strip()

    word = re.sub(r'[^a-z]', '', word)

    if len(word) < 5:
        return None
    if word in IGNORE_WORDS:
        return None
    if not re.search(r"[aeiou]", word):
        return None

    return word

def generate_study_tips(text, score=None):
    raw_keywords = extract_keywords_from_text(text)

    seen = set()
    final_keywords = []

    for phrase in raw_keywords:
        words = phrase.lower().split()

        for word in words:
            cleaned = clean_keyword(word)

            if cleaned and cleaned not in seen:
                seen.add(cleaned)
                final_keywords.append(cleaned)

    tips = [f"Revise the basic concept of {word}." for word in final_keywords]

    return tips[:5]

In [72]:
print(generate_study_tips(sample_text,4))

['Revise the basic concept of units.', 'Revise the basic concept of measurement.']


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import numpy as np

X_dense = X_tfidf.toarray()
y_dense = df["difficulty_label"].values

X_train_dl, X_test_dl, y_train_dl, y_test_dl = train_test_split(
    X_dense,
    y_dense,
    test_size=0.3,
    random_state=42
)

dl_model = Sequential([
    Dense(64, activation='relu', input_shape=(X_train_dl.shape[1],)),
    Dense(32, activation='relu'),
    Dense(2, activation='softmax')
])

dl_model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

dl_model.summary()

history = dl_model.fit(
    X_train_dl,
    y_train_dl,
    epochs=20,
    batch_size=8,
    validation_data=(X_test_dl, y_test_dl),
    verbose=1
)

loss, accuracy = dl_model.evaluate(X_test_dl, y_test_dl)
print("DL Test Accuracy:", accuracy)

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - accuracy: 0.6286 - loss: 0.6855 - val_accuracy: 0.6000 - val_loss: 0.6870
Epoch 2/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6857 - loss: 0.6299 - val_accuracy: 0.6000 - val_loss: 0.6813
Epoch 3/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 40ms/step - accuracy: 0.6857 - loss: 0.5685 - val_accuracy: 0.6000 - val_loss: 0.6775
Epoch 4/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7000 - loss: 0.4863 - val_accuracy: 0.6000 - val_loss: 0.6812
Epoch 5/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7571 - loss: 0.3983 - val_accuracy: 0.6000 - val_loss: 0.7003
Epoch 6/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.9143 - loss: 0.3058 - val_accuracy: 0.6000 - val_loss: 0.7337
Epoch 7/20
[1m9/9[0m [32m━━━━━━━━━━━━━━━━━━━━

In [None]:
import re

def clean_equations(text):
    text = re.sub(r'\([^)]*\)', '', text)
    text = re.sub(r'[^a-zA-Z0-9\s.,;!?]', '', text)
    return text

def split_sentences(text):
    sentences = re.split(r'(?<=[.!?])\s+', text)
    return [s.strip() for s in sentences if s.strip()]

def summarize_dl(text, top_n=2):
    text = clean_equations(text)
    sentences = split_sentences(text)
    
    if len(sentences) <= top_n:
        return text
    
    vectors = vectorizer.transform(sentences).toarray()
    scores = dl_model.predict(vectors)[:, 2]
    
    top_indices = np.argsort(scores)[::-1][:top_n]
    summary = ". ".join([sentences[i] for i in top_indices])
    
    return summary

In [47]:
sample_text = df[df["topic"] == "units and measurements"]["text"].iloc[0]
print(summarize_dl(sample_text))

physics begins with reliable measurement of physical quantities significant figures dimensional analysis and si units reduce calculation errors


In [48]:
embeddings_index = {}
with open("datasets/glove.6B.50d.txt", encoding="utf8") as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = vector

print("Loaded embeddings:", len(embeddings_index))

Loaded embeddings: 400000


In [49]:
motivation_bank = {
    "math": {
        "struggling": [
            "Mathematics rewards patience. Every problem you wrestle with strengthens your logic.",
            "Struggling with math means you're thinking deeply. That’s growth."
        ],
        "improving": [
            "Your mathematical reasoning is sharpening. Keep building that momentum.",
            "You’re starting to see patterns — that’s real progress."
        ]
    },
    "physics": {
        "struggling": [
            "Physics feels tough because it trains you to think differently. Stay with it.",
            "Conceptual confusion in physics is the doorway to mastery."
        ],
        "improving": [
            "You’re beginning to connect theory with intuition. That’s powerful.",
            "Your understanding of physical principles is getting stronger."
        ]
    }
}

In [50]:
def get_performance_category(score):
    if score < 40:
        return "struggling"
    elif score < 75:
        return "improving"
    else:
        return "excelling"

In [51]:
def get_sentence_embedding(sentence, embeddings_index, embedding_dim=50):
    words = sentence.lower().split()
    valid_vectors = [
        embeddings_index[word]
        for word in words
        if word in embeddings_index
    ]
    
    if not valid_vectors:
        return np.zeros(embedding_dim)
    
    return np.mean(valid_vectors, axis=0)

motivation_vectors = [
    get_sentence_embedding(sentence, embeddings_index)
    for sentence in motivation_bank
]

In [52]:
from sklearn.metrics.pairwise import cosine_similarity

def generate_motivation(context_text):
    context_vec = get_sentence_embedding(context_text, embeddings_index).reshape(1, -1)
    
    similarities = cosine_similarity(context_vec, motivation_vectors)
    
    best_index = np.argmax(similarities)
    
    return motivation_bank[best_index]

In [53]:
def generate_subject_motivation(subject, score, context_text):
    category = get_performance_category(score)
    
    if subject not in motivation_bank:
        return "Keep going. Every effort counts."
    
    candidate_sentences = motivation_bank[subject].get(category, [])
    
    if not candidate_sentences:
        return "Stay consistent. You are progressing."
    
    context_vec = get_sentence_embedding(context_text, embeddings_index).reshape(1, -1)
    
    candidate_vectors = [
        get_sentence_embedding(sentence, embeddings_index)
        for sentence in candidate_sentences
    ]
    
    similarities = cosine_similarity(context_vec, candidate_vectors)
    best_index = np.argmax(similarities)
    
    return candidate_sentences[best_index]

In [54]:
context = "I keep mixing up thermodynamics laws and get confused."
generate_subject_motivation("physics", 62, context)

'Your understanding of physical principles is getting stronger.'