<a href="https://colab.research.google.com/github/Tejasri1557/AIML-2025/blob/main/2303A51557_09.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Assessing Mathematical Learning in Higher Education
Q1: Identify the math topic that is hardest to learn.
Q2: List the top 5 math sub-topics for students.
Q3: Identify the question level most often solved by students.
Q4: Name the country with the most solved math problems.
Q5: List the top 10 most-searched keywords related to math topics.


In [25]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split


# Q1: fit a linear model that maps enrolment and average grade to a difficulty
# score, then score one hypothetical topic.
# The dataset is a small hand-entered table with one row per math topic.
data = {
    'Topic': ['Abstract Algebra', 'Calculus', 'Differential Equations', 'Topology', 'Real Analysis', 'Linear Algebra', 'Probability and Statistics'],
    'Num_Students': [300, 500, 350, 100, 450, 400, 600],
    'Avg_Grade': [65, 85, 70, 60, 75, 80, 78],
    'Difficulty_Score': [9, 7, 8, 10, 9, 6, 5]
}

df = pd.DataFrame(data)

# Two numeric predictors; the target is the recorded difficulty score.
X = df[['Num_Students', 'Avg_Grade']]
y = df['Difficulty_Score']

# 80/20 split with a fixed seed so the fitted coefficients are reproducible.
# The held-out part (X_test, y_test) is not evaluated in this cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = LinearRegression()
model.fit(X_train, y_train)

# Score a hypothetical topic with 500 students and an average grade of 80.
# The model was fitted on a DataFrame, so the query is passed as a DataFrame
# with the same column names; a bare nested list makes scikit-learn warn that
# X does not have valid feature names.
# NOTE(review): this prints a prediction for one made-up input; it does not by
# itself name the hardest topic in `df`.
query = pd.DataFrame([[500, 80]], columns=X.columns)
predicted_difficulty = model.predict(query)
print(f"Predicted difficulty score: {predicted_difficulty[0]}")


Predicted difficulty score: 6.143495753240948




In [26]:
from sklearn.cluster import KMeans


# Q2: rank math sub-topics by interest score and show the five highest.
sub_topics_data = {
    'Sub_Topic': ['Calculus', 'Linear Algebra', 'Probability and Statistics', 'Abstract Algebra', 'Differential Equations', 'Geometry', 'Matrix Operations', 'Number Theory'],
    'Interest_Score': [85, 90, 88, 70, 75, 65, 80, 78]
}

sub_topics_df = pd.DataFrame(sub_topics_data)

# Group the sub-topics into 5 clusters on the single interest feature.
# n_init is pinned to 10 (the long-standing default) because leaving it unset
# raises a FutureWarning on scikit-learn 1.2/1.3 and silently switches to
# 'auto' from 1.4 on, which would make cluster labels version-dependent.
kmeans = KMeans(n_clusters=5, n_init=10, random_state=42)
sub_topics_df['Cluster'] = kmeans.fit_predict(sub_topics_df[['Interest_Score']])

# The ranking below depends only on Interest_Score; the Cluster column is
# stored on the frame but does not influence which rows are selected.
top_5_sub_topics = sub_topics_df.nlargest(5, 'Interest_Score')
print("Top 5 math sub-topics based on interest:")
print(top_5_sub_topics[['Sub_Topic', 'Interest_Score']])


Top 5 math sub-topics based on interest:
                    Sub_Topic  Interest_Score
1              Linear Algebra              90
2  Probability and Statistics              88
0                    Calculus              85
6           Matrix Operations              80
7               Number Theory              78


In [30]:
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Q3: train a classifier that maps (problems solved, minutes spent) to a
# question level, then classify one hypothetical student.
data_q3 = {
    'Num_Problems_Solved': [20, 50, 80, 30, 60, 90, 15],
    'Time_Spent_Minutes': [15, 30, 45, 25, 35, 50, 20],
    'Level': ['Basic', 'Intermediate', 'Advanced', 'Basic', 'Intermediate', 'Advanced', 'Basic']
}

df_q3 = pd.DataFrame(data_q3)

# Encode the string levels as integers; the same encoder is reused below to
# turn the predicted integer back into its label.
encoder = LabelEncoder()
df_q3['Level_Encoded'] = encoder.fit_transform(df_q3['Level'])

X_q3 = df_q3[['Num_Problems_Solved', 'Time_Spent_Minutes']]
y_q3 = df_q3['Level_Encoded']

# 80/20 split with a fixed seed; the held-out rows are not evaluated here.
X_train, X_test, y_train, y_test = train_test_split(X_q3, y_q3, test_size=0.2, random_state=42)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)

# Classify a hypothetical student who solved 70 problems in 40 minutes.
# The query carries the training column names so scikit-learn does not warn
# that X lacks valid feature names (the forest was fitted on a DataFrame).
# NOTE(review): this predicts a level for one made-up input; it does not count
# which level was solved most often in `df_q3`.
query_q3 = pd.DataFrame([[70, 40]], columns=X_q3.columns)
predicted_level = rf_model.predict(query_q3)
predicted_level_str = encoder.inverse_transform(predicted_level)
print(f"Predicted question level: {predicted_level_str[0]}")


Predicted question level: Intermediate




In [28]:

# Q4: find the country whose solved-problem count is the largest.
countries_data = {
    'Country': [
        'China',
        'United States',
        'Russia',
        'South Korea',
        'India',
    ],
    'Math_Problems_Solved': [
        1500,
        1300,
        1200,
        1100,
        900,
    ],
}
df_countries = pd.DataFrame(countries_data)

# idxmax gives the row label of the highest count; .loc then returns that
# whole row as a Series.
solved_counts = df_countries['Math_Problems_Solved']
best_row_label = solved_counts.idxmax()
top_country = df_countries.loc[best_row_label]

print(f"Country with most solved math problems: {top_country['Country']}")


Country with most solved math problems: China


In [31]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


# Q5: rank the words appearing in a small list of search queries and print
# the ten with the highest weight.
search_queries = [
    "Learn Algebra",
    "Calculus basics",
    "Introduction to Linear Algebra",
    "Probability theory and statistics",
    "Differential equations for beginners",
    "Matrix operations",
    "Understanding number theory",
    "Geometry problems",
    "Real analysis for beginners",
    "Advanced calculus"
]

# Bag-of-words term counts, with English stop words dropped by the vectorizer.
vectorizer = CountVectorizer(stop_words='english')
X = vectorizer.fit_transform(search_queries)

# A single-component LDA: its only row of components_ holds one weight per
# vocabulary term, which is what the ranking below reads.
lda = LatentDirichletAllocation(n_components=1, random_state=42)
lda.fit(X)

# Despite its name, top_keywords is the full vocabulary; the top-10 selection
# happens in the slicing step.
top_keywords = np.asarray(vectorizer.get_feature_names_out())
keywords_scores = lda.components_[0]

# argsort is ascending, so take the last ten positions and flip them to get
# the highest-weighted term first.
ascending_order = keywords_scores.argsort()
top_10_keywords = top_keywords[ascending_order[-10:]][::-1]

print("Top 10 keywords related to math topics:")
for rank, term in enumerate(top_10_keywords, start=1):
    print(f"{rank}. {term}")


Top 10 keywords related to math topics:
1. algebra
2. beginners
3. calculus
4. theory
5. understanding
6. introduction
7. analysis
8. basics
9. differential
10. equations
