In [2]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os
import warnings

# Suppress specific warning
warnings.filterwarnings('ignore', message="The parameter 'token_pattern' will not be used since 'tokenizer' is not None'")

# Load data
students_df = pd.read_csv('student_data.csv')
catalog_df = pd.read_csv('course_catalog.csv')

# Define a tokenizer function
def custom_tokenizer(text):
    return text.split(', ')

# Vectorization of subjects and interests
vectorizer = CountVectorizer(tokenizer=custom_tokenizer)

# Create feature vectors
student_vectors = vectorizer.fit_transform(students_df['subjects'] + ', ' + students_df['interests'])
course_vectors = vectorizer.transform(catalog_df['subjects'] + ', ' + catalog_df['skills'])

# Save the vectorizer to a file
joblib.dump(vectorizer, 'vectorizer.joblib')

# Calculate cosine similarity
similarity_matrix = cosine_similarity(student_vectors, course_vectors)

# Generate Recommend courses for each student
recommendations = {}
for i, student_id in enumerate(students_df['student_id']):
    similar_courses = similarity_matrix[i].argsort()[::-1]  # Sort courses by similarity
    recommended_courses = catalog_df.iloc[similar_courses[:5]]['course_name'].tolist()  # Top 5 courses
    recommendations[student_id] = recommended_courses

# Save the similarity matrix (model) to a file
joblib.dump(similarity_matrix, 'similarity_matrix.joblib')

# Save the recommendations to a CSV file
recommendations_df = pd.DataFrame.from_dict(recommendations, orient='index')
recommendations_df.to_csv('recommendations.csv')

print("Training completed and models saved!")


Training completed and models saved!


In [12]:
# Sample new student data
new_students = pd.DataFrame({
    'student_id': [101, 102],
    'subjects': ['Math, English, Physics', 'History ,Kiswahili, Geography'],
    'interests': ['Astronomy, Quantum Mechanics', 'Politics, Cultural Studies']
})

# Vectorize the new student's data (subjects + interests)
new_student_vectors = vectorizer.transform(new_students['subjects'] + ', ' + new_students['interests'])

# Calculate cosine similarity between new students and courses
new_similarity_matrix = cosine_similarity(new_student_vectors, course_vectors)

# Generate recommendations for new students
new_recommendations = {}
for i, student_id in enumerate(new_students['student_id']):
    similar_courses = new_similarity_matrix[i].argsort()[::-1]  # Sort courses by similarity
    recommended_courses = catalog_df.iloc[similar_courses[:5]]['course_name'].tolist()  # Top 5 courses
    new_recommendations[student_id] = recommended_courses

# Display recommendations for the new students
for student_id, courses in new_recommendations.items():
    print(f"New Student {student_id} is recommended the following courses: {', '.join(courses)}")


New Student 101 is recommended the following courses: Bachelor of Arts in Journalism, Bachelor of Arts in Journalism, Bachelor of Arts in Literature, Bachelor of Arts in Journalism, Bachelor of Arts in Literature
New Student 102 is recommended the following courses: Bachelor of Arts in Urban Planning, Bachelor of Arts in Urban Planning, Bachelor of Arts in Urban Planning, Bachelor of Arts in International Relations, Bachelor of Arts in Urban Planning


In [4]:
import pickle

# Save vectorizer and course vectors to disk
with open('vectorizer.pkl', 'wb') as vec_file:
    pickle.dump(vectorizer, vec_file)

with open('course_vectors.pkl', 'wb') as course_file:
    pickle.dump(course_vectors, course_file)

# Later, load the vectorizer and course vectors to make new predictions
with open('vectorizer.pkl', 'rb') as vec_file:
    loaded_vectorizer = pickle.load(vec_file)

with open('course_vectors.pkl', 'rb') as course_file:
    loaded_course_vectors = pickle.load(course_file)

# Test the loaded model with new student data
new_student_vectors = loaded_vectorizer.transform(new_students['subjects'] + ', ' + new_students['interests'])
new_similarity_matrix = cosine_similarity(new_student_vectors, loaded_course_vectors)

# Make predictions for the new students (same as above)


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import warnings

# supress specific warning
warnings.filterwarnings('ignore', message="The parameter 'token_pattern' will not be used since 'tokenizer' is not None'")


# Load data
students_df = pd.read_csv('student_data.csv')
catalog_df = pd.read_csv('course_catalog.csv')

# Vectorization of subjects and interests
vectorizer = CountVectorizer(tokenizer=lambda x: x.split(', '))

# Create feature vectors
student_vectors = vectorizer.fit_transform(students_df['subjects'] + ', ' + students_df['interests'])
course_vectors = vectorizer.transform(catalog_df['subjects'] + ', ' + catalog_df['skills'])

# Calculate cosine similarity
similarity_matrix = cosine_similarity(student_vectors, course_vectors)


# Generate Recommend courses for each student
recommendations = {}
for i, student_id in enumerate(students_df['student_id']):
    similar_courses = similarity_matrix[i].argsort()[::-1]  # Sort courses by similarity
    recommended_courses = catalog_df.iloc[similar_courses[:5]]['course_name'].tolist()  # Top 5 courses
    recommendations[student_id] = recommended_courses

# Display recommendations
for student_id, courses in recommendations.items():
    print(f"Student {student_id} is recommended the following courses: {', '.join(courses)}")


Split the Data: Separate your data into training and testing sets.

Train the Model: Create the recommendation model using the training set.

Save the Model: Save the model using a library like joblib or pickle.

Load the Model: Load the saved model for making predictions.

Make Predictions: Use the loaded model to generate recommendations.

In [11]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import joblib

# Define a tokenizer function
def custom_tokenizer(text):
    return text.split(', ')

# Load data
students_df = pd.read_csv('student_data.csv')
catalog_df = pd.read_csv('course_catalog.csv')

# Vectorization of subjects and interests
vectorizer = CountVectorizer(tokenizer=custom_tokenizer)

# Create feature vectors
student_vectors = vectorizer.fit_transform(students_df['subjects'] + ', ' + students_df['interests'])
course_vectors = vectorizer.transform(catalog_df['subjects'] + ', ' + catalog_df['skills'])

# Split the student data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(students_df, student_vectors, test_size=0.2, random_state=42)

# Calculate cosine similarity for the training set
similarity_matrix_train = cosine_similarity(y_train, course_vectors)

# Generate recommendations for the training set
recommendations_train = {}
for i, student_id in enumerate(X_train['student_id']):  # Use only training set
    similar_courses = similarity_matrix_train[i].argsort()[::-1]  # Sort courses by similarity
    recommended_courses = catalog_df.iloc[similar_courses[:5]]['course_name'].tolist()  # Top 5 courses
    recommendations_train[student_id] = recommended_courses

# Save the vectorizer and recommendations model
joblib.dump((vectorizer, recommendations_train), 'course_recommendation_model.pkl')

# Load the model
loaded_vectorizer, loaded_recommendations = joblib.load('course_recommendation_model.pkl')

# Now let's predict for the test set
similarity_matrix_test = cosine_similarity(X_test, course_vectors)

# Generate recommendations for the test set
recommendations_test = {}
for i, student_id in enumerate(X_test['student_id']):  # Use only test set
    similar_courses = similarity_matrix_test[i].argsort()[::-1]  # Sort courses by similarity
    recommended_courses = catalog_df.iloc[similar_courses[:5]]['course_name'].tolist()  # Top 5 courses
    recommendations_test[student_id] = recommended_courses

# Display test recommendations
for student_id, courses in recommendations_test.items():
    print(f"Student {student_id} is recommended the following courses: {', '.join(courses)}")


ValueError: could not convert string to float: 'S522'

In [1]:
import pandas as pd

# Step 1: Read the CSV file into a pandas DataFrame
df = pd.read_csv('student_data.csv')

# Step 2: Drop duplicate rows based on all columns
df_cleaned = df.drop_duplicates()

# Step 3: Save the cleaned DataFrame back to a CSV file
df_cleaned.to_csv('student_data_cleaned.csv', index=False)