# Course Codes  
BABS_V 506 BA1 BA2 2024W1 Analyzing and Modelling Uncertainty = 147283  
BABS_V 507 BA1 BA2 2024W1 Descriptive and Predictive Business Analysis = 145559  
BAMS_V 506 BA1 BA2 2024W1 Optimal Decision Making I = 148028
BAIT_V 508 BA1 BA2 2024W1 Business Analytics Programming = 147623  
Learning Analytics hackathon = 161721

In [None]:
from datetime import datetime
import dotenv
import os
import canvasapi
from bs4 import BeautifulSoup
import pandas as pd

# Load variables from the nearest .env file into the process environment.
dotenv.load_dotenv(dotenv.find_dotenv())

TOKEN = os.environ.get('CANVAS_API_TOKEN')
BASEURL = 'https://ubc.instructure.com'

# Fail fast with an explicit message: os.environ.get returns None when the
# variable is absent, and handing that to the client would only surface
# later as a much less obvious error.
if not TOKEN:
    raise RuntimeError(
        "CANVAS_API_TOKEN is not set; add it to your .env file or environment."
    )

# Shared Canvas client used by the cells below.
canvas_api = canvasapi.Canvas(BASEURL, TOKEN)

# Collect Course information using Canvas API

In [None]:
course_ids = [161721, 147283, 145559, 148028, 147623]  # Replace with your actual course IDs

# One summary record (dict) per course, in the order of course_ids
course_data = []

for course_id in course_ids:
    course = canvas_api.get_course(course_id)
    course_name = course.name

    # Reduce each timestamp (parsed as "%Y-%m-%dT%H:%M:%SZ") to YYYY-MM-DD;
    # a date that is not set on the course is reported as 'N/A'.
    date_labels = []
    for raw_timestamp in (course.start_at, course.end_at):
        if not raw_timestamp:
            date_labels.append('N/A')
            continue
        parsed = datetime.strptime(raw_timestamp, "%Y-%m-%dT%H:%M:%SZ")
        date_labels.append(parsed.strftime("%Y-%m-%d"))
    course_start_date, course_end_date = date_labels

    # Walk the student listing once and count its members
    student_count = sum(1 for _ in course.get_users(enrollment_type=['student']))

    course_data.append({
        'Course ID': course_id,
        'Course Title': course_name,
        'Course Start Date': course_start_date,
        'Course End Date': course_end_date,
        'Total Number of Registered Students': student_count
    })

# Tabulate the records, show them, and save them next to the notebook
course_df = pd.DataFrame(course_data)
print(course_df)
course_df.to_csv('Course_data.csv', index=False)


# Collecting Discussion information

In [None]:
def _html_to_text(html):
    '''Return the plain text of an HTML fragment, or '' when *html* is None or empty.'''
    if not html:
        return ''
    return BeautifulSoup(html, "html.parser").get_text()


# One row per top-level discussion entry, across all courses in course_ids
discussion_data = []

for course_id in course_ids:
    course = canvas_api.get_course(course_id)
    course_name = course.name

    # Map user ID -> enrollment type so each entry can be labelled with its author's role.
    # If a user has several enrollments, the last one returned wins.
    enrollments = {enrollment.user_id: enrollment.type for enrollment in course.get_enrollments()}

    for discussion in course.get_discussion_topics():
        discussion_id = discussion.id
        discussion_title = discussion.title
        # getattr + _html_to_text: a topic without a message body yields '' instead of crashing
        discussion_content = _html_to_text(getattr(discussion, 'message', None))

        # Materialise the entries so they can be both counted and iterated
        entries = list(discussion.get_topic_entries())

        # Number of top-level entries posted in this discussion
        reply_count = len(entries)

        for entry in entries:
            entry_id = entry.id
            # NOTE(review): presumably some entries (e.g. deleted ones) carry no
            # message/user_id; they are kept with empty text and no role — confirm.
            entry_content = _html_to_text(getattr(entry, 'message', None))
            entry_user_id = getattr(entry, 'user_id', None)

            # Role of the entry's author; None when the user has no enrollment record
            user_role = enrollments.get(entry_user_id)

            replies = list(entry.get_replies())
            entry_reply_count = len(replies)
            print("Entry replies", entry_reply_count)

            # Discussion title, then the entry text, then every reply, one per line
            reply_texts = [_html_to_text(getattr(reply, 'message', None)) for reply in replies]
            combined_message = "\n".join([f"{discussion_title}\n{entry_content}", *reply_texts])
            print("Combined message", combined_message)

            discussion_data.append({
                'Course ID': course_id,
                'Course Name': course_name,
                'Discussion ID': discussion_id,
                'Discussion Title': discussion_title,
                'Discussion Content': discussion_content,
                'Number of Total Entries': reply_count,
                'Entry ID': entry_id,
                'Entry Content': entry_content,
                'User ID of Entry Owner': entry_user_id,
                'Role of Entry Owner': user_role,
                'Number of Entry replies': entry_reply_count,
                'Combined Message': combined_message
            })

# Create a DataFrame from the collected discussion data
discussion_df = pd.DataFrame(discussion_data)

# Display the DataFrame and save it
print(discussion_df)
discussion_df.to_csv('final_entries_data.csv', index=False)

# Text Preprocessing 

In [None]:
# Collapse the per-entry rows into a single row per discussion
discussion_keys = ['Course ID', 'Course Name', 'Discussion ID', 'Discussion Title', 'Discussion Content']
per_discussion = discussion_df.groupby(discussion_keys, as_index=False)

final_df = per_discussion.agg(
    # total replies received by all entries of the discussion
    Number_of_Entry_Replies=('Number of Entry replies', 'sum'),
    # every entry's combined text, separated by single spaces
    Combined_Message=('Combined Message', ' '.join),
    # largest reply count of any single entry in the discussion
    Max_Depth_of_Entry=('Number of Entry replies', 'max'),
)

# Show the aggregated table
print(final_df)

import string
from nltk.corpus import stopwords

# Translation table that deletes every ASCII punctuation character
translator = str.maketrans('', '', string.punctuation)
sw = stopwords.words('english')
# Same words as `sw`, held in a set so each membership test is O(1) rather
# than a scan of the whole list for every word of every document.
_stopword_set = frozenset(sw)


def clean_text(text):
    ''' Return a cleaned version of the input string.

        The text is lower-cased, ASCII punctuation is removed, and words found
        in the English stopword list are dropped; the remaining words are
        joined with single spaces.

        Input: text (str)
        Output: the cleaned string
    '''
    # NOTE(review): punctuation is removed before the stopword check, so a
    # stopword that contains an apostrophe presumably never matches — confirm
    # whether that is intended.
    text_no_punctuation = text.lower().translate(translator)
    return ' '.join(w for w in text_no_punctuation.split() if w not in _stopword_set)
# Store the cleaned text in a new column, then write the per-discussion table to disk
cleaned_messages = final_df['Combined_Message'].apply(clean_text)
final_df['Cleaned_combined'] = cleaned_messages
final_df.to_csv('Cleaned_final_entries_data.csv', index=False)

# Keywords using TF-IDF

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
# NOTE: this rebinds `course_df`, which the course-information cell above bound
# to the course summary table. The CSV read here is not written by any visible
# cell of this notebook; it has to provide the 'Course_message' column that is
# passed to get_keywords_tfidf below.
course_df = pd.read_csv("course_keywords_entries_data.csv", encoding = 'utf-8')

def get_keywords_tfidf(document_list, top_n=10):
    '''
    This function gets a list of documents as input and returns the top keywords for each document using TF-IDF scores.
    Input: A list of documents (text); None/NaN entries are treated as empty documents.
           top_n: how many keywords to keep per document (default 10).
    Output: One space-separated keyword string per document, highest TF-IDF score first.
            An empty input returns an empty list.
    '''
    # Nothing to vectorize: return early instead of letting the vectorizer raise
    if len(document_list) == 0:
        return []

    # Empty CSV cells arrive as NaN (a float that is not equal to itself) and
    # are rejected by the vectorizer, so replace missing values with ''.
    documents = ['' if doc is None or doc != doc else doc for doc in document_list]

    vectorizer = TfidfVectorizer() # Step 1: Create a TF-IDF vectorizer
    tfidf_matrix = vectorizer.fit_transform(documents).tocsr() # Step 2: Calculate the TF-IDF matrix (CSR layout)
    feature_names = vectorizer.get_feature_names_out() # Step 3: Get feature names (words)

    # Step 4: Extract the top keywords for each document.
    # In CSR layout, row r occupies positions indptr[r]:indptr[r+1] of
    # `indices` (column numbers) and `data` (scores); slicing these avoids
    # looking up every matrix cell individually.
    indptr = tfidf_matrix.indptr
    total = len(documents)
    top_keywords = [] # accumulator
    for doc_idx, (start, end) in enumerate(zip(indptr[:-1], indptr[1:])):
        scored_terms = [
            (col, score)
            for col, score in zip(tfidf_matrix.indices[start:end], tfidf_matrix.data[start:end])
            if score != 0
        ]
        # sorted() is stable, so equally scored terms keep their column order
        best_terms = sorted(scored_terms, key=lambda pair: pair[1], reverse=True)[:top_n]
        top_keywords.append(' '.join(feature_names[col] for col, _ in best_terms))

        # Progress report after every 200 documents
        if doc_idx % 200 == 199:
            print(f'Processed {doc_idx+1}/{total} documents.')

    return top_keywords

# Compute the top TF-IDF keywords of every course message and show the new column
course_messages = course_df['Course_message'].tolist()
course_df["course_keywords"] = get_keywords_tfidf(course_messages)
print(course_df["course_keywords"])
#course_df.to_csv('course_keywords_final_entries_data.csv', index=False)



# Analysis- Semantic Similarity using Word2Vec 

In [None]:
from gensim.models import Word2Vec
import pandas as pd

df = pd.read_csv("keywords_data.csv", encoding='utf-8')
# Empty cells are read back as NaN (a float with no .split()), so treat them
# as empty documents before tokenizing on whitespace.
docs = [row.split() for row in df['Cleaned_combined'].fillna('')]

# training word2vec model using the list of words in docs
# (min_count=5: words seen fewer than 5 times are left out of the vocabulary)
model = Word2Vec(docs, min_count=5, vector_size=50, workers=3, window=5, sg = 1)
# save the model for future use; you don't need to train Word2Vec for multiple times
model.save("word2vec_proj.model")
# load model from stored file
model = Word2Vec.load("word2vec_proj.model")

#new_entry= "Fries better with ketchup than with mustard"
## Relevant keywords---> services/ technology / Business / engineering
words_to_check = ["mustard", "ketchup", "fries"]
# Number of neighbours requested per word; the printed heading uses the same
# value so the output always describes what was actually computed.
top_n = 2
results = []
# Find the top_n most similar words for each specified word
for word in words_to_check:
    # most_similar raises KeyError for a word outside the vocabulary
    # (e.g. one filtered out by min_count); report it and move on.
    if word not in model.wv.key_to_index:
        print(f"'{word}' is not in the Word2Vec vocabulary; skipping.")
        continue
    top_similar_words = model.wv.most_similar(word, topn=top_n)
    print(f"Top {top_n} words similar to {word}:")
    for similar_word, score in top_similar_words:
        print(f"{similar_word}: {score:.4f}")
        results.append({
            'Input Word': word,
            'Similar Word': similar_word,
            'Similarity Score': score
        })
    print("\n")

# Create a DataFrame from the results; the columns are named explicitly so the
# table keeps its layout even when every word was skipped.
similarity_df = pd.DataFrame(results, columns=['Input Word', 'Similar Word', 'Similarity Score'])

# Display the DataFrame
print(similarity_df)

# Analysis- Wordcloud (Discussion within the Course)

In [None]:
def plot_wordcloud(keywords, title):
    '''
    Draw a word cloud for a collection of keywords and show it with a title.
    Input: keywords - iterable of keyword strings; title - figure title
    Output: None (the figure is displayed). When there are no keywords a
            message is printed and nothing is plotted.
    '''
    text = ' '.join(keywords)
    # WordCloud.generate needs at least one word; skip empty keyword lists
    if not text.strip():
        print(f"No keywords to plot for: {title}")
        return

    # Imported here because no visible cell before this one imports either
    # name, so the function works regardless of which cells ran earlier.
    import matplotlib.pyplot as plt
    from wordcloud import WordCloud

    wordcloud = WordCloud(width=800, height=400, max_font_size=100, background_color='white').generate(text)
    plt.figure(figsize=(10,5))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')
    plt.title(title)
    plt.show()
# Keep only the rows of course 161721, then draw one word cloud per remaining row
selected_course_id = 161721
df = df.loc[df['Course ID'] == selected_course_id]
print(df)
for keyword_string, discussion_title in zip(df['top_keyword_tfidf'], df['Discussion Title']):
    # keywords are stored as one space-separated string per discussion
    plot_wordcloud(keyword_string.split(), discussion_title)

# Analysis- Network Graph (Across Courses)

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import matplotlib.patches as mpatches

# Load your data
df = pd.read_csv("keywords_data.csv", encoding='utf-8')

# Step 2: Compute TF-IDF and Cosine Similarity
# Empty cells are read back as NaN, which the vectorizer rejects; treat them as empty text.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(df['Cleaned_combined'].fillna(''))
similarity_matrix = cosine_similarity(tfidf_matrix)

# Step 3: Create a NetworkX graph
threshold = 0.2  # Set a threshold for similarity to consider an edge
G = nx.Graph()

# Create a color map for each course ID
# NOTE: this rebinds `course_ids` (a hand-written list in the course-information cell)
# to the unique IDs found in the CSV.
course_ids = df['Course ID'].unique()  # Get unique course IDs
# plt.get_cmap replaces plt.cm.get_cmap, which recent matplotlib releases no longer provide
colors = plt.get_cmap('tab10', len(course_ids))  # Create a colormap
course_color_map = {course_id: colors(i) for i, course_id in enumerate(course_ids)}  # Map course IDs to colors

# Add nodes and edges based on the similarity matrix
node_count = similarity_matrix.shape[0]
for i in range(node_count):
    course_id = df['Course ID'].iloc[i]  # Get the course ID for the current discussion
    discussion_title = df['Discussion Title'].iloc[i]  # Get the discussion title
    G.add_node(i, course_id=course_id, title=discussion_title)  # Store course ID and title as node attributes
    # Only look at j > i: the matrix is symmetric and self-similarity is not an edge
    for j in range(i + 1, node_count):
        if similarity_matrix[i, j] > threshold:  # Check if similarity exceeds threshold
            G.add_edge(i, j, weight=similarity_matrix[i, j])

# Step 4: Draw the network diagram
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)  # Use a layout for better visualization

# Assign colors based on course ID
node_colors = [course_color_map[G.nodes[i]['course_id']] for i in G.nodes]

nx.draw_networkx_nodes(G, pos, node_size=700, node_color=node_colors)
nx.draw_networkx_edges(G, pos, width=1.0, alpha=0.5)

# Add labels with small font size
labels = {i: G.nodes[i]['title'] for i in G.nodes}  # Create a dictionary for labels
nx.draw_networkx_labels(G, pos, labels, font_size=8)  # Adjust font_size for smaller labels

# Create a legend for course colors
legend_handles = [mpatches.Patch(color=course_color_map[course_id], label=f'Course ID: {course_id}') for course_id in course_ids]

# Add legend to the plot
plt.legend(handles=legend_handles, title="Courses", loc='upper right')

plt.title("Discussion Similarity Network")
plt.axis('off')  # Hide the axes
plt.show()
