### 1. Loading the Data

In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
student_data= pd.read_csv('../Data/final_student_data.csv')
job_data = pd.read_csv('../Data/final_job_data.csv')
course_data = pd.read_csv('../Data/final_course_data.csv')

In [3]:
course_data['Category'].unique()
course_data['Category'].value_counts()

Category
Business                            624
Data Science                        387
Computer Science                    341
Information Technology              166
Health                              153
Arts and Humanities                  87
Physical Science and Engineering     81
Personal Development                 57
Social Sciences                      50
Language Learning                    34
Math and Logic                       11
计算机科学                                 1
Ciencia de Datos                      1
Negocios                              1
Ciencias de la Computación            1
Negócios                              1
データサイエンス                              1
Tecnologia da informação              1
Name: count, dtype: int64

In [4]:
print(student_data.info())
print(course_data.info())
print(job_data.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 86 entries, 0 to 85
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Branch           86 non-null     object
 1   Percentage_10th  86 non-null     int64 
 2   Percentage_12th  86 non-null     int64 
 3   Skills           56 non-null     object
 4   Career_Goal      86 non-null     object
 5   StudentId        86 non-null     int64 
dtypes: int64(3), object(3)
memory usage: 4.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1998 entries, 0 to 1997
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   Title        1998 non-null   object
 1   Category     1998 non-null   object
 2   Course Type  1998 non-null   object
 3   Skills       1998 non-null   object
 4   course_id    1998 non-null   object
dtypes: object(5)
memory usage: 78.2+ KB
None
<class 'pandas.core.frame.DataFrame'>
RangeI

In [5]:
print("Student Data:")
display(student_data.head())

print("Course Data:")
display(course_data.head())

print("Job Data:")
display(job_data.head())

Student Data:


Unnamed: 0,Branch,Percentage_10th,Percentage_12th,Skills,Career_Goal,StudentId
0,CSIT,85,85,"Python, R, Data Analysis, Machine Learning, SQ...",Data Scientist,1
1,CSIT,85,95,"Python, R, Data Analysis, Machine Learning, SQ...",Data Scientist,2
2,CSIT,85,85,"Python, R, Data Analysis, Machine Learning, SQ...",Data Scientist,3
3,CS,85,95,,Cybersecurity Analyst,4
4,CS,85,95,"Python, Java, C++, Data Structures, Algorithms...",Software Engineer,5


Course Data:


Unnamed: 0,Title,Category,Course Type,Skills,course_id
0,Machine Learning Specialization,Data Science,Specialization,"Decision Trees, Artificial Neural Network, Log...",machine_learning_specialization
1,Introduction to Data Science Specialization,Data Science,Specialization,"Data Science, Relational Database Management S...",introduction_to_data_science_specialization
2,Data Science Fundamentals with Python and SQL ...,Data Science,Specialization,"Data Science, Github, Python Programming, Jupy...",data_science_fundamentals_with_python_and_sql_...
3,Key Technologies for Business Specialization,Business,Specialization,"Data Science, Artificial Intelligence (AI), Bu...",key_technologies_for_business_specialization
4,Deep Learning Specialization,Data Science,Specialization,"Artificial Neural Network, Convolutional Neura...",deep_learning_specialization


Job Data:


Unnamed: 0,Job Posting ID,Job Posting Date,Job Title,Job Title Full,Job Skills,Job Location,Company Name
0,2701524240,2017-01-01,Software Engineer,Software Engineer,"['database', 'javascript', 'agile', 'linux', '...",United States,"Cardinal Financial Company, Limited Partnership"
1,2719108338,2017-01-01,Data Engineer,Senior Azure Data Engineer,"['data_lake', 'cloud', 'python', 'spark', 'git...",United States,Brinks Home
2,2719503370,2017-01-01,Software Engineer,Software Engineer I,"['mongo', 'oracle', 'microsoft', 'css', 'javas...",United States,Paycor
3,2734877741,2017-01-01,Business Analyst,Associate Business Analyst,"['agile', 'excel']","Phoenix, AZ",Optum
4,2752415616,2017-01-01,Developer,Swift Developer,"['excel', 'back-end', 'ios', 'swift', 'program...","Richmond, CA",Toptal


### Data Merging

In [6]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import random

# Set random seed for reproducibility
random.seed(42)

# Load data
students_df = pd.read_csv('../Data/final_student_data.csv')
courses_df = pd.read_csv('../Data/final_course_data.csv')

print(f"Loaded {len(students_df)} students and {len(courses_df)} courses")

# Preprocess text columns
def clean_text(text):
    if not isinstance(text, str) or pd.isna(text):
        return ''
    text = ''.join([char for char in text.lower() if char.isalpha() or char.isspace()])
    tokens = text.split()
    return ' '.join(tokens)

students_df['skills_processed'] = students_df['Skills'].apply(clean_text)
students_df['career_goal_processed'] = students_df['Career_Goal'].apply(clean_text)
courses_df['skills_processed'] = courses_df['Skills'].apply(clean_text)
courses_df['category_processed'] = courses_df['Category'].apply(clean_text)

# Calculate how many courses per student we need to reach 3000 rows
target_dataset_size = 3000
courses_per_student = max(15, int(np.ceil(target_dataset_size / len(students_df))))
print(f"Generating approximately {courses_per_student} courses per student to reach target size")

# Randomized matching with increased courses per student
matches = []
vectorizer = TfidfVectorizer()

# Process each student
for _, student in students_df.iterrows():
    student_id = student['StudentId']
    student_text = f"{student['skills_processed']} {student['career_goal_processed']}"
    
    # Calculate similarity for all courses
    course_similarities = []
    for _, course in courses_df.iterrows():
        course_id = course['course_id']
        course_text = f"{course['skills_processed']} {course['category_processed']}"
        
        corpus = [student_text, course_text]
        try:
            tfidf_matrix = vectorizer.fit_transform(corpus)
            base_similarity = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
            
            # Add randomization factor (0-20% boost)
            random_factor = random.uniform(0, 0.2)
            adjusted_similarity = base_similarity + random_factor
            
            course_similarities.append({
                'course_id': course_id,
                'similarity_score': adjusted_similarity
            })
        except:
            continue
    
    # Sort by similarity
    course_similarities.sort(key=lambda x: x['similarity_score'], reverse=True)
    
    # Take top courses for each student
    top_courses = course_similarities[:min(courses_per_student, len(course_similarities))]
    
    # Add to matches
    for course in top_courses:
        matches.append({
            'student_id': student_id,
            'course_id': course['course_id'],
            'similarity_score': course['similarity_score']
        })

# Convert to DataFrame
matches_df = pd.DataFrame(matches)

# If we still don't have enough matches, duplicate some with slight variations
if len(matches_df) < target_dataset_size:
    needed_duplicates = target_dataset_size - len(matches_df)
    duplicates = matches_df.sample(needed_duplicates, replace=True)
    
    # Add small random variations to similarity scores
    duplicates['similarity_score'] = duplicates['similarity_score'].apply(
        lambda x: max(0, min(1, x + random.uniform(-0.05, 0.05)))
    )
    
    matches_df = pd.concat([matches_df, duplicates], ignore_index=True)

# Shuffle the final matches
matches_df = matches_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Create final dataset with all information
final_df = matches_df.merge(students_df, left_on='student_id', right_on='StudentId', how='left')
final_df = final_df.merge(courses_df, on='course_id', how='left')

# Save result
final_df.to_csv('../Data/student_course_3000.csv', index=False)
print(f"Created dataset with {len(final_df)} rows")

final_df.head()


Loaded 86 students and 1998 courses


Generating approximately 35 courses per student to reach target size
Created dataset with 3010 rows


Unnamed: 0,student_id,course_id,similarity_score,Branch,Percentage_10th,Percentage_12th,Skills_x,Career_Goal,StudentId,skills_processed_x,career_goal_processed,Title,Category,Course Type,Skills_y,skills_processed_y,category_processed
0,83,google_data_analytics_capstone_complete_a_case...,0.198736,MCA,75,65,"HTML, CSS, JavaScript, React, Node.js, Angular...",Web Developer,83,html css javascript react nodejs angular apis ...,web developer,Google Data Analytics Capstone: Complete a Cas...,Data Science,Course,"Job portfolio,Data Cleansing,Data Analysis,Dat...",job portfoliodata cleansingdata analysisdata v...,data science
1,17,1mindfulness_in_integrative_healthcare,0.198479,CSIT,85,75,"HTML, CSS, JavaScript, React, Node.js, Angular...",Web Developer,17,html css javascript react nodejs angular apis ...,web developer,Mindfulness in Integrative Healthcare,Health,Course,"wellbeing,improved symptom management,whole-pe...",wellbeingimproved symptom managementwholeperso...,health
2,83,web_design_for_everybody_basics_of_web_develop...,0.411329,MCA,75,65,"HTML, CSS, JavaScript, React, Node.js, Angular...",Web Developer,83,html css javascript react nodejs angular apis ...,web developer,Web Design for Everybody: Basics of Web Develo...,Computer Science,Specialization,"Web Development, Cascading Style Sheets (CSS),...",web development cascading style sheets css htm...,computer science
3,2,ethical_issues_in_data_science,0.613531,CSIT,85,95,"Python, R, Data Analysis, Machine Learning, SQ...",Data Scientist,2,python r data analysis machine learning sql da...,data scientist,Ethical Issues in Data Science,Data Science,Course,"Data Science,Ethics,Algorithms,Privacy,Philoso...",data scienceethicsalgorithmsprivacyphilosophy,data science
4,40,introduction_to_programming_with_python_and_ja...,0.219441,CSME,85,75,"Automation Systems, PLC Programming, Control S...",Automation Engineer,40,automation systems plc programming control sys...,automation engineer,Introduction to Programming with Python and Ja...,Computer Science,Specialization,"Programming Principles, Python Programming, Ja...",programming principles python programming java...,computer science
