Joining the Required Tables

In [27]:
import pandas as pd
import os

# --- Read the two raw exports ---------------------------------------------
enrollments_df = pd.read_csv('enrollments_raw.csv')
completions_df = pd.read_csv('course_completions_raw.csv')

# --- Join: enrollments.id <-> completions.enrollmentId ---------------------
# No `how=` is passed, so pandas' default (inner) join applies: only rows
# that have a match on both sides are kept.
merged_df = enrollments_df.merge(
    completions_df,
    left_on='id',
    right_on='enrollmentId',
)

# --- Persist the joined table ----------------------------------------------
# Create the output folder first; exist_ok=True keeps re-runs from failing.
os.makedirs('datasets', exist_ok=True)
merged_df.to_csv('datasets/enrollments_with_completions.csv', index=False)

# Tell the reader where the file was written
print("Merged file saved to 'datasets/enrollments_with_completions.csv'")


Merged file saved to 'datasets/enrollments_with_completions.csv'


Grouping to Calculate the Average Score

In [28]:
import pandas as pd

# Read back the joined enrollments/completions table
df = pd.read_csv('datasets/enrollments_with_completions.csv')

# Keep only rows whose completionStatus is exactly 'Completed'
is_completed = df['completionStatus'].eq('Completed')
df_completed = df[is_completed]

# Mean score for every (userId, learningPathId) pair.
# reset_index(name=...) turns the grouped Series back into a flat frame and
# names the value column 'average_score' in the same step.
average_scores = (
    df_completed
    .groupby(['userId', 'learningPathId'])['score']
    .mean()
    .reset_index(name='average_score')
)

# Show the aggregated table
print(average_scores)

# Write the aggregated table next to the other derived datasets
average_scores.to_csv('datasets/user_learning_path_average_scores.csv', index=False)

# Tell the reader where the file was written
print("Average scores saved to 'datasets/user_learning_path_average_scores.csv'")


      userId  learningPathId  average_score
0    EMP0003             9.0          71.73
1    EMP0003            10.0          89.53
2    EMP0005             7.0          73.12
3    EMP0006             5.0          81.49
4    EMP0010             9.0          85.30
..       ...             ...            ...
261  EMP0983             3.0          67.12
262  EMP0984             1.0          68.16
263  EMP0991            10.0          74.47
264  EMP0992             4.0          63.12
265  EMP0998             3.0          84.52

[266 rows x 3 columns]
Average scores saved to 'datasets/user_learning_path_average_scores.csv'


In [34]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import LabelEncoder

# Read the per-user, per-learning-path average scores
data = pd.read_csv('datasets/user_learning_path_average_scores.csv')

# Peek at the first rows before anything is encoded
print(data.head())

# Replace userId and learningPathId with integer codes, one encoder per column.
# The encoders are kept in their own variables so the codes can be mapped back.
# Note: this overwrites the two columns of `data` in place.
label_encoder_user = LabelEncoder()
label_encoder_path = LabelEncoder()
for column, encoder in (
    ('userId', label_encoder_user),
    ('learningPathId', label_encoder_path),
):
    data[column] = encoder.fit_transform(data[column])

# Features: the two encoded id columns. Target: the average score.
X = data[['userId', 'learningPathId']]
y = data['average_score']

# Hold out 20% of the rows for evaluation; the seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Build and fit a 100-tree Random Forest on the training portion
rf_regressor = RandomForestRegressor(n_estimators=100, random_state=42)
rf_regressor.fit(X_train, y_train)

# Score the held-out rows
y_pred = rf_regressor.predict(X_test)

# Evaluation metrics on the held-out rows.
# NOTE(review): the recorded output of this cell shows a negative R²; a
# negative R² means the model does worse than always predicting the mean.
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse:.2f}')
print(f'R² Score: {r2:.2f}')

# Side-by-side view of actual vs. predicted values (keeps y_test's row index)
predictions_df = y_test.to_frame('Actual').assign(Predicted=y_pred)
print(predictions_df.head())


    userId  learningPathId  average_score
0  EMP0003             9.0          71.73
1  EMP0003            10.0          89.53
2  EMP0005             7.0          73.12
3  EMP0006             5.0          81.49
4  EMP0010             9.0          85.30
Mean Squared Error: 180.91
R² Score: -0.42
     Actual  Predicted
181   70.91    65.7652
119   70.47    75.2527
139   67.64    78.8045
216   73.58    70.9345
45    67.85    79.5074


Predicting the Learning Path

In [33]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Load the per-user, per-learning-path average scores
average_scores = pd.read_csv('datasets/user_learning_path_average_scores.csv')

# Prepare features and target.
# learningPathId is numeric once read from the CSV (values such as 9.0), and
# pd.get_dummies only encodes object/string/category columns unless the column
# is named explicitly. Without `columns=` the call returns the raw numeric id
# unchanged and no dummy variables are created, so name the column here.
X = pd.get_dummies(
    average_scores[['learningPathId']],
    columns=['learningPathId'],
    drop_first=True,
)
y = average_scores['average_score']

# Initialize and train the Random Forest Regressor.
# NOTE(review): the only feature is the learning path, and the model is scored
# below on the same rows it was trained on, so every user receives the same
# predicted score for a given path.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Predict a score for every (userId, learningPathId) row in the file
predictions = rf_model.predict(X)

# Attach the predictions to the identifying columns (new frame, no mutation)
predicted_scores = average_scores[['userId', 'learningPathId']].assign(
    predicted_average_score=predictions
)

# For each user, find the row label of their highest predicted score.
# Only paths that already appear for that user in the file are candidates.
best_rows = predicted_scores.groupby('userId')['predicted_average_score'].idxmax()

# Keep just the identifying columns, rename by column name rather than by
# position, and renumber the rows from zero
recommended_paths = (
    predicted_scores
    .loc[best_rows, ['userId', 'learningPathId']]
    .rename(columns={'learningPathId': 'recommended_learningPathId'})
    .reset_index(drop=True)
)

# Display the recommendations
print(recommended_paths)

# Save the output to a new CSV file
recommended_paths.to_csv('datasets/recommended_learning_paths.csv', index=False)

# Display confirmation message
print("Recommended learning paths saved to 'datasets/recommended_learning_paths.csv'")


      userId  recommended_learningPathId
0    EMP0003                        10.0
1    EMP0005                         7.0
2    EMP0006                         5.0
3    EMP0010                         9.0
4    EMP0019                        10.0
..       ...                         ...
234  EMP0983                         3.0
235  EMP0984                         1.0
236  EMP0991                        10.0
237  EMP0992                         4.0
238  EMP0998                         3.0

[239 rows x 2 columns]
Recommended learning paths saved to 'datasets/recommended_learning_paths.csv'
