In [1]:
pip install pandas scikit-learn joblib


Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl.metadata (15 kB)
Collecting joblib
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp310-cp310-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
   ---------------------------------------- 0.0/11.1 MB 393.8 kB/s eta 0:00:29
   ---------------------------------------- 0.1/11.1 MB 573.4 kB/s eta 0:00:20
   ---------------------------------------- 0.1/11.1 MB 654.9 kB/s eta 0:00:17
   ---------------------------------------- 0.1/11.1 MB 654.9 kB/s eta 0:00:17
   --------------------------------

In [13]:

# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder
import joblib

# Function to clean numeric columns
def clean_numeric_column(df, column_name, regex_pattern):
    df[column_name] = (
        df[column_name]
        .astype(str)  # Ensure it's a string
        .str.extract(regex_pattern)  # Extract numeric part
        .astype(float)  # Convert to float
    )

# Load dataset
data = pd.read_csv('Online_Courses.csv')

# Select relevant columns
selected_columns = ['Title', 'Category', 'Sub-Category', 'Rating', 'Number of viewers', 'Duration']
data_filtered = data[selected_columns].copy()  # Use copy to avoid SettingWithCopyWarning

# Clean 'Rating' column: Remove non-numeric characters and convert to float
clean_numeric_column(data_filtered, 'Rating', r'([\d\.]+)')

# Clean 'Number of viewers' column: Remove commas, extract numeric values, and convert to float
data_filtered['Number of viewers'] = (
    data_filtered['Number of viewers']
    .astype(str)  # Ensure it's a string
    .str.replace(',', '', regex=False)  # Remove commas
    .str.extract(r'([\d]+)')  # Extract numeric values
    .astype(float)  # Convert to float
)

# Clean 'Duration' column: Extract numeric values and convert to float
clean_numeric_column(data_filtered, 'Duration', r'(\d+\.?\d*)')

# Drop rows with missing or invalid data
data_filtered.dropna(subset=['Rating', 'Number of viewers', 'Duration'], inplace=True)

# Create the target variable (Popular based on median viewers and ratings >= 4)
median_viewers = data_filtered['Number of viewers'].median()
data_filtered['Popular'] = (
    (data_filtered['Rating'] >= 4) & 
    (data_filtered['Number of viewers'] >= median_viewers)
).astype(int)

# Label Encoding for categorical columns
encoder = LabelEncoder()
for column in ['Category', 'Sub-Category']:
    data_filtered[column] = encoder.fit_transform(data_filtered[column].fillna('Unknown'))

# Split data into train and test sets
X = data_filtered.drop(columns=['Title', 'Popular'])  # Features
y = data_filtered['Popular']  # Target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Cross-validation for better performance evaluation
cv_scores = cross_val_score(model, X, y, cv=5)
print(f"Cross-validated accuracy: {cv_scores.mean()}")

# Evaluate the model
y_pred = model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(classification_report(y_test, y_pred))

# Save the model
joblib.dump(model, 'course_popularity_model.pkl')
print("Model saved as 'course_popularity_model.pkl'")


Cross-validated accuracy: 0.9996289424860854
Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       278
           1       1.00      1.00      1.00       261

    accuracy                           1.00       539
   macro avg       1.00      1.00      1.00       539
weighted avg       1.00      1.00      1.00       539

Model saved as 'course_popularity_model.pkl'
