In [2]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Load dataset
data = pd.read_csv('Online_Courses.csv')

# Select relevant columns
selected_columns = ['Title', 'Category', 'Sub-Category', 'Rating', 'Number of viewers', 'Duration']
data_filtered = data[selected_columns]

# Clean 'Rating' column: Remove non-numeric characters and convert to float
data_filtered['Rating'] = (
    data_filtered['Rating']
    .astype(str)
    .str.extract(r'([\d\.]+)')[0]
    .apply(pd.to_numeric, errors='coerce')
)

# Clean 'Number of viewers' column: Remove commas, extract numeric values, and convert to float
data_filtered['Number of viewers'] = (
    data_filtered['Number of viewers']
    .astype(str)
    .str.replace(',', '', regex=False)
    .str.extract(r'([\d]+)')[0]
    .apply(pd.to_numeric, errors='coerce')
)

# Clean 'Duration' column: Extract numeric values (e.g., hours or months) and handle errors
data_filtered['Duration'] = (
    data_filtered['Duration']
    .astype(str)
    .str.extract(r'([\d\.]+)')[0]
    .apply(pd.to_numeric, errors='coerce')
)

# Drop rows with missing or invalid data
data_filtered.dropna(subset=['Rating', 'Number of viewers', 'Duration'], inplace=True)

# Label Encoding for categorical columns
encoder = LabelEncoder()

# Encode 'Category' and 'Sub-Category' columns
data_filtered['Category'] = encoder.fit_transform(data_filtered['Category'].fillna('Unknown'))
data_filtered['Sub-Category'] = encoder.fit_transform(data_filtered['Sub-Category'].fillna('Unknown'))

# Define features and target variable
X = data_filtered.drop(columns=['Title', 'Number of viewers'])  # Features
y = data_filtered['Number of viewers']  # Target

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the regression model
regressor = RandomForestRegressor(random_state=42)
regressor.fit(X_train, y_train)

# Make predictions
y_pred = regressor.predict(X_test)

# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Mean Squared Error: {mse}")
print(f"Mean Absolute Error: {mae}")
print(f"R-squared Score: {r2}")

# Save the model
joblib.dump(regressor, 'course_viewers_regression_model.pkl')
print("Regression model saved as 'course_viewers_regression_model.pkl'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['Rating'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['Number of viewers'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_filtered['Duration'] = (
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: htt

Mean Squared Error: 321033231.9017358
Mean Absolute Error: 6873.85099699252
R-squared Score: -0.09723831472473887
Regression model saved as 'course_viewers_regression_model.pkl'


In [5]:
# Import required libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
import joblib

# Function to clean numeric columns
def clean_numeric_column(df, column_name, regex=r'([\d\.]+)'):
    """
    Cleans a numeric column by extracting numeric values using regex.

    Args:
        df (pd.DataFrame): The dataframe containing the column.
        column_name (str): The column to be cleaned.
        regex (str): The regex pattern to extract numeric values.

    Returns:
        pd.Series: Cleaned numeric column.
    """
    return (
        df[column_name]
        .astype(str)
        .str.extract(regex)[0]
        .apply(pd.to_numeric, errors='coerce')
    )

# Load dataset
data = pd.read_csv('Online_Courses.csv')

# Select relevant columns
selected_columns = ['Title', 'Category', 'Sub-Category', 'Rating', 'Number of viewers', 'Duration']
data_filtered = data[selected_columns].copy()

# Clean 'Rating' column
data_filtered.loc[:, 'Rating'] = clean_numeric_column(data_filtered, 'Rating')

# Clean 'Number of viewers' column
data_filtered.loc[:, 'Number of viewers'] = (
    clean_numeric_column(data_filtered, 'Number of viewers', regex=r'([\d,]+)')
    .astype(str)
    .str.replace(',', '', regex=False)
    .apply(pd.to_numeric, errors='coerce')
)

# Clean 'Duration' column
data_filtered.loc[:, 'Duration'] = clean_numeric_column(data_filtered, 'Duration')

# Drop rows with missing or invalid data
data_filtered.dropna(subset=['Rating', 'Number of viewers', 'Duration'], inplace=True)

# Encode categorical features
encoder = LabelEncoder()
data_filtered.loc[:, 'Category'] = encoder.fit_transform(data_filtered['Category'].fillna('Unknown'))
data_filtered.loc[:, 'Sub-Category'] = encoder.fit_transform(data_filtered['Sub-Category'].fillna('Unknown'))

# Define features and target variable
X = data_filtered.drop(columns=['Title', 'Number of viewers'])
y = data_filtered['Number of viewers']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a regression model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate model performance
mse = mean_squared_error(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print evaluation metrics
print("Model Performance:")
print(f"Mean Squared Error: {mse:.2f}")
print(f"Mean Absolute Error: {mae:.2f}")
print(f"R-squared Score: {r2:.2f}")

# Save the trained model
joblib.dump(model, 'course_viewers_regression_model.pkl')
print("Regression model saved as 'course_viewers_regression_model.pkl'")


Model Performance:
Mean Squared Error: 73674.15
Mean Absolute Error: 207.63
R-squared Score: -0.30
Regression model saved as 'course_viewers_regression_model.pkl'
