In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import classification_report, accuracy_score, mean_squared_error, r2_score

In [None]:
# Read the YouTube statistics CSV from the working directory into a DataFrame
df = pd.read_csv(filepath_or_buffer="youtube.csv")

In [None]:
# Preview the first five rows as plain text to sanity-check the column layout
print(df.head(n=5))

   category_id      views     likes  dislikes  comment_count  \
0           22   748374.0   57527.0      2966        15954.0   
1           24  2418783.0   97185.0      6146        12703.0   
2           23   681944.0  146033.0      5339         8181.0   
3           24   343168.0   10172.0       666         2146.0   
4           24  2095731.0  132235.0      1989        17518.0   

   comments_disabled  ratings_disabled  video_error_or_removed  
0                  0                 0                       0  
1                  0                 0                       0  
2                  0                 0                       0  
3                  0                 0                       0  
4                  0                 0                       0  


In [None]:
# Number of rows in the loaded dataset
df.shape[0]

40949

In [None]:
# Show the frame via the notebook's rich display; pandas elides the middle rows
# (the "..." row in the recorded output).
df

Unnamed: 0,category_id,views,likes,dislikes,comment_count,comments_disabled,ratings_disabled,video_error_or_removed
0,22,748374.0,57527.0,2966,15954.0,0,0,0
1,24,2418783.0,97185.0,6146,12703.0,0,0,0
2,23,681944.0,146033.0,5339,8181.0,0,0,0
3,24,343168.0,10172.0,666,2146.0,0,0,0
4,24,2095731.0,132235.0,1989,17518.0,0,0,0
...,...,...,...,...,...,...,...,...
40944,15,1685609.0,38160.0,1385,2657.0,0,0,0
40945,22,1064798.0,60008.0,382,3936.0,0,0,0
40946,24,1066451.0,48068.0,1032,3992.0,0,0,0
40947,1,5660813.0,192957.0,2846,1855.0,0,0,0


In [None]:
# Keep the first 1000 rows as a smaller working sample.
# NOTE(review): small_df is not referenced by any later cell visible here —
# confirm whether it is still needed.
small_df = df.iloc[:1000]

In [None]:
# Task 1: Preprocessing for Classification
# Target: the 0/1 'comments_disabled' flag.
# Inputs: the engagement counts plus the two remaining flag columns;
# 'category_id' is deliberately not part of the feature list.
# NOTE(review): 'comment_count' is presumably 0 whenever comments are disabled,
# which would leak the target into the features — confirm against the data.
classification_features = [
    'views',
    'likes',
    'dislikes',
    'comment_count',
    'ratings_disabled',
    'video_error_or_removed',
]

y_class = df['comments_disabled']
X_class = df[classification_features]


In [None]:
# Hold out 30% of the rows for testing; random_state=42 pins the shuffle so
# reruns produce the same split.
# NOTE(review): the split is not stratified even though positives are rare
# (198 of 12285 test rows in the recorded report) — consider stratify=y_class.
class_split = train_test_split(X_class, y_class, test_size=0.3, random_state=42)
X_class_train, X_class_test, y_class_train, y_class_test = class_split

In [None]:
# Fit the logistic-regression classifier on the training portion.
# NOTE(review): features are passed in unscaled although views/likes are
# orders of magnitude larger than the 0/1 flags (see the head() preview) —
# consider standardizing; TODO confirm the solver converges cleanly.
log_reg = LogisticRegression(random_state=42, max_iter=1000)
log_reg.fit(X=X_class_train, y=y_class_train)

In [None]:
# Predict the comments_disabled label for every held-out row
y_class_pred = log_reg.predict(X=X_class_test)

In [None]:
# Score the classifier on the held-out rows: per-class precision/recall/F1
# from the report, then overall accuracy.
class_report_text = classification_report(y_true=y_class_test, y_pred=y_class_pred)
class_accuracy = accuracy_score(y_true=y_class_test, y_pred=y_class_pred)

print("Classification Report (Logistic Regression):")
print(class_report_text)
print("Accuracy (Logistic Regression):", class_accuracy)


Classification Report (Logistic Regression):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00     12087
           1       0.76      0.95      0.85       198

    accuracy                           0.99     12285
   macro avg       0.88      0.97      0.92     12285
weighted avg       1.00      0.99      0.99     12285

Accuracy (Logistic Regression): 0.9943833943833944


In [None]:
# Task 2: Preprocessing for Regression
# Target: 'comment_count'.
# Inputs: the other engagement counts plus all three flag columns (including
# 'comments_disabled'); 'category_id' is not part of the feature list.
regression_features = [
    'views',
    'likes',
    'dislikes',
    'ratings_disabled',
    'video_error_or_removed',
    'comments_disabled',
]

y_reg = df['comment_count']
X_reg = df[regression_features]


In [None]:
# Hold out 30% of the rows for testing the regressor; random_state=42 keeps
# the split reproducible across reruns.
reg_split = train_test_split(X_reg, y_reg, test_size=0.3, random_state=42)
X_reg_train, X_reg_test, y_reg_train, y_reg_test = reg_split

In [None]:
# Fit an ordinary multiple linear regression (default settings) on the
# training portion.
lin_reg = LinearRegression()
lin_reg.fit(X=X_reg_train, y=y_reg_train)

In [None]:
# Predict comment_count for every held-out row
y_reg_pred = lin_reg.predict(X=X_reg_test)

In [None]:
# Score the regressor on the held-out rows. MSE is in squared comment-count
# units, so large values are expected for a target with counts in the thousands.
r2_linreg = r2_score(y_true=y_reg_test, y_pred=y_reg_pred)
mse_linreg = mean_squared_error(y_true=y_reg_test, y_pred=y_reg_pred)

In [None]:
# Report both regression metrics, one "label value" line each
for metric_label, metric_value in (
    ("Mean Squared Error (Multiple Linear Regression):", mse_linreg),
    ("R2 Score (Multiple Linear Regression):", r2_linreg),
):
    print(metric_label, metric_value)

Mean Squared Error (Multiple Linear Regression): 476964833.95216674
R2 Score (Multiple Linear Regression): 0.621709592385318


In [None]:
# Duplicate of the preceding metrics cell: prints the same MSE and R2 values again.
# NOTE(review): this cell looks redundant — consider deleting it.
print("Mean Squared Error (Multiple Linear Regression):", mse_linreg)
print("R2 Score (Multiple Linear Regression):", r2_linreg)

Mean Squared Error (Multiple Linear Regression): 476964833.95216674
R2 Score (Multiple Linear Regression): 0.621709592385318


In [None]:
# Another repeat of the regression-metrics printout; the values are unchanged
# from the earlier cells.
# NOTE(review): this cell looks redundant — consider deleting it.
print("Mean Squared Error (Multiple Linear Regression):", mse_linreg)
print("R2 Score (Multiple Linear Regression):", r2_linreg)


Mean Squared Error (Multiple Linear Regression): 476964833.95216674
R2 Score (Multiple Linear Regression): 0.621709592385318


In [None]:
# Conclusion and Notes
# Collect the headline numbers (rounded to two decimals) and emit them in one go.
summary_accuracy = accuracy_score(y_class_test, y_class_pred)
summary_lines = [
    "\n--- Summary ---",
    f"Classification Accuracy: {summary_accuracy:.2f}",
    f"Regression MSE: {mse_linreg:.2f}",
    f"Regression R2: {r2_linreg:.2f}",
]
print("\n".join(summary_lines))


--- Summary ---
Classification Accuracy: 0.99
Regression MSE: 476964833.95
Regression R2: 0.62


Objectives:
Regression task: predict the comment count for a given video. A classification task (predicting whether comments are disabled) is also carried out.


Model Choice:
Logistic Regression: Predicts whether comments are disabled (comments_disabled).
Multiple Linear Regression: Predicts the number of comments (comment_count).



Explanation of the Code:
1.	Data Loading:
The dataset is loaded from youtube.csv into a DataFrame using pd.read_csv.
2.	Classification Task:
We use Logistic Regression to predict whether the comments are disabled (comments_disabled). The features are views, likes, dislikes, comment_count, ratings_disabled, and video_error_or_removed. We split the data 70/30 into training and testing sets and evaluate the model using accuracy and the per-class precision, recall, and F1 score from the classification report.
3.	Regression Task:
We use Multiple Linear Regression to predict the comment_count using features like views, likes, dislikes, etc. The model is evaluated using Mean Squared Error (MSE) and R² Score.
4.	Summary:
The code outputs the accuracy and F1 score for the classification model, and MSE and R² score for the regression model, followed by a summary.


Performance Analysis and Conclusion

Classification: Logistic Regression
•	Accuracy: Measures the percentage of correctly classified instances. A higher value indicates better performance.
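•	F1 Score: Provides a balance between precision and recall, especially useful when classes are imbalanced, as they are here (only 198 of the 12,285 test videos have comments disabled, so accuracy alone is dominated by the majority class).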
Regression: Multiple Linear Regression
•	Mean Squared Error (MSE): Measures the average squared difference between the predicted and actual values. A lower value indicates better performance.
•	R² Score: Measures how well the model explains the variance in the data. A value closer to 1 means better performance.
Comparison and Conclusion:
•	Logistic Regression: This model performs well for binary classification tasks, such as predicting whether comments are disabled, and is easy to interpret.
•	Multiple Linear Regression: Although simple, this model serves as a reasonable baseline for predicting continuous variables like comment_count (R² ≈ 0.62 on the test set). However, if the relationship between features and the target is non-linear, this model might not perform as well as more advanced models like Random Forest.



Recommendation:
•	For the classification task, Logistic Regression is the preferred model due to its simplicity, ease of use, and good performance.
•	For the regression task, Multiple Linear Regression works well as a baseline model. If more complex relationships are found, more sophisticated models could be considered.
