In [208]:
from transformers import pipeline, AutoTokenizer, AutoModelForSequenceClassification
import torch
import numpy as np
import pandas as pd
import joblib
import isodate
from datetime import datetime

In [116]:
device1 = 0 if torch.cuda.is_available() else -1

In [117]:
tokenizer = AutoTokenizer.from_pretrained("shahrukhx01/question-vs-statement-classifier")

model = AutoModelForSequenceClassification.from_pretrained("shahrukhx01/question-vs-statement-classifier")



In [118]:
sentiment_pipeline = pipeline("text-classification", device=device1)

No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [119]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 256, padding_idx=0)
      (position_embeddings): Embedding(512, 256)
      (token_type_embeddings): Embedding(2, 256)
      (LayerNorm): LayerNorm((256,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-3): 4 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=256, out_features=256, bias=True)
              (key): Linear(in_features=256, out_features=256, bias=True)
              (value): Linear(in_features=256, out_features=256, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=256, out_features=256, bias=True)
              (LayerNorm): LayerNorm((256,), eps=1e-1

In [120]:
def classify_question_or_statement(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", max_length=512, truncation=True, padding=True)
    
    # Move inputs to the GPU (if available)
    inputs = {k: v.to(device) for k, v in inputs.items()}
    
    # Get the model predictions (logits)
    with torch.no_grad():
        outputs = model(**inputs)
    
    logits = outputs.logits
    
    # Apply softmax to get probabilities
    probabilities = torch.nn.functional.softmax(logits, dim=-1)
    
    # Get the predicted class (0 = statement, 1 = question)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    
    # Interpret the class
    if predicted_class == 1:
        return "Question"
    else:
        return "Statement"

In [121]:
text = "Is this a question?"
result = classify_question_or_statement(text)

In [122]:
result

'Question'

In [123]:
df = pd.read_pickle('pickling.pickle')

In [124]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,[I struggle to understand all commands in Pyth...,page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,[bhai mera to connect hi nhi ho rha cluster ke...,scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,[Limited Offer with Coupon Code: NEURALNINE\n5...,scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[I really wish you could speak better English.],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[Thank you for made this video ✨, Maa college ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [125]:
def keep_statements(comments):
    c = []
    for each in comments:
        if classify_question_or_statement(each) == "Statement":
            c.append(each)

    return c

In [126]:
keep_statements(df['comments'][0])

['I struggle to understand all commands in Python, however John has opened the door to me with his videos on scraping, Thank you John',
 'This is too difficult... my python terminal doesnt even recognize the first few commands. Once you get stuck as a beginner youre pretty much screwed if you dont have someone to help you.',
 'Excellent tutorial.',
 "I'm getting a 403 error after pasting the url",
 'Thanks, this was really useful',
 "Sir, I need your help, how can I contact you, it's related a project, I am trying for so many days tomorrow is last day 🙂",
 "Hi John, thanks for share your knowledge! I want to ask you if is it possible to use Scrapy Rule and pass a header to the request of the rule. I need to pass authorization credentials to connect with the API that I'm trying to scrap.\nMany thanks!",
 'ads are forbidden on my computer.',
 'Thank you a lot .......it is really amazing 💐💐',
 'Thank you John! Your explanation is very comprehensive. Great tutorial!',
 'love you',
 "that's

In [127]:
df['comments'] = df['comments'].apply(keep_statements)

In [128]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,[I struggle to understand all commands in Pyth...,page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,[bhai mera to connect hi nhi ho rha cluster ke...,scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,[Limited Offer with Coupon Code: NEURALNINE\n5...,scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[I really wish you could speak better English.],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[Thank you for made this video ✨, Maa college ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [129]:
def sentiment(comments):
    pos_neg_list = []
    for each in comments:
        each = each[:512]
        result = sentiment_pipeline(each)
        pos_neg_list.append(result[0]['label'])

    return pos_neg_list

In [130]:
df['comments'][0][1]

'This is too difficult... my python terminal doesnt even recognize the first few commands. Once you get stuck as a beginner youre pretty much screwed if you dont have someone to help you.'

In [131]:
sentiment(df['comments'][1])

['POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'POSITIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'NEGATIVE',
 'POSITIVE',

In [132]:
df['comments'] = df['comments'].apply(sentiment)

In [133]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[POSITIVE, NEGATIVE, POSITIVE, NEGATIVE, POSIT...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[POSITIVE, POSITIVE, POSITIVE, NEGATIVE, POSIT...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[NEGATIVE, NEGATIVE, POSITIVE, NEGATIVE, NEGAT...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[NEGATIVE],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[POSITIVE, NEGATIVE, POSITIVE, POSITIVE, POSIT...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [134]:
def labels_to_binary(labels):
    return [1 if label == 'POSITIVE' else 0 for label in labels]

In [135]:
labels_to_binary(df['comments'][0])

[1,
 0,
 1,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 1,
 1,
 1,
 1,
 0,
 1,
 1,
 1,
 1,
 0,
 0]

In [136]:
df['comments'] = df['comments'].apply(labels_to_binary)

In [137]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,..."


In [138]:
def calc_positive_ratio(comments):
    total = len(comments)
    if total != 0:
        pos_count = 0
        for each in comments:
            if each == 1:
                pos_count += 1
    
        return (pos_count / total)
    else:
        return 0

In [139]:
calc_positive_ratio(df['comments'][0])

0.6507936507936508

In [140]:
df['comment_ratio'] = df['comments'].apply(calc_positive_ratio)

In [141]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.650794
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.261905
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.541667
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.640449


## MinMax Scaling

0-5

$$v_i = \frac {(v_i - min_A)}{(max_A - min_A)} * (newMax_A - newMin_A) + newMin_A$$

In [142]:
col_max = max(df['comment_ratio'])
col_min = min(df['comment_ratio'])

In [143]:
col_max

1.0

In [144]:
col_min

0.0

In [145]:
def min_max(value):
    num = value - 0
    denom = col_max - col_min
    third = 5 - 0
    newMin = 0

    return round(((num / denom) * third) + newMin, 2)

In [146]:
df['comment_ratio'] = min_max(df['comment_ratio'])

In [147]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2


In [148]:
df.to_pickle('comments.pickle')

## Engagement Ratio

In [149]:
df['engagement_ratio'] = df['likes'] / df['views']

In [150]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,0.019001
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,0.021968
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,0.020503
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,0.04902
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.015343


MinMax this column as well

In [151]:
col_max = max(df['engagement_ratio'])
col_min = min(df['engagement_ratio'])

In [152]:
col_max

0.09189866613594476

In [153]:
col_min

0.0

In [None]:
def min_max(value):
    num = value - 0
    denom = col_max - col_min
    third = 5 - 0
    newMin = 0

    return round(((num / denom) * third) + newMin, 2)

In [155]:
df['engagement_ratio'] = min_max(df['engagement_ratio'])

In [156]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,1.03
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,1.2
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,1.12
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,2.67
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.83


Planning on a 60/40 split for comment/engagement.

In [176]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,1.03
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,1.2
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,1.12
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,2.67
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.83


In [177]:
df['final_rating'] = round(0.6 * df['comment_ratio'] + 0.4 * df['engagement_ratio'], 2)

In [178]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio,final_rating
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,1.03,2.36
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,1.2,1.27
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,1.12,2.07
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,2.67,1.07
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.83,2.25


In [183]:
col_max = max(df['final_rating'])

In [184]:
col_min = min(df['final_rating'])

In [186]:
col_max

3.73

In [187]:
col_min

0.0

In [188]:
def min_max(value):
    num = value - 0
    denom = col_max - col_min
    third = 5 - 0
    newMin = 0

    return round(((num / denom) * third) + newMin, 2)

In [192]:
df['final_rating'] = df['final_rating'].apply(min_max)

In [193]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio,final_rating
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,1.03,3.16
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,1.2,1.7
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,1.12,2.77
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,2.67,1.43
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.83,3.02


## Subject Classification

In [199]:
mnb = joblib.load('mnb_model.joblib')

In [202]:
mnb.predict(df['vectors'][0])

array(['de2'], dtype='<U6')

In [203]:
df['subject'] = df['vectors'].apply(lambda x: mnb.predict(x)[0])

In [204]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio,final_rating,subject
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,PT23M22S,2020-12-09T19:00:09Z,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,1.03,3.16,de2
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,PT50M3S,2023-06-21T11:30:04Z,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,1.2,1.7,fsd
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,PT34M31S,2022-11-23T13:30:12Z,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,1.12,2.77,de2
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,PT1H59M,2023-03-22T06:07:19Z,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,2.67,1.43,de2
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,PT10H32M7S,2022-03-06T18:16:41Z,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.83,3.02,pf


In [209]:
df['duration'] = df['duration'].apply(lambda x: isodate.parse_duration(x).seconds)

In [210]:
df['publisedAt'] = df['publisedAt'].apply(lambda x: datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ'))

In [211]:
df.head()

Unnamed: 0,videoId,title,description,views,likes,tag,url,duration,publisedAt,comments,combo,vectors,comment_ratio,engagement_ratio,final_rating,subject
0,s4jtkzHhLzY,Scrapy for Beginners - A Complete How To Examp...,discord beginners scrapy new python aimed tuto...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,1402,2020-12-09 19:00:09,"[1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, ...",page scraping python scrapy web multiple shell...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.25,1.03,3.16,de2
1,GogxAQ2JP4A,Web Scraping using Scrapy | Scrapy Tutorial + ...,"1 javascript student course c ++, min ]: css 4...",46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,3003,2023-06-21 11:30:04,"[1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, ...",scraping python 1 c web mongodb css min data j...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",1.31,1.2,1.7,fsd
2,m_3gjHGxIJc,Coding Web Crawler in Python with Scrapy,algorithm outro books proxy learn intro websit...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,2071,2022-11-23 13:30:12,"[0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, ...",scraping python web algorithm proxy programmin...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",2.71,1.12,2.77,de2
3,41opDqo1im8,Python Scrapy Tutorial for Beginners,beginners try introduction also newbies toward...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,7140,2023-03-22 06:07:19,[0],scraping python web try learning scrapy,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",0.0,2.67,1.43,de2
4,irqbmMNs2Bo,C Language Tutorial for Beginners (with Notes ...,00java live day -- complete class 5000 life in...,32553624,499466,C++ C++ coding C++ full course C++ placement c...,https://www.youtube.com/watch?v=irqbmMNs2Bo,37927,2022-03-06 18:16:41,"[1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, ...",code c full c++ language types alternate progr...,"[[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",3.2,0.83,3.02,pf


In [212]:
df.to_pickle('comments.pickle')

In [217]:
df.to_json('comments.json')

In [221]:
df.to_csv('comments.csv')

In [218]:
json_str = df.to_json()

In [219]:
print(json_str)

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [220]:
with open('output.json', 'w', encoding='utf-8') as f:
    df.to_json(f, force_ascii=False)