In [983]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
import nltk
from rake_nltk import Rake

%matplotlib inline

In [984]:
subjects = pd.read_json('subjects.syllabus.json')

In [985]:
subjects = subjects.drop('_id', axis=1)

In [986]:
subjects

Unnamed: 0,title,syllabus
0,chem,[Chemistry in Day- Today Life: Overview of rol...
1,cn,[Fundamentals of networking and data communica...
2,DBMS,"[DBMS Vs File Systems, Database System Archite..."
3,dmgt,"[Set Theory: Sets, Combinations of sets, Venn ..."
4,FDS,"[Introduction to Data Structures: Data, Data O..."
5,mech,[Introduction and Operations with Forces: Intr...
6,oop,[Introduction to Object Oriented Programming (...
7,OS,[Operating System objectives and its evolution...
8,aies,[Introduction to Artificial Intelligence and S...
9,bdt,[Fundamentals and Basics of Big Data:Introduct...


## Convert list to concat strings

In [987]:
" ".join(subjects.syllabus[0])

'Chemistry in Day- Today Life: Overview of role of Chemistry in our daily life Water Technology and Phase Rule: Importance of potable water Specifications for drinking water Water analysis â€“ hardness alkalinity chloride and dissolved oxygen Ill effects of hard water \nin boilers Boiler feed water treatments- Zeolite and Ion exchange numerical Desalination techniques reverse osmosis and electro-dialysis Phase rule -one component system- Water Applications and limitations of Phase rule Advanced Electrochemistry-Nernst equation Electrodes Primary and secondary batteries Fuel Cells â€“ Definition advantages and limitations Types of Fuel cells-AFC SOFC PEMFC and PAFC supercapacitors Corrosion Science: Types of corrosion- Dry corrosion- mechanism Pilling-Bedworth rule Wet corrosion - mechanism factors influencing corrosion Types of corrosion-stress corrosion pitting corrosion crevice corrosion intergranular corrosion water corrosion Methods of corrosion control- Cathodic protection and ano

In [988]:
subjects.syllabus = subjects.syllabus.apply(lambda x: " ".join(x))

## Normalize to lowercase

In [989]:
subjects.syllabus = subjects.syllabus.apply(lambda x: x.lower())

In [990]:
subjects.title = subjects.title.apply(lambda x: x.lower())

In [991]:
subjects

Unnamed: 0,title,syllabus
0,chem,chemistry in day- today life: overview of role...
1,cn,fundamentals of networking and data communicat...
2,dbms,dbms vs file systems database system architect...
3,dmgt,set theory: sets combinations of sets venn dia...
4,fds,introduction to data structures: data data obj...
5,mech,introduction and operations with forces: intro...
6,oop,introduction to object oriented programming (o...
7,os,operating system objectives and its evolution ...
8,aies,introduction to artificial intelligence and se...
9,bdt,fundamentals and basics of big data:introducti...


## Perform POS Tagging

In [992]:
nlp = spacy.load('en_core_web_sm')

In [993]:
chem_doc = nlp(subjects.syllabus[0])

In [994]:
chem_doc

chemistry in day- today life: overview of role of chemistry in our daily life water technology and phase rule: importance of potable water specifications for drinking water water analysis â€“ hardness alkalinity chloride and dissolved oxygen ill effects of hard water 
in boilers boiler feed water treatments- zeolite and ion exchange numerical desalination techniques reverse osmosis and electro-dialysis phase rule -one component system- water applications and limitations of phase rule advanced electrochemistry-nernst equation electrodes primary and secondary batteries fuel cells â€“ definition advantages and limitations types of fuel cells-afc sofc pemfc and pafc supercapacitors corrosion science: types of corrosion- dry corrosion- mechanism pilling-bedworth rule wet corrosion - mechanism factors influencing corrosion types of corrosion-stress corrosion pitting corrosion crevice corrosion intergranular corrosion water corrosion methods of corrosion control- cathodic protection and anodi

In [995]:
chem_doc[0].pos_

'NOUN'

In [996]:
chem_doc[1].pos_

'ADP'

- NOUN
- PROPN
- ADJ
- VERB

In [997]:
unique_pos = set()
for word in chem_doc:
    unique_pos.add(word.pos_)

In [998]:
unique_pos

{'ADJ',
 'ADP',
 'CCONJ',
 'NOUN',
 'NUM',
 'PRON',
 'PROPN',
 'PUNCT',
 'SPACE',
 'VERB',
 'X'}

In [999]:
chem_doc

chemistry in day- today life: overview of role of chemistry in our daily life water technology and phase rule: importance of potable water specifications for drinking water water analysis â€“ hardness alkalinity chloride and dissolved oxygen ill effects of hard water 
in boilers boiler feed water treatments- zeolite and ion exchange numerical desalination techniques reverse osmosis and electro-dialysis phase rule -one component system- water applications and limitations of phase rule advanced electrochemistry-nernst equation electrodes primary and secondary batteries fuel cells â€“ definition advantages and limitations types of fuel cells-afc sofc pemfc and pafc supercapacitors corrosion science: types of corrosion- dry corrosion- mechanism pilling-bedworth rule wet corrosion - mechanism factors influencing corrosion types of corrosion-stress corrosion pitting corrosion crevice corrosion intergranular corrosion water corrosion methods of corrosion control- cathodic protection and anodi

In [1000]:
for word in chem_doc:
    if word.pos_ == 'PRON':
        print(word)

our
their


In [1001]:
useful_pos = ['NOUN', 'PROPN', 'ADJ', 'VERB']

In [1002]:
pos_tagged = [word for word in chem_doc if word.pos_ in useful_pos]

In [1003]:
pos_tagged[0].orth_

'chemistry'

In [1004]:
pos_tagged_str = []
for each in pos_tagged:
    pos_tagged_str.append(each.orth_)

In [1005]:
chem_text = " ".join(pos_tagged_str)

In [1006]:
chem_text

'chemistry day- today life overview role chemistry daily life water technology phase rule importance potable water specifications drinking water water analysis hardness alkalinity chloride dissolved oxygen ill effects hard water boilers boiler feed water treatments- zeolite ion exchange numerical desalination techniques reverse osmosis electro dialysis phase rule component system- water applications limitations phase rule advanced electrochemistry nernst equation electrodes primary secondary batteries fuel cells definition advantages limitations types fuel cells afc sofc pemfc pafc supercapacitors corrosion science types dry corrosion- mechanism pilling bedworth rule wet corrosion mechanism factors influencing corrosion types corrosion stress corrosion pitting corrosion crevice corrosion intergranular corrosion water corrosion methods corrosion control- cathodic protection anodic protective coatings- metallic coatings nonmetallic coatings powder coating plasma spraying method electrost

## Lemmetization

In [1007]:
from nltk.stem import WordNetLemmatizer

In [1008]:
lemmatizer = WordNetLemmatizer()

In [1009]:
lemmatized_words = [lemmatizer.lemmatize(word) for word in pos_tagged_str]

### Remove duplicates

In [1010]:
lemmatized_words = set(lemmatized_words)

In [1011]:
chem_text = " ".join(lemmatized_words)

In [1012]:
chem_text

'diesel solid epoxy thermosetting acrylonitrile day- alkalinity smart certain uv calorific limitation thermoplastic wet biodegradable classification butadiene definition potable nernst overview powder advantage ultimate method specialty hydrogenation visible spraying electrochemistry day cell pemfc recycling crevice transition system- dry pathway gas ion catalyst quality styrene specification addition resin battery ill reaction oxygen natural crystallinity sofc compounding gaseous hard petroleum treatments- equation boiler based solvent daily role electrode functionality technique absorption coatings- tg biodiesel application traditional analysis science influencing principle petrol liquefied chemical life metallic value primary composition required afc refining importance intergranular pitting chemistry coating dialysis cathodic polypropylene plastic rule nonmetallic hydrogen bedworth coal dissolved combustion numerical chloride reverse bio mechanism today monomer protective conductin

## Keyword Extraction

In [1013]:
rake = Rake(include_repeated_phrases=False)

In [1014]:
rake.extract_keywords_from_text(chem_text)

In [1015]:
keywords = rake.get_ranked_phrases()

In [1016]:
empty = ""
for keyword in keywords:
    print(keyword, "\n")
    empty += keyword

tg biodiesel application traditional analysis science influencing principle petrol liquefied chemical life metallic value primary composition required afc refining importance intergranular pitting chemistry coating dialysis cathodic polypropylene plastic rule nonmetallic hydrogen bedworth coal dissolved combustion numerical chloride reverse bio mechanism today monomer protective conducting jet air goal ibuprofen fieser corrosion liquid factor anodic calorimeter polymers 

alkalinity smart certain uv calorific limitation thermoplastic wet biodegradable classification butadiene definition potable nernst overview powder advantage ultimate method specialty hydrogenation visible spraying electrochemistry day cell pemfc recycling crevice transition system 

plasma future fuel advanced efficiency calculation hardness pilling polymerization thermal water secondary synthesis spectroscopy industrial stress polyurethane technology bomb parameter acid instrumentation electrostatic corrosion 

dry 

In [1017]:
empty

'tg biodiesel application traditional analysis science influencing principle petrol liquefied chemical life metallic value primary composition required afc refining importance intergranular pitting chemistry coating dialysis cathodic polypropylene plastic rule nonmetallic hydrogen bedworth coal dissolved combustion numerical chloride reverse bio mechanism today monomer protective conducting jet air goal ibuprofen fieser corrosion liquid factor anodic calorimeter polymersalkalinity smart certain uv calorific limitation thermoplastic wet biodegradable classification butadiene definition potable nernst overview powder advantage ultimate method specialty hydrogenation visible spraying electrochemistry day cell pemfc recycling crevice transition systemplasma future fuel advanced efficiency calculation hardness pilling polymerization thermal water secondary synthesis spectroscopy industrial stress polyurethane technology bomb parameter acid instrumentation electrostatic corrosiondry pathway 

## Preprocess Syllabus

In [1018]:
useful_pos = ['NOUN', 'PROPN', 'ADJ', 'VERB']
def preprocess_syllabus(syllabus):
    #print(syllabus)
    syllabus = "".join(syllabus)
    #print(syllabus)
    syllabus = syllabus.lower()
    doc = nlp(syllabus)
    tagged = [word for word in doc if word.pos_ in useful_pos]
    tagged_str = []
    for each in tagged:
        tagged_str.append(each.orth_)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in tagged_str]
    lemmatized_words = set(lemmatized_words)
    subject_text = " ".join(lemmatized_words)
    #print(subject_text)
    rake = Rake(include_repeated_phrases=False)
    rake.extract_keywords_from_text(subject_text)
    keywords = rake.get_ranked_phrases()
    #print(keywords)
    final = ""
    for keyword in keywords:
        final += keyword
    return final

In [1079]:
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [1082]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sadam\.conda\envs\spark\lib\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sadam\.conda\envs\spark\lib\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [1092]:
from nltk.stem import WordNetLemmatizer
lem = WordNetLemmatizer()

corpus = []
for i in range(0,len(subjects['syllabus'])):
    review = re.sub('[^a-zA-Z0-9]',' ', subjects['syllabus'][i])
    review = review.lower()
    review = review.split()

    review = [lem.lemmatize(word) for word in review if word not in stopwords.words('english') or word in ['not','never']]
    review = ' '.join(review)
    corpus.append(review)

In [1093]:
subjects['processed_syllabus'] = pd.DataFrame(corpus)

In [1094]:
subjects

Unnamed: 0,title,syllabus,extracted_keywords,processed_syllabus
0,chem,chemistry in day- today life: overview of role...,tg biodiesel application traditional analysis ...,chemistry day today life overview role chemist...
1,cn,fundamentals of networking and data communicat...,domain ftp control snmp cyclic window digital ...,fundamental networking data communication anal...
2,dbms,dbms vs file systems database system architect...,domain 3nf deleting control shadow failure par...,dbms v file system database system architectur...
3,dmgt,set theory: sets combinations of sets venn dia...,extended pigeonhole eulerian inverse shortest ...,set theory set combination set venn diagram fi...
4,fds,introduction to data structures: data data obj...,case finding order major algorithm notation in...,introduction data structure data data object a...
5,mech,introduction and operations with forces: intro...,k composition kinematics velocity applicable a...,introduction operation force introduction engi...
6,oop,introduction to object oriented programming (o...,ambiguity io algorithm introduction access var...,introduction object oriented programming oop i...
7,os,operating system objectives and its evolution ...,reference consumer fifo partitioning algorithm...,operating system objective evolution operating...
8,aies,introduction to artificial intelligence and se...,uncertain application mycin set machine minima...,introduction artificial intelligence search st...
9,bdt,fundamentals and basics of big data:introducti...,modeling j ecos read analytics theorem circula...,fundamental basic big data introduction big da...


In [1120]:
subjects['extracted_keywords'][13]

'sop combinational computer key binary application set machine cisc twisted hierarchy multiplication block cpu shift principle sequential k memory risc bit cascading signed data system subractor processor ripple 1â € bus po gate half sign full diagramcache decoder pipelining hardwired user parallel digital characteristic counter subtraction map algebra modulus programmed representation concept conversion look instruction flip fundamental structure study visible mode subtractor cell unsigned microregister addressing adder type von alu number implementing architecture element ia status minimization cycle using boolean hazard ™ harvard complement organization generator logic design synchronousunit division function ring flow organizationcircuit introduction neumann flop addition operationscase arithmetic systemmagnitude multiplexer cyclememorycontrol carry'

In [1019]:
preprocess_syllabus(subjects['syllabus'][0])

'tg biodiesel application traditional analysis science influencing principle petrol liquefied chemical life metallic value primary composition required afc refining importance intergranular pitting chemistry coating dialysis cathodic polypropylene plastic rule nonmetallic hydrogen bedworth coal dissolved combustion numerical chloride reverse bio mechanism today monomer protective conducting jet air goal ibuprofen fieser corrosion liquid factor anodic calorimeter polymersalkalinity smart certain uv calorific limitation thermoplastic wet biodegradable classification butadiene definition potable nernst overview powder advantage ultimate method specialty hydrogenation visible spraying electrochemistry day cell pemfc recycling crevice transition systemplasma future fuel advanced efficiency calculation hardness pilling polymerization thermal water secondary synthesis spectroscopy industrial stress polyurethane technology bomb parameter acid instrumentation electrostatic corrosiondry pathway 

In [1020]:
subjects['extracted_keywords'] = subjects['syllabus'].apply(preprocess_syllabus)

In [1021]:
subjects

Unnamed: 0,title,syllabus,extracted_keywords
0,chem,chemistry in day- today life: overview of role...,tg biodiesel application traditional analysis ...
1,cn,fundamentals of networking and data communicat...,domain ftp control snmp cyclic window digital ...
2,dbms,dbms vs file systems database system architect...,domain 3nf deleting control shadow failure par...
3,dmgt,set theory: sets combinations of sets venn dia...,extended pigeonhole eulerian inverse shortest ...
4,fds,introduction to data structures: data data obj...,case finding order major algorithm notation in...
5,mech,introduction and operations with forces: intro...,k composition kinematics velocity applicable a...
6,oop,introduction to object oriented programming (o...,ambiguity io algorithm introduction access var...
7,os,operating system objectives and its evolution ...,reference consumer fifo partitioning algorithm...
8,aies,introduction to artificial intelligence and se...,uncertain application mycin set machine minima...
9,bdt,fundamentals and basics of big data:introducti...,modeling j ecos read analytics theorem circula...


In [1022]:
subjects.to_json('preprocessed_subjects.json')

In [1023]:
subjects

Unnamed: 0,title,syllabus,extracted_keywords
0,chem,chemistry in day- today life: overview of role...,tg biodiesel application traditional analysis ...
1,cn,fundamentals of networking and data communicat...,domain ftp control snmp cyclic window digital ...
2,dbms,dbms vs file systems database system architect...,domain 3nf deleting control shadow failure par...
3,dmgt,set theory: sets combinations of sets venn dia...,extended pigeonhole eulerian inverse shortest ...
4,fds,introduction to data structures: data data obj...,case finding order major algorithm notation in...
5,mech,introduction and operations with forces: intro...,k composition kinematics velocity applicable a...
6,oop,introduction to object oriented programming (o...,ambiguity io algorithm introduction access var...
7,os,operating system objectives and its evolution ...,reference consumer fifo partitioning algorithm...
8,aies,introduction to artificial intelligence and se...,uncertain application mycin set machine minima...
9,bdt,fundamentals and basics of big data:introducti...,modeling j ecos read analytics theorem circula...


In [1124]:
subjects['extracted_keywords'][13]

'sop combinational computer key binary application set machine cisc twisted hierarchy multiplication block cpu shift principle sequential k memory risc bit cascading signed data system subractor processor ripple 1â € bus po gate half sign full diagramcache decoder pipelining hardwired user parallel digital characteristic counter subtraction map algebra modulus programmed representation concept conversion look instruction flip fundamental structure study visible mode subtractor cell unsigned microregister addressing adder type von alu number implementing architecture element ia status minimization cycle using boolean hazard ™ harvard complement organization generator logic design synchronousunit division function ring flow organizationcircuit introduction neumann flop addition operationscase arithmetic systemmagnitude multiplexer cyclememorycontrol carry'

## Model

In [1111]:
subjects

Unnamed: 0,title,syllabus,extracted_keywords,processed_syllabus
0,chem,chemistry in day- today life: overview of role...,tg biodiesel application traditional analysis ...,chemistry day today life overview role chemist...
1,cn,fundamentals of networking and data communicat...,domain ftp control snmp cyclic window digital ...,fundamental networking data communication anal...
2,dbms,dbms vs file systems database system architect...,domain 3nf deleting control shadow failure par...,dbms v file system database system architectur...
3,dmgt,set theory: sets combinations of sets venn dia...,extended pigeonhole eulerian inverse shortest ...,set theory set combination set venn diagram fi...
4,fds,introduction to data structures: data data obj...,case finding order major algorithm notation in...,introduction data structure data data object a...
5,mech,introduction and operations with forces: intro...,k composition kinematics velocity applicable a...,introduction operation force introduction engi...
6,oop,introduction to object oriented programming (o...,ambiguity io algorithm introduction access var...,introduction object oriented programming oop i...
7,os,operating system objectives and its evolution ...,reference consumer fifo partitioning algorithm...,operating system objective evolution operating...
8,aies,introduction to artificial intelligence and se...,uncertain application mycin set machine minima...,introduction artificial intelligence search st...
9,bdt,fundamentals and basics of big data:introducti...,modeling j ecos read analytics theorem circula...,fundamental basic big data introduction big da...


In [1112]:
subjects['extracted_keywords'][11]

'node express php xml mern jquery react j ajax bootstrap html dom wordpress mongo api cs'

In [1088]:
X = subjects['processed_syllabus']
y = subjects['title']

In [1095]:
yt = pd.read_json('yt_preprocessed.json')

In [1096]:
yt['combo'][1]

'Scrapy for Beginners - A Complete How To Example Web Scraping Project # DISCORD : Scrapy for Beginners! This python tutorial is aimed at people new to Scrapy. We cover crawling with a basic spider an create a complete tutorial project, including exporting to a JSON file. We scrape products from a online shop and get names and prices. Learn how to use the Scrapy shell to parse the data, and get text and "href" attributes from the HTML, as well as scraping multiple pages. This is a full how to from start to finish for your first Scrapy spider project, all in Python 3.code: # Proxies: # Patreon:  # The Scraper API I use: # Donate: # Hosting: Digital Ocean  - # Gear Used: DISCLAIMER This contains affiliate links. If you use these links to buy something we may earn a commission.'

In [1028]:
yt

Unnamed: 0,title,description,combo,views,likes,tag,url,duration,publisedAt
0,scrapy course – python web scraping for beginners,The Scrapy Beginners Course will teach you eve...,serhiy first agustn website start extraction w...,436714,9886,,https://www.youtube.com/watch?v=mBoX_JCKZTE,16629,1682606288000
1,scrapy for beginners - a complete how to examp...,# DISCORD (NEW): https://discord.gg/C4J2uckpbR...,hosting contains complete people multiple fini...,267940,5091,scrapy python tutorial scrapy crawlspider scra...,https://www.youtube.com/watch?v=s4jtkzHhLzY,1402,1607540409000
2,web scraping using scrapy | scrapy tutorial + ...,Try Atlas: https://www.mongodb.com/cloud/atlas...,1 html read java javascript hr ]: complete scr...,46067,1012,,https://www.youtube.com/watch?v=GogxAQ2JP4A,3003,1687347004000
3,coding web crawler in python with scrapy,Today we learn how to build a professional web...,code professional discord coding bible scrapin...,111544,2287,python web crawler web crawling python web cra...,https://www.youtube.com/watch?v=m_3gjHGxIJc,2071,1669210212000
4,python scrapy tutorial for beginners,This Python Scrapy tutorial for beginners that...,basic first find scope claim information glanc...,102,5,python scrapy tutorial for beginners python sc...,https://www.youtube.com/watch?v=41opDqo1im8,7140,1679465239000
...,...,...,...,...,...,...,...,...,...
595,standard template library (stl) in c++ | intr...,Standard Template Library (STL) mainly compose...,49 library video related c ++ iterators standa...,52898,644,stl in c++ stl in c++ tutorial stl in c++ tuto...,https://www.youtube.com/watch?v=fmKNmoZxiVU,424,1516724749000
596,the c++ standard template library (stl) | c++ ...,The Standard Template Library (STL) is a set o...,best click javascript website oriented course ...,252714,9881,,https://www.youtube.com/watch?v=c9iREsYpayk,889,1599896391000
597,stl templates in c++ - generic functions and c...,"In this video, we will learn about STL templat...",44 related start videoadd use text course powe...,16903,643,c++ stl templates what are c++ templates c++ s...,https://www.youtube.com/watch?v=-Svq5IYPWbc,1147,1709046132000
598,c++ templates: must for competitive programmin...,"Download the best IDE for C, C# and C++: https...",way hosting competitive science english comple...,364110,11354,,https://www.youtube.com/watch?v=kKJeekDKU30,796,1599489366000


### Vectorizer

1. BOW
2. n-grams
3. Word2Vec

### Classifier

1. Naive Bayes
2. Random Forest

## 1. BOW

In [1030]:
from sklearn.feature_extraction.text import CountVectorizer

In [1031]:
bow = CountVectorizer()

Fit transform on the keywords

In [1032]:
X

0     tg biodiesel application traditional analysis ...
1     domain ftp control snmp cyclic window digital ...
2     domain 3nf deleting control shadow failure par...
3     extended pigeonhole eulerian inverse shortest ...
4     case finding order major algorithm notation in...
5     k composition kinematics velocity applicable a...
6     ambiguity io algorithm introduction access var...
7     reference consumer fifo partitioning algorithm...
8     uncertain application mycin set machine minima...
9     modeling j ecos read analytics theorem circula...
10    qualitative modeling implementation discretiza...
11    node express php xml mern jquery react j ajax ...
12    administrative cia anonymizers control symmetr...
13    sop combinational computer key binary applicat...
14    stage task cache branch pipelining handling sf...
15    queue direct trie b traversal perfect dikjtras...
16    encoding direct recursive counter mathematical...
17    exponential testing poisson f statistic th

In [1033]:
vectors = bow.fit_transform(X).toarray()

Next, transform on the video keywords

In [1034]:
yt['combo'][0]

'serhiy first agustn website start extraction web agent conclusion proxy course header jedi joe wcislo deploying part maneerat run crawling introduction step plan fake doc user scheduling learn github comparison kalinets cleaning database hundred resource smartproxy project scrapy build scheduler monitor file python setup developer justin job thanks data free programming sith code read mysql scraping aggregator cloud bannedkussrow csv morgan next virtual postgres supporter pipeline scraper beginner get tool guide getting scrapyd hual environmentsenv avoid playbook browser article item resourcessponsor davthecoder nattira apis teach proxiesdiscovery rotating champion kearney scrapydweb databasesneed scrapycreating saving heather scale blockedcreated content scrapeops spider usingotis spiderpipelines'

In [1035]:
yt['title'][0]

'scrapy course – python web scraping for beginners'

In [1036]:
video_vector = bow.transform([yt['title'][0]])

In [1037]:
video_vector

<1x1329 sparse matrix of type '<class 'numpy.int64'>'
	with 0 stored elements in Compressed Sparse Row format>

In [1038]:
video_vector.toarray()

array([[0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [1058]:
video_vectors = bow.transform(yt['title'])

In [1040]:
vectors.shape

(19, 1329)

In [1041]:
video_vector.shape

(1, 1329)

### Random Forest

In [1042]:
from sklearn.ensemble import RandomForestClassifier

In [1043]:
rf = RandomForestClassifier(random_state=42)

In [1044]:
y.shape

(19,)

In [1045]:
rf.fit(vectors, y)

In [1046]:
all_videos = bow.transform(yt['combo'])

In [1047]:
pred = rf.predict(all_videos)

In [1048]:
pred

array(['fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'daa', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd', 'fsd',
       'fsd', 'fsd',

### Multinomial Naive Bayes

In [1065]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

In [1066]:
nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])

In [1067]:
nb.fit(X, y)

In [1102]:
model = MultinomialNB().fit(vectors,y)

In [1109]:
yt['combo_processed'][5]

'c language tutorial beginner join new web development batch using link delta 3 0 early bird offer first 5000 student international student detail delta 3 0 batch start date 7th november 2023 alternate day lecture duration 4 5 month class duration 1 5 hour 2hour access course 2 year complete frontend development html cs javascript reactjs complete backend development complete database complete mern stack real life industry grade project live mentorship session 500 topic video certificate given completion join community code note practice sheet timestamps introduction 00 00 00installation 00 01 27compiler setup 00 04 31chapter 1 variable data type input output 00 10 12chapter 2 instruction operator 00 51 54chapter 3 conditional statement 02 01 15chapter 4 loop control statement 02 45 36 chapter 5 function recursion 03 51 34chapter 6 pointer 05 21 30chapter 7 array 06 18 50chapter 8 string 07 22 29chapter 9 structure 08 26 20chapter 10 file 09 22 15chapter 11 dynamic memory allocation 10

In [1116]:
tfidf.transform(['html'])

<1x1341 sparse matrix of type '<class 'numpy.float64'>'
	with 1 stored elements in Compressed Sparse Row format>

synchronousunit division function

In [1127]:
model.predict(tfidf.transform(['synchronousunit division function']))

array(['oop'], dtype='<U4')

In [1075]:
yt['title'][1]

'scrapy for beginners - a complete how to example web scraping project'

In [1073]:
nb.predict(yt['title'])

array(['ads', 'daa', 'bdt', 'dmgt', 'ads', 'aies', 'oop', 'aies', 'aies',
       'aies', 'bdt', 'fds', 'ads', 'oop', 'fds', 'sepm', 'ddca', 'sepm',
       'fds', 'daa', 'daa', 'daa', 'ads', 'bdt', 'ads', 'ads', 'ads',
       'ads', 'ads', 'ads', 'fds', 'fds', 'fds', 'oop', 'fds', 'ads',
       'fds', 'ads', 'ads', 'fds', 'os', 'os', 'os', 'os', 'os', 'os',
       'os', 'os', 'os', 'os', 'os', 'os', 'os', 'os', 'os', 'ads', 'mmc',
       'os', 'os', 'os', 'daa', 'daa', 'de', 'fds', 'ddca', 'oop', 'oop',
       'oop', 'oop', 'oop', 'de', 'mech', 'de', 'de', 'de', 'mech', 'ads',
       'mech', 'mech', 'de', 'de', 'dmgt', 'dmgt', 'dmgt', 'dmgt', 'oop',
       'dmgt', 'ads', 'dmgt', 'aies', 'mmc', 'dmgt', 'dmgt', 'dmgt',
       'ics', 'dmgt', 'ads', 'dmgt', 'ads', 'dmgt', 'mmc', 'os', 'os',
       'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'ads', 'os',
       'oop', 'mmc', 'os', 'os', 'os', 'os', 'fds', 'os', 'os', 'os',
       'os', 'daa', 'ps', 'os', 'os', 'os', 'os', 'os', 'os', 'o

In [1060]:
model.fit(vectors, y)

In [1061]:
model.predict(video_vector)

array(['os'], dtype='<U4')

In [1064]:
yt['title'][1]

'scrapy for beginners - a complete how to example web scraping project'

In [1063]:
model.predict(video_vectors)

array(['ads', 'oop', 'bdt', 'dmgt', 'ads', 'aies', 'oop', 'aies', 'aies',
       'aies', 'ddca', 'fds', 'ads', 'oop', 'fds', 'sepm', 'ddca', 'sepm',
       'fds', 'daa', 'daa', 'bdt', 'ads', 'bdt', 'ads', 'ads', 'mmc',
       'mmc', 'mmc', 'fds', 'fds', 'fds', 'fds', 'oop', 'fds', 'ads',
       'fds', 'oop', 'ads', 'fds', 'os', 'os', 'os', 'os', 'os', 'os',
       'os', 'os', 'os', 'os', 'os', 'os', 'os', 'os', 'os', 'ads', 'mmc',
       'os', 'os', 'os', 'daa', 'daa', 'de', 'fds', 'oop', 'oop', 'oop',
       'oop', 'oop', 'oop', 'de', 'mech', 'de', 'de', 'de', 'mech', 'ads',
       'mech', 'mech', 'de', 'de', 'dmgt', 'dmgt', 'dmgt', 'dmgt', 'oop',
       'dmgt', 'ads', 'dmgt', 'aies', 'mmc', 'dmgt', 'dmgt', 'dmgt',
       'sepm', 'dmgt', 'ads', 'dmgt', 'ads', 'dmgt', 'mmc', 'os', 'os',
       'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'mmc', 'os',
       'oop', 'mmc', 'mmc', 'os', 'os', 'os', 'fds', 'os', 'os', 'os',
       'os', 'daa', 'daa', 'os', 'os', 'os', 'os', 'os', 'os',

### Catboost Classifier

In [None]:
from catboost import CatBoostClassifier

In [None]:
cb = CatBoostClassifier(iterations=100, depth=6, learning_rate=0.1, loss_function='MultiClass')

In [None]:
X

In [None]:
y

In [None]:
cb.fit(vectors, y)

In [None]:
video_vectors.shape

In [None]:
video_vector.shape

In [None]:
pred = cb.predict(video_vectors)

In [None]:
pd.DataFrame(pred).value_counts()

- Used TF-IDF, fit_transform on the keywords --> vectorizer
- Used vectorizer's transform on the video title+desc --> video_vectors
- model.fit(video_vectors, labels) --> labels are the subjects
- new_video_vector = vectorizer.transform(new_video)
- subject_prediction = model.predict(new_video_vector)

## 2. TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [1089]:
tfidf = TfidfVectorizer()

In [1097]:
vectors = tfidf.fit_transform(X).toarray()

In [None]:
tfidf.get_feature_names_out()[:50]

In [None]:
pd.DataFrame(vectors, columns=tfidf.get_feature_names_out())

In [None]:
vectors

In [None]:
video_vector = tfidf.transform([yt['combo'][0]])

In [1098]:
video_vectors = tfidf.transform(yt['combo'])

In [None]:
rf.fit(vectors, y)

In [None]:
pred = rf.predict(video_vectors)

In [None]:
pred

## 3. N-Grams

In [None]:
ng = CountVectorizer(ngram_range=(1, 3))

In [None]:
vectors = ng.fit_transform(X).toarray()

In [None]:
video_vector = ng.transform([yt['combo'][0]])

In [None]:
rf.fit(vectors, y)

In [None]:
rf.predict(video_vector)