# Course recommendation system using cosine similarity

## Algorithms
+ Cosine similarity
+ Linear kernel similarity

## Workflow
+ Load the dataset
+ Vectorize the dataset
+ Build the cosine similarity matrix
+ Look up the query's row index and its similarity scores
+ Recommend

In [6]:
#import EDA packages
import pandas as pd
import neattext.functions as nfx


In [7]:
#import ML/RC packages
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity,linear_kernel

In [29]:
# Load the course catalogue. `data` is bound to the very same DataFrame object as
# `df` (an alias, not a copy), so the row appended below is visible through both names.
df = data = pd.read_json("./custom_courses_with_descriptions.json")

# The free-text profile we want recommendations for, stored as a pseudo-course
# named "predict" so it is vectorized together with the real course descriptions.
newrow = dict(
    name="predict",
    description="From a young age, I have been deeply fascinated by problem-solving and the logic behind it. This curiosity led me to discover the world of programming, where I found great satisfaction in creating solutions through code. My passion for reading further fuels my desire to constantly learn and expand my knowledge. With a strong interest in computing and engineering, I aspire to study computer science to deepen my understanding of technology and how it can be leveraged to address complex challenges. My goal is to develop innovative solutions that can make a difference in the world.",
)

# Append the query as a new row whose label is the current row count.
df.loc[df.shape[0]] = newrow
print(df)

                                   name  \
0                           Accountancy   
1                            Accounting   
2                  Accounting Education   
3                Accounting and Finance   
4                     Actuarial Science   
..                                  ...   
614          Web Design and Development   
615  Wildlife and Ecological Management   
616                 Wildlife Management   
617                             Zoology   
618                             predict   

                                           description  
0    Accountancy is the practice of recording, clas...  
1    Accounting is the systematic recording, report...  
2    Accounting Education combines accounting princ...  
3    Accounting and Finance is an integrated course...  
4    Actuarial Science is the discipline of assessi...  
..                                                 ...  
614  Web Design and Development involves creating a...  
615  Wildlife and Ecologica

In [30]:
# Inspect the description column; the appended query text is the last entry.
df["description"]

0      Accountancy is the practice of recording, clas...
1      Accounting is the systematic recording, report...
2      Accounting Education combines accounting princ...
3      Accounting and Finance is an integrated course...
4      Actuarial Science is the discipline of assessi...
                             ...                        
614    Web Design and Development involves creating a...
615    Wildlife and Ecological Management focuses on ...
616    Wildlife Management involves the study and man...
617    Zoology is the scientific study of animals, in...
618    From a young age, I have been deeply fascinate...
Name: description, Length: 619, dtype: object

In [33]:
# Clean text: strip stopwords first, then special characters, in a single pipeline.
# NOTE(review): because stopwords are removed before punctuation is stripped, a
# stopword with punctuation attached (e.g. "it.") may survive — confirm against
# neattext's tokenisation before changing the order.
df["clean_course_description"] = (
    df["description"]
    .apply(nfx.remove_stopwords)
    .apply(nfx.remove_special_characters)
)
df.head()

Unnamed: 0,name,description,clean_course_description
0,Accountancy,"Accountancy is the practice of recording, clas...",Accountancy practice recording classifying ana...
1,Accounting,"Accounting is the systematic recording, report...",Accounting systematic recording reporting anal...
2,Accounting Education,Accounting Education combines accounting princ...,Accounting Education combines accounting princ...
3,Accounting and Finance,Accounting and Finance is an integrated course...,Accounting Finance integrated course combines ...
4,Actuarial Science,Actuarial Science is the discipline of assessi...,Actuarial Science discipline assessing financi...


In [34]:
#Vectorization of text
# Bag-of-words counts: learn the vocabulary from the cleaned descriptions (the
# appended query row included) and encode every row as a sparse term-count vector.
count_vector=CountVectorizer()
cv_matrix=count_vector.fit_transform(df["clean_course_description"])

In [35]:
#sparse
# Show the sparse matrix summary (shape, dtype, number of stored elements).
cv_matrix

<619x1690 sparse matrix of type '<class 'numpy.int64'>'
	with 12456 stored elements in Compressed Sparse Row format>

In [36]:
#dense
# Dense view of the count matrix, for inspection only: todense() materialises
# every zero entry and returns a numpy matrix object.
cv_matrix.todense()

matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 1, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [38]:
# Term-count table: one row per document, one column per vocabulary term.
# toarray() yields a plain ndarray instead of the legacy numpy.matrix object that
# todense() returns; the resulting DataFrame holds the same counts.
df_cv_words=pd.DataFrame(cv_matrix.toarray(),columns=count_vector.get_feature_names_out())

In [39]:
# Preview the first rows of the term-count table.
df_cv_words.head()

Unnamed: 0,3d,abilities,abroad,academia,academic,accountancy,accounting,accuracy,accurate,achieve,...,written,xrays,years,yeast,yoruba,young,youth,zoological,zoology,zoos
0,0,0,0,0,0,1,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,6,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [40]:
#cosine similarity
# Pairwise cosine similarity between all rows of the count matrix;
# entry [i, j] compares document i with document j.
cosine_sim_matrix=cosine_similarity(cv_matrix)

In [41]:
# Inspect the similarity matrix.
cosine_sim_matrix

array([[1.        , 0.76764947, 0.54756294, ..., 0.05330018, 0.        ,
        0.04478111],
       [0.76764947, 1.        , 0.49182219, ..., 0.08183171, 0.03636965,
        0.03055662],
       [0.54756294, 0.49182219, 1.        , ..., 0.        , 0.        ,
        0.04403677],
       ...,
       [0.05330018, 0.08183171, 0.        , ..., 1.        , 0.16666667,
        0.035007  ],
       [0.        , 0.03636965, 0.        , ..., 0.16666667, 1.        ,
        0.046676  ],
       [0.04478111, 0.03055662, 0.04403677, ..., 0.035007  , 0.046676  ,
        1.        ]])

In [42]:
# Map each description text to its row label in df.
# NOTE(review): if two rows share an identical description, a lookup by that text
# returns a Series instead of a single label — confirm descriptions are unique.
course_indices = pd.Series(data=df.index, index=df["description"])
course_indices

description
Accountancy is the practice of recording, classifying, and analyzing financial transactions. Students in this course will learn about financial reporting, auditing, taxation, and management accounting. They'll develop skills in preparing and interpreting financial statements, understanding accounting principles and standards, and using accounting software. The curriculum typically covers topics such as cost accounting, corporate finance, business law, and ethics in accounting. Graduates are prepared for careers in public accounting firms, corporations, government agencies, and non-profit organizations.                             0
Accounting is the systematic recording, reporting, and analysis of financial transactions. This course provides a comprehensive understanding of accounting principles, financial reporting, and auditing practices. Students will learn how to prepare and interpret financial statements, manage budgets, and analyze business performance. The curriculum

In [43]:
# Look up the query's row label by its description text. The key is taken from
# `newrow` (the dict that was appended to df) rather than from a second copy of
# the literal, so editing the query text cannot make this lookup raise KeyError.
course_indices[newrow["description"]]

618

In [44]:
# Row label of the query profile. The key is taken from `newrow` (the dict that
# was appended to df) rather than from a second copy of the literal, so editing
# the query text cannot make this lookup raise KeyError.
idx=course_indices[newrow["description"]]

In [45]:
# Row label of the query profile.
idx

618

In [46]:
# Pair every row position with its cosine similarity to the query row.
scores = [
    (position, similarity)
    for position, similarity in enumerate(cosine_sim_matrix[idx])
]
scores

[(0, 0.0447811075519899),
 (1, 0.030556616567607043),
 (2, 0.04403677137750769),
 (3, 0.02545963789141836),
 (4, 0.032338083338177726),
 (5, 0.04332835888138613),
 (6, 0.04332835888138613),
 (7, 0.03969420930187223),
 (8, 0.04332835888138613),
 (9, 0.03969420930187223),
 (10, 0.01366533436151313),
 (11, 0.042008402520840295),
 (12, 0.013797369644231547),
 (13, 0.014598929061844594),
 (14, 0.06185402664457098),
 (15, 0.05132143194267069),
 (16, 0.05020964452534338),
 (17, 0.030556616567607043),
 (18, 0.04309971159206156),
 (19, 0.03883678186903086),
 (20, 0.03642590462960279),
 (21, 0.0835998456190943),
 (22, 0.0750628915185075),
 (23, 0.02658174076096333),
 (24, 0.07380124046167459),
 (25, 0.08160433493741086),
 (26, 0.05746628212274874),
 (27, 0.025251800635952337),
 (28, 0.028583097523751475),
 (29, 0.024562537539453734),
 (30, 0.0844400661841498),
 (31, 0.0895622151039798),
 (32, 0.07767356373806172),
 (33, 0.060752679940508494),
 (34, 0.058080912249204274),
 (35, 0.0551597348514691

In [47]:
# Rank the (position, score) pairs from most to least similar.
# list.sort is stable, so tied scores keep their original relative order.
sorted_scores = list(scores)
sorted_scores.sort(key=lambda pair: pair[1], reverse=True)

In [49]:
# Omit the query row itself. Filtering on its position is safer than slicing off
# the first element: if another row ties for the top score, the query is not
# guaranteed to sort first and [1:] would drop a real course instead.
[pair for pair in sorted_scores if pair[0] != idx]

[(169, 0.21004201260420144),
 (219, 0.2045239970259654),
 (226, 0.1886388194727034),
 (112, 0.18581403127845014),
 (469, 0.18077538151554678),
 (109, 0.17712297710801905),
 (416, 0.17149858514250885),
 (111, 0.16477051091432693),
 (114, 0.16169041669088863),
 (113, 0.14598929061844593),
 (533, 0.140028008402801),
 (117, 0.13231403100624076),
 (115, 0.13001274187308592),
 (454, 0.1284985132406471),
 (110, 0.12574886370696423),
 (370, 0.12524485821702988),
 (595, 0.12222646627042817),
 (256, 0.11764705882352938),
 (263, 0.11764705882352938),
 (230, 0.11669000700233413),
 (71, 0.10958925093990116),
 (371, 0.10934350421017383),
 (520, 0.10846522890932808),
 (144, 0.10585122480499261),
 (365, 0.10555008273018726),
 (374, 0.10212592127570835),
 (479, 0.10188534162169866),
 (557, 0.10188534162169866),
 (90, 0.10059909096557137),
 (584, 0.09901475429766743),
 (129, 0.09901475429766741),
 (285, 0.09335200560186731),
 (135, 0.09208185110322616),
 (223, 0.09086217008485092),
 (123, 0.089689705866

In [50]:
# Selected course positions, most similar first. The query row is excluded by its
# position instead of by slicing off the first element: if another row ties for
# the top score, the query is not guaranteed to sort first.
selected_courses_indices=[position for position, _score in sorted_scores if position != idx]

In [51]:
# Ranked row positions of the recommended courses, most similar first.
selected_courses_indices

[169,
 219,
 226,
 112,
 469,
 109,
 416,
 111,
 114,
 113,
 533,
 117,
 115,
 454,
 110,
 370,
 595,
 256,
 263,
 230,
 71,
 371,
 520,
 144,
 365,
 374,
 479,
 557,
 90,
 584,
 129,
 285,
 135,
 223,
 123,
 31,
 439,
 297,
 314,
 319,
 350,
 381,
 61,
 391,
 42,
 30,
 164,
 65,
 21,
 154,
 91,
 116,
 354,
 25,
 468,
 103,
 48,
 362,
 283,
 82,
 460,
 32,
 100,
 118,
 162,
 315,
 372,
 22,
 470,
 551,
 67,
 24,
 292,
 38,
 68,
 477,
 480,
 483,
 525,
 565,
 589,
 590,
 247,
 55,
 265,
 211,
 153,
 510,
 532,
 215,
 79,
 72,
 415,
 458,
 611,
 221,
 218,
 273,
 530,
 142,
 73,
 306,
 310,
 518,
 524,
 602,
 198,
 157,
 59,
 302,
 14,
 89,
 430,
 574,
 33,
 102,
 473,
 41,
 80,
 52,
 43,
 96,
 98,
 447,
 34,
 26,
 357,
 147,
 363,
 382,
 576,
 56,
 35,
 375,
 428,
 278,
 288,
 131,
 422,
 436,
 421,
 15,
 214,
 87,
 54,
 373,
 16,
 57,
 108,
 40,
 161,
 282,
 284,
 377,
 101,
 124,
 130,
 165,
 216,
 254,
 336,
 47,
 175,
 233,
 240,
 280,
 213,
 220,
 231,
 499,
 617,
 60,
 296,
 200,


In [52]:
# Similarity scores of the selected courses, most similar first. The query row is
# excluded by its position instead of by slicing off the first element: if another
# row ties for the top score, the query is not guaranteed to sort first.
selected_courses_scores=[score for position, score in sorted_scores if position != idx]

In [53]:
# Similarity scores in ranked order, highest first.
selected_courses_scores

[0.21004201260420144,
 0.2045239970259654,
 0.1886388194727034,
 0.18581403127845014,
 0.18077538151554678,
 0.17712297710801905,
 0.17149858514250885,
 0.16477051091432693,
 0.16169041669088863,
 0.14598929061844593,
 0.140028008402801,
 0.13231403100624076,
 0.13001274187308592,
 0.1284985132406471,
 0.12574886370696423,
 0.12524485821702988,
 0.12222646627042817,
 0.11764705882352938,
 0.11764705882352938,
 0.11669000700233413,
 0.10958925093990116,
 0.10934350421017383,
 0.10846522890932808,
 0.10585122480499261,
 0.10555008273018726,
 0.10212592127570835,
 0.10188534162169866,
 0.10188534162169866,
 0.10059909096557137,
 0.09901475429766743,
 0.09901475429766741,
 0.09335200560186731,
 0.09208185110322616,
 0.09086217008485092,
 0.08968970586617497,
 0.0895622151039798,
 0.0895622151039798,
 0.08759357437106756,
 0.08759357437106756,
 0.08759357437106756,
 0.08759357437106756,
 0.08759357437106756,
 0.08665671776277226,
 0.08574929257125442,
 0.08553571990445116,
 0.08444006618414

In [57]:
# Descriptions of the recommended courses in ranked order: pick the rows by
# position first, then project the description column.
df.iloc[selected_courses_indices]["description"]

169    Education Computer Science combines the study ...
219    Electrical/Computer Engineering combines the s...
226    Electronics and Computer Engineering combines ...
112    Computer Science Engineering integrates the pr...
469    Metallurgical and Material Engineering explore...
                             ...                        
607    Urban and Regional Planning involves the devel...
608    Urban Management focuses on the administration...
610    Veterinary Nursing supports the care and treat...
612    Water Resources Management involves the planni...
615    Wildlife and Ecological Management focuses on ...
Name: description, Length: 618, dtype: object