In [84]:
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.neighbors import NearestNeighbors

In [85]:
# importar datos
# comments = pd.read_csv('Comments.csv')
courses = pd.read_csv('C:/Users/G/Documents/GitHub/recommendation_system/Course_info.csv')

In [87]:
# se eliminan las columnas que no interesan
# comments_filtered= comments.drop(labels=['date', 'display_name', 'comment'], axis=1)
column_keep = ['category', 'subcategory','instructor_name', 'is_paid', 'price']
column_delete = []
for column in courses.columns:
    if column not in column_keep:
        column_delete.append(column)
courses_filtered = courses.drop(labels=column_delete, axis=1)

In [81]:
courses.columns

Index(['id', 'title', 'is_paid', 'price', 'headline', 'num_subscribers',
       'avg_rating', 'num_reviews', 'num_comments', 'num_lectures',
       'content_length_min', 'published_time', 'last_update_date', 'category',
       'subcategory', 'topic', 'language', 'course_url', 'instructor_name',
       'instructor_url'],
      dtype='object')

In [88]:
# comprueba la cantidad de valores nulos
print(courses_filtered.isnull().sum())
# crea una lista vacia para las columna con valores nulos
column_dropna=[]
# itera a traves de un dict nom_colum:cant_nulos y guarda el nombre de la columa si tiene nulos
for key, value in courses_filtered.isnull().sum().to_dict().items():
    if value !=0: 
        column_dropna.append(key)
# elina las filas que contienen nulos 
courses_filtered.dropna(axis= 0, how='any', subset=column_dropna, inplace= True)

is_paid            0
price              0
category           0
subcategory        0
instructor_name    5
dtype: int64


In [90]:
# pipeline para variables numericas
numeric_pipe = Pipeline([
    ('scaler', StandardScaler())
    ])

# pipeline para variables categoricas
categorical_pipe = Pipeline([
    ('encoder', OneHotEncoder(drop = 'first'))
    ])

# contiene las transformaciones
col_transf = ColumnTransformer(transformers=[
    #('numeric', numeric_pipe, courses_filtered._get_numeric_data().columns.tolist()),
    ('categoric', categorical_pipe, courses_filtered.select_dtypes('object').columns.tolist())
    ])

col_transf_fit = col_transf.fit(courses_filtered)
courses_filtered_transf = col_transf_fit.transform(courses_filtered)

nneighbors = NearestNeighbors(n_neighbors = 10, metric = 'cosine').fit(courses_filtered_transf)
# recomendaciones para las 10 primeras posiciones
ind = nneighbors.kneighbors(courses_filtered_transf[:100], return_distance=False)

In [100]:
#ejemplo de recomendacion
print("Curso visto")
print("-"*80)
print(courses.loc[ind[0][1], :])
print()
print("Cursos recomendados")
print("-"*80)
courses_reco = courses_filtered.loc[ind[0][1:], :].join(courses, how='inner', lsuffix='_filtered')
column_keep = ['id', 'title', 'category', 'subcategory', 
               'instructor_name', 'is_paid', 'price', 'topic', 
               'language', 'course_url']
column_delete = []
for column in courses_reco:
    if column not in column_keep:
        column_delete.append(column)
courses_reco.drop(column_delete, axis=1, inplace=True)
courses_reco.head()

Curso visto
--------------------------------------------------------------------------------
id                                                              26438.0
title                             Happy Vegetarian Kids Cooking Healthy
is_paid                                                            True
price                                                             49.99
headline              Children learn how to be healthy and cook some...
num_subscribers                                                   196.0
avg_rating                                                         3.95
num_reviews                                                        19.0
num_comments                                                        6.0
num_lectures                                                       21.0
content_length_min                                                216.0
published_time                                     2012-10-11T13:22:39Z
last_update_date                           

Unnamed: 0,id,title,is_paid,price,category,subcategory,topic,language,course_url,instructor_name
431,26438.0,Happy Vegetarian Kids Cooking Healthy,True,49.99,Lifestyle,Food & Beverage,Cooking,English,/course/happy-kids-cooking-healthy-part-1/,Angela Poch
129102,3748394.0,Pelajari D5 Render dalam Bahasa Indonesia,False,0.0,Design,Architectural Design,Photorealistic Rendering,Indonesian,/course/learn-d5-render-in-bahasa-indonesia/,D5 Render Team
158163,4202238.0,350-401 ~ CCNP Enterprise ENCOR Core Practice ...,True,19.99,IT & Software,IT Certifications,CCNP Enterprise,English,/course/cisco-enterprise-network-and-core-tech...,Rifat Academy
131379,3786138.0,Curso de GNU Octave - Nível Intermediário,True,79.9,IT & Software,Other IT & Software,GNU,Portuguese,/course/curso-de-octave-intermediario/,"Conecta, que eu te ensino!"
38130,1539176.0,React Fiber v16 Essentials,True,49.99,Development,Web Development,React JS,English,/course/react-fiber-v16-essentials/,Guy Ziv
