In [126]:
# Date: 21/December/2022 - Wednesday
# Author: Virgilio Murillo Ochoa
# personal github: Virgilio-AI
# linkedin: https://www.linkedin.com/in/virgilio-murillo-ochoa-b29b59203
# contact: data_scientist@virgiliomurillo.com
# web: virgiliomurillo.com

## pyspark.pandas

After comparing all the flavours of manipulating big data ( pyspark, koalas, pandas )
I decided to use pyspark.pandas because it is the most simple and easy to use
and it is the most similar to pandas
also it is up to date with the most recent python releases
and it has incorporated a very convenient interative plotting framework

Import libraries
also suppress warnings

In [127]:
import pyspark.pandas as pd
import time
import datetime
import warnings

In [128]:
warnings.simplefilter("ignore")
warnings.filterwarnings('ignore')

In [129]:
%%javascript
(function(on) {
    const e = $("<a>Setup failed</a>");
    const ns = "js_jupyter_suppress_warnings";
    var cssrules = $("#" + ns);
    if(!cssrules.length)
        cssrules = $("<style id='" + ns + "' type='text/css'>div.output_stderr { } </style>").appendTo("head");
    e.click(function() {
        var s = 'Showing';
        cssrules.empty()
        if(on) {
            s = 'Hiding';
            cssrules.append("div.output_stderr, div[data-mime-type*='.stderr'] { display:none; }");
        }
        e.text(s + ' warnings (click to toggle)');
        on = !on;
    }).click();
    $(element).append(e);
})(true);

<IPython.core.display.Javascript object>

import the data into pyspark.pandas dataframe
giving a convenient 5 seconds for the first one an a 2 second import for the second one
see this table below

In [130]:
#  toCompare | pandas | koalas | pyspark.pandas | pyspark |
#            |--------|--------|----------------|---------|
# time       |18s     |2s      |2s              |2s       |
# convenient |yes     |1/2     |yes             |1/2       |

In [131]:
prev = datetime.datetime.now()
description = pd.read_csv("Course_info.csv")
now = datetime.datetime.now()
print(now - prev)

0:00:00.567450


In [132]:
prev = datetime.datetime.now()
comments = pd.read_csv("Comments.csv")
now = datetime.datetime.now()
print(now - prev)



[Stage 373:>                                                      (0 + 16) / 16]

0:00:02.387021




In [133]:
# change the dataframe data type
prev = datetime.datetime.now()
description['num_subscribers'] = description['num_subscribers'].astype('int64')
description['num_reviews'] = description['num_reviews'].astype('int64')
description['num_comments'] = description['num_comments'].astype('int64')
description['num_lectures'] = description['num_lectures'].astype('int64')
description['num_reviews'] = description['num_reviews'].astype('int64')
now = datetime.datetime.now()
print(now - prev)

0:00:00.054808


In [134]:
description.info()

<class 'pyspark.pandas.frame.DataFrame'>
Int64Index: 209734 entries, 0 to 209733
Data columns (total 20 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   id                  209734 non-null  float64
 1   title               209734 non-null  object 
 2   is_paid             209734 non-null  object 
 3   price               209734 non-null  object 
 4   headline            209707 non-null  object 
 5   num_subscribers     209440 non-null  int64  
 6   avg_rating          209734 non-null  object 
 7   num_reviews         209687 non-null  int64  
 8   num_comments        209715 non-null  int64  
 9   num_lectures        209729 non-null  int64  
 10  content_length_min  209733 non-null  float64
 11  published_time      209733 non-null  object 
 12  last_update_date    209596 non-null  object 
 13  category            209733 non-null  object 
 14  subcategory         209733 non-null  object 
 15  topic               208775 non-

In [135]:
# defining all the functions
def get_instructor_username(instructor_url):
	return str(instructor_url)[6:-2]

def get_curse_name(course_url):
	return str(course_url)[8:-1]

def getFirstName(name):
	if name is None:
		return "None"
	sname = name.split()
	prefix = ['mr.','mrs.','ms.','dr.','prof.','sr.','jr.','.',',','mr','mrs','ms','dr','prof','sr','jr']
	for i in range(len(sname)):
		sname[i] = sname[i].lower()


	if len(sname) > 0 and sname[0] not in prefix:
		return sname[0]
	elif len(sname) > 1 and sname[1] not in prefix:
		return sname[1]
	elif len(sname) > 2 and sname[2] not in prefix:
		return sname[2]

In [136]:
description['instructor_url'] = description['instructor_url'].map(get_instructor_username)

In [137]:
description['course_url'] = description['course_url'].map(get_curse_name)
description.rename(columns={'instructor_url':'instructor_username'},inplace=True)
description.rename(columns={'course_url':'course_name'},inplace=True)
english_dataframe = description[description['language'] == 'English' ] 

In [138]:

# group by function comparisson

#  toCompare | pandas | koalas | pyspark.pandas | pyspark |
#            |--------|--------|----------------|---------|
# time       |18s     |2s      |0.03s           |0.009s    |
# convenient |yes     |1/2     |yes             |no       |

In [139]:
prev = datetime.datetime.now()
instructors_df = english_dataframe.groupby(['instructor_username','instructor_name']).agg({'num_subscribers':'mean','avg_rating':'mean', 'price':'mean'})
now = datetime.datetime.now()
print(now - prev)

0:00:00.025270


In [140]:
instructors_df.reset_index(inplace = True)
instructors_df = instructors_df.rename(columns={'instructor_username':'username','instructor_name':'name','num_subscribers':'avg_num_subscribers','price':'avg_price'})
instructors_df = instructors_df.set_index('username')

In [141]:
# user defined functions applied to a column in the dataframe

#  toCompare | pandas | koalas | pyspark.pandas | pyspark |
#            |--------|--------|----------------|---------|
# time       |18s     |5s      |5s              |1s       |
# convenient |yes     |1/2     |yes             |no       |

In [142]:
prev = datetime.datetime.now()
instructors_df['name']= instructors_df['name'].map(getFirstName)
now = datetime.datetime.now()
print(now - prev)



0:00:05.361891


                                                                                

In [143]:
import gender_guesser.detector as gender
gd = gender.Detector()

In [144]:
instructors_df['gender'] = instructors_df['name'].map(lambda x: gd.get_gender(x.capitalize()))



                                                                                

In [145]:
# se tendria que hacer una labor de identificarlos a mano o encontrar un diccionario en linea que contenga nombres de indios
gbyname.head()





                                                                                

Unnamed: 0_level_0,count
name,Unnamed: 1_level_1
abhishek,55
gaurav,40
learn,37
tech,36
md,33


In [146]:
known_instructors = instructors_df[instructors_df['gender'] != 'unknown']

In [147]:
# the most succesfull according to sex
avg_num_subscribers_per_gender = known_instructors.groupby('gender').agg({'avg_num_subscribers':'mean'})
avg_num_subscribers_per_gender.reset_index(inplace=True)

In [148]:
avg_num_subscribers_per_gender.plot.bar(x='gender',y='avg_num_subscribers')





                                                                                

In [149]:
avg_rating_per_gender = known_instructors.groupby('gender').agg({'avg_rating':'mean'})
avg_rating_per_gender.reset_index(inplace=True)

In [150]:
# set the x range to be from 3 to 5
avg_rating_per_gender.plot.bar(x='gender',y = 'avg_rating')





                                                                                

In [151]:
# now lets build a recommender system

In [152]:
import nltk

In [153]:
english_dataframe['name']= english_dataframe['instructor_name'].map(getFirstName)
english_dataframe['gender'] = english_dataframe['name'].map(lambda x: gd.get_gender(x.capitalize()))
english_dataframe = english_dataframe[english_dataframe['gender'] != 'unknown']
english_dataframe.drop(columns = ['id','num_comments','num_lectures','content_length_min','instructor_username','instructor_name'])
english_dataframe.drop(columns = ['language','course_name','published_time'] )



Unnamed: 0,id,title,is_paid,price,headline,num_subscribers,avg_rating,num_reviews,num_comments,num_lectures,content_length_min,last_update_date,category,subcategory,topic,instructor_name,instructor_username,name,gender
0,4715.0,Online Vegan Vegetarian Cooking School,True,24.99,Learn to cook delicious vegan recipes. Filmed ...,2231,3.75,134,42,37,1268.0,2020-11-06,Lifestyle,Food & Beverage,Vegan Cooking,Angela Poch,angelapoc,angela,female
1,1769.0,The Lean Startup Talk at Stanford E-Corner,False,0.0,"""Debunking Myths of Entrepreneurship A startup...",26474,4.5,709,112,9,88.0,,Business,Entrepreneurship,Lean Startup,Eric Ries,ericrie,eric,male
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian",True,19.99,Get the tools you need for a lifestyle change ...,1713,4.4,41,13,14,82.0,2019-10-09,Lifestyle,Other Lifestyle,Vegan Cooking,Angela Poch,angelapoc,angela,female
3,7723.0,How to Train a Puppy,True,199.99,Train your puppy the right way with Dr. Ian Du...,4988,4.8,395,88,36,1511.0,2016-01-13,Lifestyle,Pet Care & Training,Pet Training,Ian Dunbar,ian-dunba,ian,male
5,8139.0,14-Day Yoga Detox and Empowerment Course,True,29.99,"Lose weight, get healthier and fit on all leve...",20505,4.5301204,796,135,31,1163.0,2018-05-22,Health & Fitness,Yoga,Yoga,Sadie Nardini,sadienardin,sadie,female
6,2762.0,Simple Strategy for Swing Trading the Stock Ma...,True,39.99,Use my favorite Technical Indicator and the Tr...,3309,3.85,958,241,8,80.0,2019-03-07,Finance & Accounting,Investing & Trading,Swing Trading,Tom Watson,tomwatso,tom,male
7,8082.0,Ruby Programming for Beginners,True,74.99,Learn Ruby Programming the fast and easy way!,28824,4.0,741,189,56,363.0,2022-09-26,Development,Programming Languages,Ruby,Huw Collingbourne,huwcollingbourn,huw,male
8,8075.0,How to Create an Awesome Demo Video for Your B...,True,149.99,You don't need to spend $10K in order to have ...,10761,3.9,349,101,87,526.0,2020-11-22,Business,Media,Demo Video,Miguel Hernandez,miguelhernande,miguel,male
13,8420.0,CCNP ROUTE 300-101 Video Boot Camp With Chris ...,True,19.99,Pass The CCNP ROUTE 300-101 Exam With Chris Br...,4454,4.35,829,147,230,1813.0,2017-10-21,IT & Software,IT Certifications,CCNP Enterprise,Chris Bryant,chrisbryan,chris,mostly_male
15,8467.0,The Lean Startup,True,39.99,Learn how to apply the method that is transfor...,5566,4.1666665,720,163,6,158.0,,Business,Entrepreneurship,Lean Startup,Eric Ries,ericrie,eric,male


In [154]:
english_dataframe.head()



Unnamed: 0,id,title,is_paid,price,headline,num_subscribers,avg_rating,num_reviews,num_comments,num_lectures,content_length_min,published_time,last_update_date,category,subcategory,topic,language,course_name,instructor_name,instructor_username,name,gender
0,4715.0,Online Vegan Vegetarian Cooking School,True,24.99,Learn to cook delicious vegan recipes. Filmed ...,2231,3.75,134,42,37,1268.0,2010-08-05T22:06:13Z,2020-11-06,Lifestyle,Food & Beverage,Vegan Cooking,English,vegan-vegetarian-cooking-school,Angela Poch,angelapoc,angela,female
1,1769.0,The Lean Startup Talk at Stanford E-Corner,False,0.0,"""Debunking Myths of Entrepreneurship A startup...",26474,4.5,709,112,9,88.0,2010-01-12T18:09:46Z,,Business,Entrepreneurship,Lean Startup,English,the-lean-startup-debunking-myths-of-entreprene...,Eric Ries,ericrie,eric,male
2,5664.0,"How To Become a Vegan, Vegetarian, or Flexitarian",True,19.99,Get the tools you need for a lifestyle change ...,1713,4.4,41,13,14,82.0,2010-10-13T18:07:17Z,2019-10-09,Lifestyle,Other Lifestyle,Vegan Cooking,English,see-my-personal-motivation-for-becoming-vegeta...,Angela Poch,angelapoc,angela,female
3,7723.0,How to Train a Puppy,True,199.99,Train your puppy the right way with Dr. Ian Du...,4988,4.8,395,88,36,1511.0,2011-06-20T20:08:38Z,2016-01-13,Lifestyle,Pet Care & Training,Pet Training,English,complete-dunbar-collection,Ian Dunbar,ian-dunba,ian,male
5,8139.0,14-Day Yoga Detox and Empowerment Course,True,29.99,"Lose weight, get healthier and fit on all leve...",20505,4.5301204,796,135,31,1163.0,2011-07-15T04:13:24Z,2018-05-22,Health & Fitness,Yoga,Yoga,English,yoga-for-weight-loss-and-core-strength-with-sa...,Sadie Nardini,sadienardin,sadie,female


In [155]:
def recommenderSystem(dataframe,category = 'none',subcategory = "none"):
	courses = dataframe
	dataframe.head()
	print(len(courses))
	courses = dataframe

	if category is not 'none':
		courses = courses[courses['category'] == category]

	if subcategory is not 'none':
		courses = courses[courses['subcategory'] == subcategory]

	print(len(courses))

	rating = 4.9
	sumln = 0
	first = False

	while sumln < 10:
		tmp_courses = courses[courses['avg_rating'] > rating]
		if not first:
			first = True
			sumln += len(tmp_courses)
			ans = tmp_courses
		else:
			sumln += len(tmp_courses)
			pd.concat([ans,tmp_courses])
		rating -= 0.1

	ans = ans.sort_values(by='num_subscribers',ascending=False)
	return ans.head(10)

recomendation = recommenderSystem(english_dataframe,category='Lifestyle',subcategory='Food & Beverage')

                                                                                

76495




530


                                                                                

In [156]:
recomendation.head()





                                                                                

Unnamed: 0,id,title,is_paid,price,headline,num_subscribers,avg_rating,num_reviews,num_comments,num_lectures,content_length_min,published_time,last_update_date,category,subcategory,topic,language,course_name,instructor_name,instructor_username,name,gender
121286,3612856.0,"Complete Sourdough Bread Baking - Levels 1, 2,...",True,199.99,Sourdough Bread Baking from Beginner to Advanc...,20503,4.95,152,50,105,288.0,2020-12-26T23:36:54Z,2021-10-27,Lifestyle,Food & Beverage,Sourdough Bread Baking,English,complete-sourdough-bread-baking-levels-1-2-3-a...,Daniel Mazz,danielmaz,daniel,male
131679,3791122.0,HOME Bartending: Make Exquisite Cocktails for ...,True,124.99,Learn 40+ Cocktail Recipes and Organize The Mo...,2076,4.95,72,16,56,242.0,2021-03-02T09:29:47Z,2021-12-22,Lifestyle,Food & Beverage,Bartending,English,home-bartending-make-exquisite-cocktails-for-y...,Alex Glod,alexglo,alex,male
8563,469860.0,Cooking Eggs: The Best Recipes for Cheap & Hea...,True,39.99,All the different ways to cook this common foo...,1981,4.95,68,22,38,100.0,2017-04-28T23:55:13Z,2022-01-10,Lifestyle,Food & Beverage,Cooking,English,cookthebesteggs,Jenna Edwards,jennaedwards,jenna,female
14895,735670.0,#6 Learn to Bake Magnificent Challah Bread,True,79.99,Baking Sourdough Challah or Traditional Challa...,1920,4.95,225,57,58,191.0,2016-03-18T21:06:45Z,2021-08-21,Lifestyle,Food & Beverage,Bread Baking,English,learn-to-bake-magnificent-challah,Teresa L Greenway,teresalgreenwa,teresa,female
90152,2931846.0,PASTEL DE NATA - Portuguese custard tarts for ...,True,29.99,EGG TARTS. Bake Portugal's most iconic pastry ...,1042,4.90625,274,80,11,47.0,2020-04-04T17:00:43Z,2021-11-18,Lifestyle,Food & Beverage,Baking,English,pasteldenata,João Batalha,joao-batalha-,joão,mostly_male
