In [196]:
import pymongo
import pandas as pd
from pymongo import MongoClient
import re
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.svm import SVR
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer 
from sklearn.metrics import r2_score
import nltk
import string
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK resources up front; packages that are already installed are
# reported as up-to-date rather than downloaded again.
nltk.download('stopwords')  # read later via nltk.corpus.stopwords.words('english')
nltk.download('punkt')  # NOTE(review): not referenced in the visible cells - confirm it is still needed
nltk.download('words')  # NOTE(review): not referenced in the visible cells - confirm it is still needed
nltk.download('wordnet')  # presumably for the imported WordNetLemmatizer, which the visible cells never call

[nltk_data] Downloading package stopwords to C:\Users\Omneya
[nltk_data]     Essam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Omneya
[nltk_data]     Essam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Omneya
[nltk_data]     Essam\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Omneya
[nltk_data]     Essam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load databases into dataframes

In [5]:
# Open a single connection to the local MongoDB server. (A second, unused
# default client is no longer created and then discarded.)
client = MongoClient('localhost', 27017)
# select the database that holds the scraped course collections
db = client['coursedb']
# Udemy collection -> DataFrame (find() with no filter returns every document)
udemy = db.udemy
udemyData = pd.DataFrame(list(udemy.find()))
# Udacity collection -> DataFrame
udacity = db.udacity
udacityData = pd.DataFrame(list(udacity.find()))

In [101]:
# Preview the first five Udemy rows (five is head()'s default).
udemyData.head()

Unnamed: 0,_id,title,id,price,link,description,instructors,source
0,614858f0ad02b3194a58be71,Automotive Engineering; Digital Powertrain Control Systems,4269646,$29.99,/course/automotive-engineering-digital-powertrain-control-systems/,A detailed description on the Role of electronics in Electronic Engine and powertrain,[Palem],udemy
1,614858f0ad02b3194a58be72,The Art of Sampling with Ableton,4249372,$59.99,/course/the-art-of-sampling-with-ableton/,Learn the Art of Sampling in Ableton Live & turn your computer into a Sampler,[Elevator],udemy
2,614858f0ad02b3194a58be73,Concrete and steel structures design,4250228,$89.99,/course/concrete-and-steel-structures-design/,"Learn to design reinforced concrete, steel bars and building steel structures","[AulaGEO, Felix Enzo]",udemy
3,614858f0ad02b3194a58be74,Complete Guide to Bootstrap 5 with 6 Real World Projects,4266924,$124.99,/course/complete-guide-to-bootstrap/,Master the latest version of bootstrap 5 from scratch by coding 5 CSS and 1 SASS real world websites.,[Sofiullah],udemy
4,614858f0ad02b3194a58be75,How to Heal Your Inner Child,4267576,$44.99,/course/how-to-heal-your-inner-child/,Jungian Archetype Discovery,[Patrisha],udemy


In [103]:
# Report (rows, columns) for each source frame, Udacity first.
for frame in (udacityData, udemyData):
    print(frame.shape)

(242, 8)
(100, 8)


# Udacity data has no price column, so we impute a price from each course's level

In [106]:
# Flat price assigned to every Udacity course of a given level; the empty
# string is the key used for rows whose level is blank.
pricesPerLevel = {
    '': '$25',
    'beginner': '$75',
    'intermediate': '$100',
    'advanced': '$120',
}
# Look up each row's level; a label missing from the table raises KeyError.
prices = [pricesPerLevel[level] for level in udacityData['level']]
udacityData['price'] = prices

In [107]:
# Preview the first five Udacity rows, now including the imputed price column.
udacityData.head()

Unnamed: 0,_id,title,instructors,level,description,prerequisites,price,source
0,614858b9c5d8755c5fb21255,Android Interview Prep,[Eric Gonzalez],intermediate,"This course is an excellent way to prepare for technical interviews. You’ll experience a mock interview, and review detailed analysis on how to field key industry questions. You’ll work through common Android Developer interview topics ranging from explicit and implicit intents to designing a min stack, and you’ll learn best practices for behavioral questions and whiteboard problems. Upon completing the course, you’ll be ready to showcase your skills during the Android Developer interview!",Java (intermediate),$100,udacity
1,614858b9c5d8755c5fb21256,Shell Workshop,[Karl Krueger],beginner,"<p>A quick, one-lesson introduction to the Unix-style command-line environment.</p><p>This course is intended to get you up to speed on the shell — using a terminal, managing files and directories, and running command-line programs.</p>","<p><strong>This is a course for beginners to the shell environment.</strong></p><p>To take this course, you&#39;ll need a current Mac, Windows, or Linux computer.</p><p>You should be familiar with elementary programming concepts such as &quot;statement&quot;, &quot;variable&quot;, &quot;argument to a function&quot;, and &quot;file&quot;. No particular programming language is required, though.</p>",$75,udacity
2,614858bac5d8755c5fb21257,iOS Interview Prep,[Jarrod Parkes],intermediate,"This course is an excellent way to prepare for technical interviews. You’ll experience a mock interview, and review detailed analysis on how to field key industry questions. You’ll review common iOS Developer interview topics ranging from notifications and delegates to the model view controller, and learn best practices for behavioral questions and whiteboard problems. You'll learn how to discuss your interests in iOS development. Senior mobile developers will share their tips for addressing common data issues. When you complete this course, you’ll be ready to showcase your skills during the iOS Developer interview!",Swift (intermediate),$100,udacity
3,614858bac5d8755c5fb21258,Refresh Your Resume,[Trinh Nguyen],beginner,"Your resume is the first step in the job search. Through this course, you will identify the companies you want to work for, and showcase your key achievements to align with their roles. With our proven industry framework, you will build a tech-focused resume that will help you get noticed and land the interview.",Proficient in spoken and written English,$75,udacity
4,614858bac5d8755c5fb21259,C++ For Programmers,[Catherine Gamboa],intermediate,"<p>C++ for Programmers is designed for students who are familiar with a programming language and wish to learn C++.</p><p>This course focuses on &#39;how&#39; as opposed to &#39;what&#39;. For example, in the lesson on functions, we do not teach what a function is, but rather how to create a function in C++.<br>The lessons are taught by several different instructors who have used C++ in their professional careers, so students get to experience different perspectives. </p><p>The course also includes comments and tips from Bjarne Stroustrup - the original designer of C++.</p>",<p>There is one prerequisite for this course: familiarity with another programming language.</p>,$100,udacity


# Concatenate both dataframes

In [199]:
# Stack the Udemy rows on top of the Udacity rows. ignore_index=True gives the
# combined frame a fresh 0..n-1 index; without it both inputs keep their own
# 0-based labels and the result contains duplicate index values.
df = pd.concat([udemyData, udacityData], ignore_index=True)

In [200]:
# Display the combined frame (Udemy rows first, then Udacity rows).
df

Unnamed: 0,_id,title,id,price,link,description,instructors,source,level,prerequisites
0,614858f0ad02b3194a58be71,Automotive Engineering; Digital Powertrain Control Systems,4269646.0,$29.99,/course/automotive-engineering-digital-powertrain-control-systems/,A detailed description on the Role of electronics in Electronic Engine and powertrain,[Palem],udemy,,
1,614858f0ad02b3194a58be72,The Art of Sampling with Ableton,4249372.0,$59.99,/course/the-art-of-sampling-with-ableton/,Learn the Art of Sampling in Ableton Live & turn your computer into a Sampler,[Elevator],udemy,,
2,614858f0ad02b3194a58be73,Concrete and steel structures design,4250228.0,$89.99,/course/concrete-and-steel-structures-design/,"Learn to design reinforced concrete, steel bars and building steel structures","[AulaGEO, Felix Enzo]",udemy,,
3,614858f0ad02b3194a58be74,Complete Guide to Bootstrap 5 with 6 Real World Projects,4266924.0,$124.99,/course/complete-guide-to-bootstrap/,Master the latest version of bootstrap 5 from scratch by coding 5 CSS and 1 SASS real world websites.,[Sofiullah],udemy,,
4,614858f0ad02b3194a58be75,How to Heal Your Inner Child,4267576.0,$44.99,/course/how-to-heal-your-inner-child/,Jungian Archetype Discovery,[Patrisha],udemy,,
5,614858f0ad02b3194a58be76,CERTIFIED DATA PRIVACY SOLUTIONS ENGINEER(CDPSE)Practice SET,4269764.0,$19.99,/course/certified-data-privacy-solutions-engineer-cdpse-practice-set-sp/,[LATEST 2021]Cert. Data Privacy Solutions Engineer (CDPSE) Practice Exams,[Pranshi],udemy,,
6,614858f0ad02b3194a58be77,How to Communicate Effectively As a New Manager?,4283106.0,$39.99,/course/how-to-communicate-effectively-as-a-new-manager/,"Master The Tips and Speech to Succeed as a Leader. Learn Communication, Management, and Leadership Skills!",[Dr. Ujjwal Bikram],udemy,,
7,614858f0ad02b3194a58be78,Advanced Conversational Hypnosis: Emotional Trance Method,4210650.0,$19.99,/course/advanced-conversational-hypnosis-emotional-trance-method/,Hypnotise and heal even the most sceptical and resistant clients within minutes - without them knowing what you're doing,[Scott],udemy,,
8,614858f0ad02b3194a58be79,[NEW] Microsof pl-400: Power Platform pl-400 Exam Test,4212580.0,$19.99,/course/microsoft-pl-400-power-platform-pl-400-exam-preparation/,Microsoft Power Platform Functional Consultant (PL-400) Exam with 75+ Q&A's - Nishant Chaudhary,[Nishant],udemy,,
9,614858f0ad02b3194a58be7a,C_TSCM42_67 SAP Certified Application Associate for ECC 6.0,4212620.0,$19.99,/course/c_tscm42_67-sap-certified-application-associate-pp-ecc-6/,C_TSCM42_67 SAP Certified Application Associate – Production Planning & Manufacturing with SAP ERP 6.0 EHP7 Q&A series,[Praveen],udemy,,


In [201]:
# Count missing values per column. Columns that exist in only one source are
# NaN for every row that came from the other source.
df.isna().sum()

_id              0  
title            0  
id               242
price            0  
link             242
description      0  
instructors      31 
source           0  
level            100
prerequisites    100
dtype: int64

# Drop unused columns

In [202]:
# Remove, in place, the identifier, link and free-text columns.
unused_columns = ['id', '_id', 'link', 'prerequisites', 'instructors', 'description']
df.drop(columns=unused_columns, inplace=True)

In [203]:
# Display the frame after the column drop.
df

Unnamed: 0,title,price,source,level
0,Automotive Engineering; Digital Powertrain Control Systems,$29.99,udemy,
1,The Art of Sampling with Ableton,$59.99,udemy,
2,Concrete and steel structures design,$89.99,udemy,
3,Complete Guide to Bootstrap 5 with 6 Real World Projects,$124.99,udemy,
4,How to Heal Your Inner Child,$44.99,udemy,
5,CERTIFIED DATA PRIVACY SOLUTIONS ENGINEER(CDPSE)Practice SET,$19.99,udemy,
6,How to Communicate Effectively As a New Manager?,$39.99,udemy,
7,Advanced Conversational Hypnosis: Emotional Trance Method,$19.99,udemy,
8,[NEW] Microsof pl-400: Power Platform pl-400 Exam Test,$19.99,udemy,
9,C_TSCM42_67 SAP Certified Application Associate for ECC 6.0,$19.99,udemy,


In [213]:
# Show complete frames: no row, column or cell-text truncation.
# None is the "unlimited" value for display.max_colwidth; the previously used
# -1 is deprecated and produces a warning.
pd.set_option("display.max_rows", None, "display.max_columns", None, "display.max_colwidth", None)

  pd.set_option("display.max_rows", None, "display.max_columns", None,'display.max_colwidth', -1)


# Label encoding of the "source" and "level" columns

In [204]:
# Swap the platform names for integer codes; values not in the table are left as-is.
source_codes = {'udemy': 1, 'udacity': 2}
df['source'] = df['source'].replace(source_codes)
df

Unnamed: 0,title,price,source,level
0,Automotive Engineering; Digital Powertrain Control Systems,$29.99,1,
1,The Art of Sampling with Ableton,$59.99,1,
2,Concrete and steel structures design,$89.99,1,
3,Complete Guide to Bootstrap 5 with 6 Real World Projects,$124.99,1,
4,How to Heal Your Inner Child,$44.99,1,
5,CERTIFIED DATA PRIVACY SOLUTIONS ENGINEER(CDPSE)Practice SET,$19.99,1,
6,How to Communicate Effectively As a New Manager?,$39.99,1,
7,Advanced Conversational Hypnosis: Emotional Trance Method,$19.99,1,
8,[NEW] Microsof pl-400: Power Platform pl-400 Exam Test,$19.99,1,
9,C_TSCM42_67 SAP Certified Application Associate for ECC 6.0,$19.99,1,


In [205]:
# Rows without a level (NaN after the concat) get the placeholder 0.
level_filled = df['level'].fillna(0)
df['level'] = level_filled
df

Unnamed: 0,title,price,source,level
0,Automotive Engineering; Digital Powertrain Control Systems,$29.99,1,0
1,The Art of Sampling with Ableton,$59.99,1,0
2,Concrete and steel structures design,$89.99,1,0
3,Complete Guide to Bootstrap 5 with 6 Real World Projects,$124.99,1,0
4,How to Heal Your Inner Child,$44.99,1,0
5,CERTIFIED DATA PRIVACY SOLUTIONS ENGINEER(CDPSE)Practice SET,$19.99,1,0
6,How to Communicate Effectively As a New Manager?,$39.99,1,0
7,Advanced Conversational Hypnosis: Emotional Trance Method,$19.99,1,0
8,[NEW] Microsof pl-400: Power Platform pl-400 Exam Test,$19.99,1,0
9,C_TSCM42_67 SAP Certified Application Associate for ECC 6.0,$19.99,1,0


In [206]:
# Turn the level labels into integer codes. 0 is the fillna placeholder and
# stays 0; a blank level string ("") becomes 4.
level_codes = {0: 0, 'beginner': 1, 'intermediate': 2, 'advanced': 3, "": 4}
df['level'] = df['level'].replace(level_codes)
df

Unnamed: 0,title,price,source,level
0,Automotive Engineering; Digital Powertrain Control Systems,$29.99,1,0
1,The Art of Sampling with Ableton,$59.99,1,0
2,Concrete and steel structures design,$89.99,1,0
3,Complete Guide to Bootstrap 5 with 6 Real World Projects,$124.99,1,0
4,How to Heal Your Inner Child,$44.99,1,0
5,CERTIFIED DATA PRIVACY SOLUTIONS ENGINEER(CDPSE)Practice SET,$19.99,1,0
6,How to Communicate Effectively As a New Manager?,$39.99,1,0
7,Advanced Conversational Hypnosis: Emotional Trance Method,$19.99,1,0
8,[NEW] Microsof pl-400: Power Platform pl-400 Exam Test,$19.99,1,0
9,C_TSCM42_67 SAP Certified Application Associate for ECC 6.0,$19.99,1,0


In [207]:
# List the distinct level values to check that only integer codes remain.
df['level'].unique()

array([0, 2, 1, 3, 4], dtype=int64)

In [208]:
# Re-count missing values per column after the drop and the level fill.
df.isna().sum()

title     0
price     0
source    0
level     0
dtype: int64

In [210]:
# Clean the text in the title column (in preparation for an NLP model) and
# convert the price strings to floats.
#
# Every str.replace call states regex= explicitly: relying on the default is
# fragile because pandas changed it, and with regex=False the patterns below
# would be matched as literal text and silently do nothing.
stopwords = nltk.corpus.stopwords.words('english')
stopword_set = set(stopwords)  # set membership instead of scanning the list per word

# Strip HTML tags, drop colons, then turn every remaining non-letter into a space.
df['title'] = df['title'].str.replace(r'<[^<>]*>', '', regex=True)
df['title'] = df['title'].str.replace(':', '', regex=False)
df['title'] = df['title'].str.replace(r'[^a-zA-Z]', ' ', regex=True)

# Remove the literal dollar sign and parse as float. astype(str) lets the cell
# be re-run after price has already been converted to float.
df['price'] = df['price'].astype(str).str.replace('$', '', regex=False).astype(float)

# Remove any punctuation that is left; re.escape keeps characters such as
# ']', '\\', '^' and '-' from being read as character-class syntax.
df['title'] = df['title'].str.replace('[{}]'.format(re.escape(string.punctuation)), '', regex=True)

# Drop stopwords. Words are lowercased for the comparison so that capitalised
# title words (e.g. "The", "How") are filtered as well.
df['title'] = df['title'].apply(
    lambda text: ' '.join(word for word in text.split() if word.lower() not in stopword_set)
)

# Model 

In [192]:
# Features: the encoded platform and difficulty level. Target: the course price.
# NOTE(review): the Udacity prices were imputed from `level` earlier in this
# notebook, so for those rows the target is a fixed function of a feature and
# the scores below will look optimistic.
feature_columns = ['source', 'level']
X = df[feature_columns]
y = df['price']
# A fixed random_state keeps the train/test partition the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)

In [212]:
# Disabled experiment: the vectorizer code below is wrapped in a string literal
# so it is never executed; the cell only echoes the string back as its output.
# NOTE(review): X_train holds the 'source' and 'level' columns, not text, so
# this would need a text column before it could be re-enabled.
'''
#Step 2-3: Pre-process and Vectorize train and test data
vect = CountVectorizer(max_features=500) 
#clean is a function we defined for pre-processing, seen in the notebook.
X_train_dtm = vect.fit_transform(X_train)
X_test_dtm = vect.transform(X_test)
print(X_train_dtm.shape, X_test_dtm.shape)
'''

'\n#Step 2-3: Pre-process and Vectorize train and test data\nvect = CountVectorizer(max_features=500) \n#clean is a function we defined for pre-processing, seen in the notebook.\nX_train_dtm = vect.fit_transform(X_train)\nX_test_dtm = vect.transform(X_test)\nprint(X_train_dtm.shape, X_test_dtm.shape)\n'

In [194]:
# Candidate regressors as (display name, estimator) pairs.
# The forest is seeded so its score is reproducible from run to run, matching
# the seeded train/test split; SVR is left at its defaults.
models = [
    ("SVR", SVR()),
    ("RandomForest", RandomForestRegressor(random_state=1)),
]

In [195]:
# Fit every candidate on the training split and score it on the held-out split.
# NOTE: the list and the printed label say "accuracy", but the value stored is
# the R^2 score returned by r2_score.
names = []
accuracy = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred_class = model.predict(X_test)  # regressor output, despite the "_class" suffix
    names.append(name)
    accuracy.append(r2_score(y_test, y_pred_class))

# One summary line per model, in the order the models were evaluated.
for label, score in zip(names, accuracy):
    print("{} accuracy = {:.3f}".format(label, score))

SVR accuracy = 0.730
RandomForest accuracy = 0.829
