In [316]:
import requests
from urllib.request import urlopen as uReq 
import urllib.error 
from bs4 import BeautifulSoup as soup
import pandas as pd
import re # regex
import numpy as np



# Making a course object
## Every course has a name, description, course provider and an institution its taught by. These are all strings. Also a url. 
## Every course has a pandas DF of related Courses, a pandas DF of reviews, and a pandas DF of attributes(aka tags)
## the pandas DF of related Courses has the following columns: [course_name,Insitution, MOOC]
## The pandas DF of reviews has the following columns: [num_rating,review_text, completion_status, difficulty]
num rating and difficulty are from 0-4 numerical ratings (0 = 1 star, 1= 2 stars, and so on.) (difficulty: 0=v.easy, 1=easy, 2=medium, etc).
Review text is a string of the entire review (god bless your memory)
completion status is a binary variable(completed/in progress)



In [374]:
class Course:
    def __init__(self,name, relatedCourses=None, description=None, reviews=None,provider=None, attributes=None,url=None, institution=None,numAdditionalInfo=0):
        # a bunch of instance vars lol
        self.name = name
        self.relatedCourses = relatedCourses
        self.description = description
        self.reviews = reviews
        self.institution = institution
        self.provider = provider
        self.attributes = attributes
        self.url = url
        
        self.numAdditionalInfo=numAdditionalInfo #added this later on, its the number of people who added additonal info on their website.
        
    # getters and setters
    def getNumAdditionalInfo(self):
        return self.numAdditionalInfo
    def getName(self):
        return self.name
    def getRelatedCourses(self):
        return self.relatedCourses
    def getDescription(self):
        return self.description
    def getReviews(self):
        return self.reviews
    def getCourseProvider(self):
        return self.provider
    def getAttrs(self):
        return self.attributes
    def getUrl(self):
        return self.url
    def setName(self,newName):
        self.name=newName
        return
    def setNumAdditionalInfo(self,newnumAdditionalInfo):
        self.numAdditionalInfo = newnumAdditionalInfo
        return
    def setRelatedCourses(self,newRelatedCourses):
        self.relatedCourses=newRelatedCourses
        return
    def setDescription(self,newDescript):
        self.description = newDescript
        return
    def setReviews(self,newReviews):
        self.reviews =newReviews
        return
    def setinstitution(self,newinstitution):
        self.institution = newinstitution
        return
    def setProvider(self,newprovider):
        self.provider = newprovider
        return
    def setAttrs(self,newAttrs):
        self.attributes = newAttrs
        return
    def setUrl(self,newUrl):
        self.url = newUrl
        return
    def grabHTML(self,url):
        uClient = uReq(url)
        page_html = uClient.read()
        uClient.close()
        html = soup(page_html,'html.parser')
#        except urllib.URLError:
#            html = None
#            print("URLError: ",url)
#        except urllib.gaierror:
#            html = None
#            print('bad link: ', url)
        
        return html
    def getReviewText(self,review):
        review = review.findAll('div',{'class':'review-content text-2 margin-vert-small'})
        return review[0].text.strip()
    def getRating(self,review):
        review = review.findAll('span',{'class':'review-rating medium-up-hidden text--charcoal'})
        return review[0].text.strip()
    def getStatus(self,review):
        review = review.findAll('span',{'class':'text--italic'})[0].text.strip()
        if review == 'completed this course.':
            return 1
        else:
            self/setNumAdditionalInfo((getNumAdditionalInfo()+1)) # theres additonal info
            return 0
    def formBaseUrl(self,url):
        return url+"?start="
        
    def editUrl(self,url,step):
        url= self.formBaseUrl(self.getUrl())
        url =url+str(step)
        return url
    def getAdditionalInfo(self,containers):
        containers = containers.findAll('div',{'id':'reviews-items'})

        rows =containers[0].findAll('div',{'class':'review-title title-with-image margin-top-xsmall text-2'})
        difficultyText = ['very easy','easy','medium','hard','very hard']

        output = []
        for i in range(len(rows)):
            rowInfo =[]
            if 'completed this course' in rows[i].text.strip():
                rowInfo.append(1)
            else:
                rowInfo.append(0)
            if 'spending' in rows[i].text.strip():
                #get hours
                hours = int([(inte,pos) for inte, pos in enumerate(rows[i].text.strip()) if pos.isdigit()][0][1])
                rowInfo.append(hours)
                # get difficulty
                dif = [re.search(j, rows[i].text.strip())== None for j in difficultyText]
                rowInfo.append([j for j in range(len(dif)) if dif[j] == False][0])
            else:
                rowInfo.extend([None,None])
            output.append(rowInfo)
        return output  
    def reviewFilter(self,soup): #returns a list of reviews 
        #print(soup)
        soup = soup.findAll('div',{'id':'reviews-items'})
        reviewList = soup[0].findAll('div',{'class':'border-all border--gray-xlight radius padding-large single-review margin-top-medium margin-bottom-large'})
        return reviewList
    def getNumberOfReviews(self,soup):
        x = soup.findAll('a',{'id':'read-reviews'})
        x= x[0].findAll('span',{'class':'text--underline inline-block padding-right-xxsmall'})
        x = int(re.findall('\d+',x[0].text.strip())[0])
        return x
       # (re.findall('\d+', str1 ))
    def processReviews(self, listReviewSoup, reviewDF,page_soup):
        reviewDF = pd.DataFrame(index=reviewDF.index, columns=reviewDF.columns)
        for idx in range(len(listReviewSoup)):
            review = listReviewSoup[idx]
            additionalInfo = self.getAdditionalInfo(page_soup)
            #make dataframe to append
            appender = pd.Series([self.getReviewText(review),self.getRating(review),additionalInfo[idx][0],additionalInfo[idx][2],additionalInfo[idx][1]],
                        index=[reviewDF.columns])
            #print(appender.shape)
            reviewDF.iloc[idx,:] = appender.values
        self.setReviews(reviewDF)
        return reviewDF
    def updateReviews(self):
        columns = ['reviewText','reviewRating','completionStatus','hoursWeekly','difficulty']#refactor to read in the csv headers so we dont have to do this every time
      
        page_soup= self.grabHTML(self.getUrl())
        #sanity check
        #print(page_soup)
        numReviews = range(self.getNumberOfReviews(page_soup))
        #sanity check:  print(numReviews)
        reviewDF = pd.DataFrame(columns=columns,index=(numReviews))
        revHTML = self.reviewFilter(page_soup)
        print(len(revHTML))
        
        if len(numReviews) > 20:
            #url = self.formBaseUrl(self.getUrl())+str('0')
            extensions=  [i for i in numReviews if i%20==0]
            #print(extensions)
            multiDataFrames =[]
            for rev in extensions:
                url = self.editUrl(self.getUrl(),rev)

                print(url)
                html = self.grabHTML(url)
                review = self.reviewFilter(html)
                df= self.processReviews(review,reviewDF,page_soup)
                #print(df)
                
                multiDataFrames.append(df)
                if len(multiDataFrames) > 1:
                    multiDataFrames=[multiDataFrames[0].append(multiDataFrames[1])]
                    print(multiDataFrames)
            almost =multiDataFrames[0]
            done = almos(t.dropna(subset=['reviewText'])
            done.index= numReviews
            self.setReviews(done)
        else:
            self.processReviews(revHTML,reviewDF,page_soup)
        # now to get specific columns
        
        
        

In [375]:
trial = Course('dino')
trial.setUrl('https://www.class-central.com/course/kadenze-creative-applications-of-deep-learning-with-tensorflow-6679')
trial.updateReviews()
#trial.getReviews()

20
https://www.class-central.com/course/kadenze-creative-applications-of-deep-learning-with-tensorflow-6679?start=0
https://www.class-central.com/course/kadenze-creative-applications-of-deep-learning-with-tensorflow-6679?start=20
[                                           reviewText reviewRating  \
0   This is a great course -- the approach is quit...          5.0   
1   Very high quality materials and video. with in...          3.0   
2   Fun and insightful combination of learning Ten...          4.0   
3   I have an undergraduate degree in computer sci...          5.0   
4   The instructor seems very active on the forums...          5.0   
5   I think  this course is excellent and inspirin...          5.0   
6   This is a very hard course! But this is also a...          5.0   
7   Fantastic course, moves very fast, huge learni...          5.0   
8   This course has decent content, but the pedago...          2.0   
9   This course is really great to learn TensorFlo...          5.0   

In [377]:
x=trial.getReviews()
x

Unnamed: 0,reviewText,reviewRating,completionStatus,hoursWeekly,difficulty
0,This is a great course -- the approach is quit...,5.0,1,,
1,Very high quality materials and video. with in...,3.0,1,,
2,Fun and insightful combination of learning Ten...,4.0,0,,
3,I have an undergraduate degree in computer sci...,5.0,0,3.0,1.0
4,The instructor seems very active on the forums...,5.0,0,,
5,I think this course is excellent and inspirin...,5.0,0,,
6,This is a very hard course! But this is also a...,5.0,1,,
7,"Fantastic course, moves very fast, huge learni...",5.0,1,,
8,"This course has decent content, but the pedago...",2.0,0,2.0,1.0
9,This course is really great to learn TensorFlo...,5.0,1,,
