# Import packages 

In [2]:
from selenium import webdriver
from bs4 import BeautifulSoup
import requests
import time 
import os
import pandas as pd

chromedriver = "/Applications/chromedriver"
os.environ["webdriver.chrome.driver"] = chromedriver

In [3]:
url = 'https://www.udemy.com/courses/design/'
page = requests.get(url).text
soup = BeautifulSoup(page, 'lxml')
driver = webdriver.Chrome(executable_path=chromedriver)
driver.get(url)

# Perform web Scraping 

In [4]:
def extract_record(item):
    try :
        title = item.find('div',{'class':'udlite-focus-visible-target udlite-heading-md course-card--course-title--vVEjC'}).text
    except :
        title = None
    try :
        description = item.find('p',{'class':'udlite-text-sm course-card--course-headline--2DAqq'}).text
    except :
        description = None
    try :
        price = item.find('div', {'class':'price-text--price-part--2npPm course-card--discount-price--1bQ5Q udlite-heading-md'}).text 
    except :
        price = None
    try :
        rating = item.find('span',{'class':'star-rating--star-wrapper--1QyBg'}).find('span').text
    except :
        rating = None
    try :
        reviews = item.find('span',{'class':'udlite-text-xs course-card--reviews-text--1yloi'}).text
    except :
        reviews = None
    
    if item.find('s') == None :
        discount = None
    else :
        discount = item.find('s').find('span').text
    try :
        trainer = item.find('div',{'class':'udlite-text-xs course-card--instructor-list--nH1OC'}).text
    except :
        trainer = None
    course_info = item.find('div',{'class':'udlite-text-xs course-card--row--29Y0w course-card--course-meta-info--2jTzN'}).find_all('span')
    
    if len(course_info) == 3 :
        total_hours = course_info[0].text
        total_lectures = course_info[1].text
        level = course_info[2].text
    else :
        total_hours = None
        total_lectures = None
        try:
            level = course_info[1].text
        except :
            level = None
    
    result = (title,description,price,rating,reviews,discount,trainer,total_hours,total_lectures,level) 
    
    
    return result

In [5]:
def get_data() :
    records=[]

    for i in range(1,555):
        url = 'https://www.udemy.com/courses/design/?p='+str(i)

        print(url)
        driver.get(url)
        time.sleep(10)
        soup = BeautifulSoup(driver.page_source, "html5lib")

       
        div = soup.find(class_='course-list--container--3zXPS')
        results = div.find_all('a',{'class':'udlite-custom-focus-visible browse-course-card--link--3KIkQ'})
        
        
        
        for item in results:
            records.append(extract_record(item))
        time.sleep(5)
        
    return records
    

In [6]:
records = get_data()

https://www.udemy.com/courses/design/?p=1


AttributeError: 'NoneType' object has no attribute 'find_all'

# Definig the data frame 

In [None]:
df = pd.DataFrame(records, columns=['title','description','price','rating','reviews',
                                    'discount','trainer','total_hours','total_lectures',
                                    'level'])

In [None]:
df.info()

In [None]:
df.to_csv('udemy_development_data_9k.csv')

# Data cleaning 

In [None]:
df.columns

In [None]:
df.drop(columns="Unnamed: 0",inplace=True)

In [None]:
df.info()

In [None]:
df.dropna(subset =['description','total_hours','total_lectures'], inplace=True)

In [None]:
df.info()

In [None]:
df.trainer = df.trainer.str.replace(',','')

In [None]:
df.trainer = df.trainer.str.split().str[0] +' '+ df.trainer.str.split().str[1]

In [None]:
import re
def find_number(text):
    num = re.findall(r'[0-9]+',text)
    return "".join(num)
def find_rating(text):
    num = re.findall(r'[0-9]+',text)
    num.pop()
    return ".".join(num)

In [None]:
df.reviews = df['reviews'].apply(lambda x: find_number(x)).astype(int)

In [None]:
df.rating = df['rating'].apply(lambda x: find_rating(x)).astype(float)

In [None]:
df.total_hours = df['total_hours'].str.extract(r'(\d+.\d*)').astype('float')

In [None]:
df.discount = df['discount'].str.extract(r'(\d+.\d*)').astype('float')

In [None]:
df.total_lectures = df['total_lectures'].apply(lambda x: find_number(x)).astype(int)

In [None]:
df.price = df['price'].str.extract(r'(\d+.\d*)').astype('float')

In [None]:
df.level.unique()

In [None]:
df.to_csv('cleaned_udemy_data_9k.csv')

In [None]:
df.info()

In [None]:
df['price'].fillna((df['price'].mean()), inplace=True)

In [None]:
df['discount'].fillna((df['discount'].mean()), inplace=True)

In [None]:
df.info()

In [None]:
df.to_csv('cleaned_udemy_data_9k_v3.csv')

# Visualization

In [None]:
import seaborn as sns

In [None]:
sns.pairplot(df)