<b>Udemy, founded in May 2010, is an American online learning platform aimed at professional adults and students. As of Jan 2020, the platform has more than 50 million students and 57,000 instructors teaching courses in over 65 languages.
Students take courses largely as a means of improving job-related skills. Some courses generate credit toward technical certification. Udemy has made a special effort to attract corporate trainers seeking to create coursework for employees of their company. As of 2020, there are more than 150,000 courses on the website.
</b>

<b> This dataset contains 3,682 records of courses from 4 subjects (Business Finance, Graphic Design, Musical Instruments and Web Development) taken from Udemy.
Udemy is a massive open online course (MOOC) platform that offers both free and paid courses. Anybody can create a course, a business model that has allowed Udemy to host hundreds of thousands of courses.</b>

<img src= "https://www.freelancinggig.com/blog/wp-content/uploads/2018/10/Programming-Language-used-for-Udemy.jpg">

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
from math import sqrt
from matplotlib import pyplot as plt
%matplotlib inline
import plotly.express as px
import plotly.io as pio 
pio.templates.default = "plotly_white"

#


from scipy import interp
import math
from scipy.stats import norm
from scipy import stats

#

import warnings
warnings.filterwarnings('ignore')
pd.options.display.max_columns = 50

#

from sklearn.model_selection import StratifiedShuffleSplit, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, mean_absolute_error,make_scorer,r2_score
from sklearn.inspection import plot_partial_dependence
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split,cross_val_score

from sklearn.linear_model import Lasso, Ridge, SGDRegressor,LinearRegression,RidgeCV
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.preprocessing import LabelEncoder,OneHotEncoder,StandardScaler,MinMaxScaler
from sklearn.svm import SVR
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import ExtraTreeRegressor,DecisionTreeRegressor
from sklearn.neural_network import MLPRegressor
import xgboost as xgb
import lightgbm as lgb


In [None]:
# Load the Udemy course catalogue from the Kaggle input directory.
data = pd.read_csv('/kaggle/input/udemy-courses/udemy_courses.csv',parse_dates=True)

# Parse the publication timestamp once and derive every calendar feature from
# that single parsed series instead of re-parsing the column for each feature.
published = pd.to_datetime(data['published_timestamp'])
data['published_timestamp'] = published.dt.date  # keep only the calendar date
data['year'] = published.dt.year
# Fix: 'month' and 'day' were swapped (month was filled from .dt.day and day
# from .dt.month), which mislabelled both features for every later step.
data['month'] = published.dt.month
data['day'] = published.dt.day


In [None]:
data.head()

## Phase I - EDA
* Univariate Analysis
* Bivariate Analysis

In [None]:
data.info()

In [None]:
data.describe()

In [None]:
from pandas_profiling import ProfileReport 
report = ProfileReport(data)
report

In [None]:
def missing_percentage(df):
    """Summarise missing values per column of a DataFrame.

    Returns a DataFrame indexed by column name with two columns: 'Total'
    (number of null entries) and 'Percent' (that count as a percentage of
    the row count, rounded to two decimals). Rows are ordered from the most
    to the fewest missing values.
    """
    null_counts = df.isnull().sum().sort_values(ascending=False)
    null_share = (null_counts / len(df) * 100).round(2)
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    return summary


missing_percentage(data)

In [None]:
sns.pairplot(data, hue="is_paid")

In [None]:
# Bar chart of the price per subject (seaborn's barplot aggregates the rows of
# each category into one bar).
sns.set(style="darkgrid")
plt.subplots(figsize=(15, 8))
ax = sns.barplot(data=data, x="subject", y="price", linewidth=5)

# Titles and axis labels are set on the axes seaborn just drew into.
ax.set_title("Price Distribution Across Courses offered", fontsize=25, loc='center', pad=40)
ax.set_ylabel("Price", fontsize=15)
ax.set_xlabel("subject", fontsize=15);

In [None]:
# Overlay the price density of every subject on a single set of axes.
fig = plt.figure(figsize=(15, 8))

# Fix: 'g' and 'green' render as the same green, so the Graphic Design and
# Web Development curves could not be told apart. Each subject now has its own
# colour.
subject_colors = (
    ('Business Finance', 'gray'),
    ('Graphic Design', 'g'),
    ('Musical Instruments', 'red'),
    ('Web Development', 'blue'),
)
for subject_name, curve_color in subject_colors:
    ax = sns.kdeplot(
        data.loc[data['subject'] == subject_name, 'price'],
        color=curve_color,
        shade=True,
        label=subject_name,
    )

# Request the legend explicitly so the per-curve labels are always displayed.
ax.legend()
plt.title('Price Distribution', fontsize=25, pad=40)
plt.ylabel("Frequency of Price", fontsize=15, labelpad=20)
plt.xlabel("Price", fontsize=15, labelpad=20);

In [None]:
plt.subplots(figsize = (22,10),)
sns.distplot(data.price, bins = 100, kde = True, rug = False, norm_hist=False);

In [None]:
data.head()

In [None]:
# Scatter price against subscriber count: one panel per course level, one
# colour per value of is_paid.
# Fix: FacetGrid's `size` argument was renamed to `height`; the old spelling
# is no longer accepted by current seaborn releases.
g = sns.FacetGrid(data, height=5, hue="is_paid", col="level", margin_titles=True)
g.map(plt.scatter, "price", "num_subscribers", edgecolor="w").add_legend()
g.fig.suptitle("Price by is_paid", size=25)
# Leave head-room so the super-title does not overlap the panel titles.
plt.subplots_adjust(top=0.85)

In [None]:
fig = px.bar(data, x="subject", y="num_subscribers", color="is_paid",hover_name='subject')
fig.show()

In [None]:
# Count the courses per value of is_paid and draw the counts as bars.
df_sub = data['is_paid'].value_counts().reset_index()
df_sub.columns = ['is_paid', 'Counts']
fig = px.bar(
    df_sub,
    x="is_paid",
    y="Counts",
    color='is_paid',
    barmode='group',
    height=400,
)
fig.show()

In [None]:
df_sub = data.subject.value_counts().reset_index()
df_sub.columns = ['subject', 'Counts']
fig = px.pie(df_sub, names='subject', values='Counts', width=500)
fig.update_layout(title="Courses offered")

In [None]:
# Average price per course level, turned back into a flat two-column frame.
mean_price_by_level = data.groupby(["level"])[["price"]].mean()
gd = mean_price_by_level.reset_index()

# Pie chart whose slices are the per-level mean prices.
fig = px.pie(gd, names="level", values="price", template="seaborn")
fig.update_traces(textinfo="percent+label", pull=0.05, rotation=90)
fig.show()

### Top Paid Courses

In [None]:
# Paid courses only.
paid_df = data[data['is_paid'] == True]

# Within each subject, order the courses by subscriber count (largest first),
# flatten the result back to a plain index, then keep the first row per subject.
ranked_paid = (
    paid_df.groupby('subject')
    .apply(lambda grp: grp.sort_values(['num_subscribers'], ascending=False))
    .reset_index(drop=True)
)
top_rated_paid = ranked_paid.groupby('subject').head(1)

# Keep only the columns shown in the summary table.
summary_columns = [
    'course_title',
    'content_duration',
    'published_timestamp',
    'num_subscribers',
    'subject',
]
top_rated_paid = top_rated_paid[summary_columns]
top_rated_paid.style.background_gradient(cmap='coolwarm').set_precision(2)


### Top Free Courses

In [None]:
# Free courses only.
free_df = data[data['is_paid'] == False]

# Within each subject, order the courses by subscriber count (largest first),
# flatten the result back to a plain index, then keep the first row per subject.
ranked_free = (
    free_df.groupby('subject')
    .apply(lambda grp: grp.sort_values(['num_subscribers'], ascending=False))
    .reset_index(drop=True)
)
top_rated_free = ranked_free.groupby('subject').head(1)

# Keep only the columns shown in the summary table.
summary_columns = [
    'course_title',
    'content_duration',
    'published_timestamp',
    'num_subscribers',
    'subject',
]
top_rated_free = top_rated_free[summary_columns]
top_rated_free.style.background_gradient(cmap='coolwarm').set_precision(2)


In [None]:
fig = px.scatter_matrix(top_rated_paid,dimensions=["content_duration", "num_subscribers"], color="course_title")
fig.show()

In [None]:
data.columns

In [None]:
fig = px.scatter(data, x="num_subscribers", y="num_reviews", color="num_reviews", facet_col="is_paid",
           color_continuous_scale=px.colors.sequential.Viridis, render_mode="webgl")
fig.show()

In [None]:
# Animated scatter of subscribers (log x-axis) against lecture count, with one
# facet and one colour per course level and one animation frame per year.
# NOTE(review): range_x / range_y are fixed constants, so points outside
# 100..100000 subscribers or 25..90 lectures fall outside the initial view —
# confirm these limits are intended for this dataset.
# NOTE(review): size_max is passed without a `size=` column; presumably it has
# no effect here — verify against the plotly docs.
fig = px.scatter(data, x="num_subscribers", y="num_lectures", animation_frame="year", animation_group="level"
        , color="level", hover_name="level", facet_col="level",
           log_x=True, size_max=45, range_x=[100,100000], range_y=[25,90])
fig.show()

In [None]:
p = data.sort_values(by=['year'])
p= p.head(200)
fig=px.bar(p,x='course_title', y="price", animation_frame="year", 
           animation_group="course_title", color="course_title", hover_name="course_title")
fig.update_layout(title='Price vs Courses')

In [None]:
fig = px.box(data, x="level", y="price", color="subject", notched=True)
fig.show()

In [None]:
fig = px.box(data, x="subject", y="price", color="subject", notched=True)
fig.show()

In [None]:
# Count plots for three categorical views of the catalogue: price points,
# course levels and the paid/free flag.
plt.rcParams['figure.figsize'] = (20, 8)

# Fix: the column is now passed as the keyword `x`. Current seaborn releases
# treat the first positional argument as `data` and make x/y keyword-only, so
# the positional form no longer plots the column.
for column, title in (
    ('price', 'Distribution of Courses Cost'),
    ('level', 'Distribution of levels'),
    ('is_paid', 'Distribution of Paid/Free Courses'),
):
    sns.countplot(x=data[column], palette='rainbow')
    plt.title(title, fontsize=20)
    plt.show()

In [None]:
plt.rcParams['figure.figsize'] = (30, 50)
sns.catplot(x="level", kind="count",hue ='subject', data=data, col='is_paid');
plt.show()

In [None]:
fig = px.scatter(data, x="num_lectures", y="num_reviews", color="is_paid", marginal_y="violin",
           marginal_x="box", trendline="ols")
fig.show()

In [None]:
fig = px.bar(data, x="price", y="num_lectures",color="is_paid")
fig.show()

In [None]:
fig = px.histogram(data, x="level", y="num_subscribers", color="subject",
                   marginal="violin")
fig.show()

In [None]:
fig = px.histogram(data, x="level", y="num_subscribers", color="is_paid",
                   marginal="box")
fig.show()

In [None]:
# Line plot of subscriber counts by publication year.
x = data['year']
y = data['num_subscribers']
plt.rcParams['figure.figsize'] = (20, 8)
# Fix: x and y are passed by keyword. Current seaborn releases only accept
# `data` positionally, so `sns.lineplot(x, y, ...)` raises a TypeError there.
sns.lineplot(x=x, y=y, color='blue')
plt.title('Year vs Number of Subscribers', fontsize=10)
plt.show()

In [None]:
grss = data.groupby(["level","subject"])[["price"]].mean().reset_index()

fig = px.treemap(grss, path=["level","subject"], values='price',
                  color='price', hover_data=['price'],
                  color_continuous_scale='rainbow')
fig.show()

In [None]:
ms = data.sort_values(by=['num_subscribers'],ascending=False)
ms = ms.head(10)
fig = px.funnel(ms, x='price', y='course_title')
fig.show()

In [None]:
from wordcloud import WordCloud, ImageColorGenerator

# Build one text blob from the distinct course titles and render the 200 most
# frequent words as a word cloud.
text = " ".join(str(title) for title in data.course_title.unique())
wordcloud = WordCloud(max_words=200, colormap='Set2', background_color="white").generate(text)

plt.figure(figsize=(15, 10))
# Interpolation names are lower-case in matplotlib; use the canonical spelling.
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
# Fix: the former `plt.figure(1, figsize=(12, 12))` call placed after drawing
# was removed. It could not resize the image just drawn; it only re-selected
# figure number 1 or opened an extra empty one before `plt.show()`.
plt.show()


In [None]:
fig = px.parallel_categories(data, color="is_paid", color_continuous_scale=px.colors.sequential.solar)
fig.show()

## Phase II- Predictive Analysis
### Predicting Number of Reviews for Paid Udemy Courses

In [None]:
# Modelling set: paid courses only, without identifier/text columns and
# without the now-constant is_paid flag.
dropped_columns = ['course_id', 'url', 'published_timestamp', 'course_title', 'is_paid']
train_data = data[data['is_paid'] == True]
train_data = train_data.drop(columns=dropped_columns)
print("train size is : {}".format(train_data.shape))

In [None]:
# Correlation heat map of the modelling features.
# Fix: train_data still contains the string columns 'level' and 'subject'.
# Recent pandas releases no longer drop non-numeric columns silently in
# DataFrame.corr() and raise instead, so the numeric/boolean columns are
# selected explicitly. This works on old and new pandas alike.
corrmat = train_data.select_dtypes(include=['number', 'bool']).corr()
plt.subplots(figsize=(12, 9))
sns.heatmap(corrmat, vmax=0.9, square=True)

In [None]:
train_df = pd.get_dummies(train_data, columns=['level','subject'])

In [None]:
# Features whose absolute correlation with num_reviews exceeds 0.5
# (num_reviews itself is included, since it correlates perfectly with itself).
corrmat = train_df.corr()
strongly_correlated = corrmat["num_reviews"].abs() > 0.5
top_corr_features = corrmat.index[strongly_correlated]

# Annotated heat map restricted to those features.
plt.figure(figsize=(10, 10))
g = sns.heatmap(train_df[top_corr_features].corr(), annot=True, cmap="winter")

#### Skewness Test

In [None]:
from scipy import stats
from scipy.stats import norm, skew #for some statistics

# Histogram of the target with a fitted normal curve drawn on top.
sns.distplot(train_df['num_reviews'] , fit=norm);

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train_df['num_reviews'])
print( '\n mu = {:.2f} and sigma = {:.2f}\n'.format(mu, sigma))
# Fix: the legend text is now a raw string. '\m' and '\s' are invalid escape
# sequences in a normal string literal (a warning today, an error in future
# Python versions); the rendered text is unchanged.
plt.legend([r'Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='best')
plt.ylabel('Frequency')
plt.title('Review distribution')

# Q-Q plot of the target against a normal distribution.
fig = plt.figure()
res = stats.probplot(train_df['num_reviews'], plot=plt)
plt.show()

In [None]:
train_df.num_reviews = np.log1p(train_df.num_reviews )

In [None]:
train_df.head()

In [None]:
# Separate the target from the feature matrix, then hold out 10% of the rows
# as a test set with a fixed seed.
y = train_df['num_reviews']
train_df = train_df.drop(columns=['num_reviews'])
X_train, X_test, y_train, y_test = train_test_split(
    train_df,
    y,
    test_size=0.1,
    random_state=0,
)

In [None]:
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.metrics import mean_squared_error, make_scorer

In [None]:
n_folds = 5
from sklearn.metrics import make_scorer
from sklearn.model_selection import KFold
scorer = make_scorer(mean_squared_error,greater_is_better = False)


def _rmse_cv(model, features, target):
    """Return the per-fold RMSE of `model` under shuffled, seeded K-fold CV.

    Fix: the splitter object itself is handed to `cross_val_score`. The
    previous code called `.get_n_splits(...)` on it, which returns only the
    integer fold count, so `shuffle=True` and `random_state=42` were silently
    discarded and plain unshuffled folds were used.
    """
    kf = KFold(n_folds, shuffle=True, random_state=42)
    neg_mse = cross_val_score(model, features, target,
                              scoring="neg_mean_squared_error", cv=kf)
    # The scorer reports negated MSE, so flip the sign before the square root.
    return np.sqrt(-neg_mse)


def rmse_CV_train(model):
    """Cross-validated RMSE (one value per fold) of `model` on X_train / y_train."""
    return _rmse_cv(model, X_train, y_train)


def rmse_CV_test(model):
    """Cross-validated RMSE (one value per fold) of `model` on X_test / y_test."""
    return _rmse_cv(model, X_test, y_test)

#### Baseline Model

In [None]:
# Baseline: ordinary least squares fitted on the training split.
lr = LinearRegression()
lr.fit(X_train,y_train)
test_pre = lr.predict(X_test)
train_pre = lr.predict(X_train)
print('rmse on train',rmse_CV_train(lr).mean())
# Fix: this line reports the score computed on the test split but was
# labelled 'rmse on train'.
print('rmse on test',rmse_CV_test(lr).mean())

In [None]:
# Residuals (prediction minus truth) against the predicted value for both splits.
plt.scatter(train_pre, train_pre - y_train, c="blue", label="Training data")
plt.scatter(test_pre, test_pre - y_test, c="black", label="Validation data")
plt.title("Linear regression")
plt.xlabel("Predicted values")
plt.ylabel("Residuals")
plt.legend(loc="upper left")
# Fix: the zero-residual reference line was hard-coded to x in [10.5, 13.5],
# which is unrelated to the scale of the log1p(num_reviews) predictions. It
# now spans whatever range the predictions actually cover.
pred_min = min(train_pre.min(), test_pre.min())
pred_max = max(train_pre.max(), test_pre.max())
plt.hlines(y=0, xmin=pred_min, xmax=pred_max, color="red")
plt.show()

In [None]:
# Compare several regressors by 10-fold cross-validated R^2 on the training split.
seed = 3

# (display name, pipeline step label, estimator). Every estimator is wrapped in
# a pipeline that standardises the features first.
# Fix: two step labels were copy-paste leftovers ("LogReg" for the linear
# regression, "ExtraTrees" for the SVR) and now name the estimator they hold.
candidates = [
    ("LinearRegression", "LinearRegression", LinearRegression()),
    ("XGB", "XGB", XGBRegressor()),
    ("KNN", "KNN", KNeighborsRegressor()),
    ("DTR", "DecisionTrees", DecisionTreeRegressor()),
    ("RFRegressor", "RandomForest", RandomForestRegressor()),
    ("GBRegressor", "GradientBoosting",
     GradientBoostingRegressor(max_features=15, n_estimators=600)),
    ("MLP", "MLP Regressor", MLPRegressor()),
    ("EXT Regressor", "ExtraTrees", ExtraTreeRegressor()),
    ("SV Regressor", "SVR", SVR()),
]
clfs = [
    (display_name, Pipeline([("Scaler", StandardScaler()), (step_label, estimator)]))
    for display_name, step_label, estimator in candidates
]

scoring = 'r2'
n_folds = 10
msgs = []
results, names = [], []

# Fix: `random_state` only has an effect when `shuffle=True`, and current
# scikit-learn releases raise a ValueError for a seeded, unshuffled KFold.
# Shuffling is enabled so the seed is honoured. The same seeded splitter is
# reused for every model so they are all scored on identical folds.
kfold = KFold(n_splits=n_folds, shuffle=True, random_state=seed)

for name, model in clfs:
    cv_results = cross_val_score(model, X_train, y_train,
                                 cv=kfold, scoring=scoring, n_jobs=-1)
    names.append(name)
    results.append(cv_results)
    msg = "%s: %f (+/- %f)" % (name, cv_results.mean(),
                               cv_results.std())
    msgs.append(msg)
    print(msg)

In [None]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [None]:
# Random forest fitted on the training features and scored on both splits.
# Fix: the explicit `criterion='mse'` was dropped. That spelling was renamed
# to 'squared_error' and later removed from scikit-learn. Squared error is the
# default criterion under either name, so omitting the argument trains the same
# model on every version.
forest = RandomForestRegressor(n_estimators=100,
                               random_state=1,
                               n_jobs=-1)
forest.fit(X_train, y_train)
forest_train_pred = forest.predict(X_train)
forest_test_pred = forest.predict(X_test)

print('MSE train data: %.3f, MSE test data: %.3f' % (
    mean_squared_error(y_train, forest_train_pred),
    mean_squared_error(y_test, forest_test_pred)))
print('R2 train data: %.3f, R2 test data: %.3f' % (
    r2_score(y_train, forest_train_pred),
    r2_score(y_test, forest_test_pred)))
# RMSE on the held-out split, in the units of the transformed target.
rms = sqrt(mean_squared_error(y_test, forest_test_pred))
print('Root mean Squared Error for Test Data {} '.format(rms))

### Please Show your Appreciation by Hitting Upvote 🤗!!!!