In [1]:
import pandas as pd

# Read the Udemy course catalogue from a CSV path relative to the working directory.
df = pd.read_csv(filepath_or_buffer="udemy_courses.csv")

# Peek at the first five rows to confirm the file parsed as expected.
df.head(n=5)


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18T20:58:58Z,Business Finance
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09T16:34:20Z,Business Finance
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19T19:26:30Z,Business Finance
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30T20:07:24Z,Business Finance
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13T14:57:18Z,Business Finance


In [2]:
# Summarise the frame: row count, per-column non-null counts, dtypes and memory usage.
# Used here to spot missing values and columns still stored as generic `object`.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   course_id            3678 non-null   int64  
 1   course_title         3678 non-null   object 
 2   url                  3678 non-null   object 
 3   is_paid              3678 non-null   bool   
 4   price                3678 non-null   int64  
 5   num_subscribers      3678 non-null   int64  
 6   num_reviews          3678 non-null   int64  
 7   num_lectures         3678 non-null   int64  
 8   level                3678 non-null   object 
 9   content_duration     3678 non-null   float64
 10  published_timestamp  3678 non-null   object 
 11  subject              3678 non-null   object 
dtypes: bool(1), float64(1), int64(5), object(5)
memory usage: 319.8+ KB


In [3]:
# Parse the timestamp strings (e.g. "2017-01-18T20:58:58Z") into timezone-aware datetimes.
# errors='coerce' replaces unparseable values with NaT instead of raising, so we
# compare the missing-value count before and after to surface anything that was lost.
missing_before = df['published_timestamp'].isna().sum()
df['published_timestamp'] = pd.to_datetime(df['published_timestamp'], errors='coerce')
coerced_to_nat = df['published_timestamp'].isna().sum() - missing_before
if coerced_to_nat:
    # Only printed when parsing actually dropped data; silent on a clean column.
    print(f"Warning: {coerced_to_nat} timestamp(s) could not be parsed and were set to NaT")

# Re-inspect dtypes to confirm the column is now datetime rather than object.
print("Timestamp converted ✔")
df.info()


Timestamp converted ✔
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3678 entries, 0 to 3677
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype              
---  ------               --------------  -----              
 0   course_id            3678 non-null   int64              
 1   course_title         3678 non-null   object             
 2   url                  3678 non-null   object             
 3   is_paid              3678 non-null   bool               
 4   price                3678 non-null   int64              
 5   num_subscribers      3678 non-null   int64              
 6   num_reviews          3678 non-null   int64              
 7   num_lectures         3678 non-null   int64              
 8   level                3678 non-null   object             
 9   content_duration     3678 non-null   float64            
 10  published_timestamp  3678 non-null   datetime64[ns, UTC]
 11  subject              3678 non-null   object             
dtypes: bool(1), datetime64[ns, UTC](1), float64(1), int64(5), object(4)

In [4]:
# Cleaning pipeline expressed as one chain: every step yields a new frame
# rather than mutating the existing one in place.
df = (
    df
    # Discard rows that are exact duplicates across all columns.
    .drop_duplicates()
    .assign(
        # Revenue per course: subscriber count multiplied by the listed price.
        revenue=lambda courses: courses['num_subscribers'] * courses['price'],
        # Normalise level labels: trim surrounding whitespace, then Title Case.
        level=lambda courses: courses['level'].str.strip().str.title(),
        # Calendar year taken from the parsed publication timestamp.
        published_year=lambda courses: courses['published_timestamp'].dt.year,
    )
)

print("Data cleaning completed successfully ✔")

# Preview the cleaned frame, including the two derived columns.
df.head()


Data cleaning completed successfully ✔


Unnamed: 0,course_id,course_title,url,is_paid,price,num_subscribers,num_reviews,num_lectures,level,content_duration,published_timestamp,subject,revenue,published_year
0,1070968,Ultimate Investment Banking Course,https://www.udemy.com/ultimate-investment-bank...,True,200,2147,23,51,All Levels,1.5,2017-01-18 20:58:58+00:00,Business Finance,429400,2017
1,1113822,Complete GST Course & Certification - Grow You...,https://www.udemy.com/goods-and-services-tax/,True,75,2792,923,274,All Levels,39.0,2017-03-09 16:34:20+00:00,Business Finance,209400,2017
2,1006314,Financial Modeling for Business Analysts and C...,https://www.udemy.com/financial-modeling-for-b...,True,45,2174,74,51,Intermediate Level,2.5,2016-12-19 19:26:30+00:00,Business Finance,97830,2016
3,1210588,Beginner to Pro - Financial Analysis in Excel ...,https://www.udemy.com/complete-excel-finance-c...,True,95,2451,11,36,All Levels,3.0,2017-05-30 20:07:24+00:00,Business Finance,232845,2017
4,1011058,How To Maximize Your Profits Trading Options,https://www.udemy.com/how-to-maximize-your-pro...,True,200,1276,45,26,Intermediate Level,2.0,2016-12-13 14:57:18+00:00,Business Finance,255200,2016


In [5]:
# Ten most-subscribed courses, keeping only the title and subscriber count.
top_courses = (
    df[['course_title', 'num_subscribers']]
    .nlargest(n=10, columns='num_subscribers')
)
top_courses


Unnamed: 0,course_title,num_subscribers
2827,Learn HTML5 Programming From Scratch,268923
3032,Coding for Entrepreneurs Basic,161029
3230,The Web Developer Bootcamp,121584
2783,Build Your First Website in 1 Week with HTML5 ...,120291
3232,The Complete Web Developer Course 2.0,114512
1896,Free Beginner Electric Guitar Lessons,101154
2589,Web Design for Web Developers: Build Beautiful...,98867
2619,Learn Javascript & JQuery From Scratch,84897
3289,Practical PHP: Master the Basics and Code Dyna...,83737
3247,JavaScript: Understanding the Weird Parts,79612


In [6]:
# Total revenue per subject, ordered from highest to lowest.
subject_revenue = (
    df.groupby(by='subject')['revenue']
    .sum()
    .sort_values(ascending=False)
)
subject_revenue


subject
Web Development        627597400
Business Finance       123735315
Graphic Design          76983170
Musical Instruments     53359055
Name: revenue, dtype: int64

In [7]:
# Mean listed price for each course level, most expensive level first.
avg_price_level = (
    df.groupby(by='level')['price']
    .mean()
    .sort_values(ascending=False)
)
avg_price_level


level
Expert Level          91.120690
All Levels            73.090909
Intermediate Level    61.923990
Beginner Level        55.737382
Name: price, dtype: float64

In [8]:
# Subscriber totals grouped by the year each course was published
# (i.e. keyed on publication year, not on the year learners enrolled).
yearly_enrollment = (
    df.groupby(by='published_year')['num_subscribers']
    .sum()
)
yearly_enrollment


published_year
2011     119028
2012     555339
2013    1680153
2014    1930406
2015    3475324
2016    2966644
2017     988941
Name: num_subscribers, dtype: int64

In [9]:
# The dataset has no instructor column (see the df.info() listing), so this ranking
# is per *subject*, not per instructor. It is the same aggregation as
# `subject_revenue`, so reuse that Series instead of recomputing it and keep
# at most the top ten entries.
top_subjects_by_revenue = subject_revenue.head(10)

# Legacy alias: the original cell bound this result to `top_instructors`; keep the
# name so any later reference still resolves.
top_instructors = top_subjects_by_revenue
top_subjects_by_revenue


subject
Web Development        627597400
Business Finance       123735315
Graphic Design          76983170
Musical Instruments     53359055
Name: revenue, dtype: int64

In [10]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
df.to_csv(path_or_buf="EdTech_Cleaned.csv", index=False)
print("✔ Cleaned file exported successfully!")


✔ Cleaned file exported successfully!
