In [16]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the Udemy courses dataset. `published_timestamp` is parsed into a datetime
# column at read time so that the `.dt` accessor can be used on it later.
DATA_PATH = '/content/drive/MyDrive/DA-CaseStudies/Datasets/udemy_courses.csv'
df = pd.read_csv(DATA_PATH, parse_dates=['published_timestamp'])

# Fixed seed so the 10-row preview is the same on every Restart & Run All.
df.sample(10, random_state=42)

In [None]:
# Show the dtype pandas assigned to each column of df.
df.dtypes

**1. Display Top 5 rows of the Dataset**

In [None]:
# First five rows (5 is the default row count for head()).
df.head()

**2. Check last 5 rows of the Dataset**

In [None]:
# Last five rows, with the row count spelled out.
df.tail(5)

**3. Find Shape of our Dataset (rows and Columns)**

In [None]:
# Unpack the (rows, columns) pair and display it.
n_rows, n_cols = df.shape
n_rows, n_cols

**4. Get Information about our Dataset: Total Number of Rows, Total Number of Columns, Datatype of Each Column, and Memory Requirement**

In [None]:
# Print the DataFrame summary produced by info().
df.info()

**5. Get Overall Statistics about the DataFrame**

In [None]:
# Summary statistics using describe()'s default column selection.
df.describe()

In [None]:
# Summary statistics restricted to object- and bool-typed columns.
non_numeric_kinds = ['object', 'bool']
df.describe(include=non_numeric_kinds)

**6. Check Null Values in the Dataset**

In [None]:
# True if at least one cell anywhere in df is missing.
df.isna().to_numpy().any()

In [None]:
# Visualise the missing-value mask of df as a heatmap.
null_mask = df.isna()
sns.heatmap(null_mask)

**7. Check Duplicate Data and Drop them**

In [None]:
# Count the rows that duplicated() flags as repeats of another row.
df.duplicated().sum()

In [18]:
# Drop duplicated rows from df in place: this mutates the frame that every
# later cell reads, and the surviving rows keep their original index labels.
df.drop_duplicates(inplace=True)

**8. Find Out the Number of Courses per Subject:**

In [None]:
# Count the non-null course_id values within each subject group.
df.groupby('subject')['course_id'].count()

In [None]:
# Alternative: count the rows for each distinct value of the subject column.
df['subject'].value_counts()

In [None]:
# Bar chart of the number of rows (courses) per subject.
# NOTE(review): recent seaborn releases reportedly warn when `palette` is given
# without `hue` — confirm against the installed version.
ax = sns.countplot(data=df, x='subject', palette='bright')
ax.set_title('Number of Courses per Subjects')
ax.set_xlabel('Subjects', fontsize=14)
ax.set_ylabel('Number of Courses per Subject', fontsize=14)
plt.xticks(rotation=65)  # tilt the tick labels
plt.show()

**9. For Which Levels Does Udemy Provide Courses?**

In [None]:
# Distinct values that appear in the level column.
df['level'].unique()

In [None]:
# Alternative: list each level together with the number of rows that carry it.
df['level'].value_counts()

In [None]:
# Bar chart of the number of rows (courses) per level.
ax = sns.countplot(data=df, x='level', palette='bright')
ax.set_title('Number of Courses per Levels')
ax.set_xlabel('Levels', fontsize=14)
ax.set_ylabel('Number of Courses per Level', fontsize=14)
plt.xticks(rotation=65)  # tilt the tick labels
plt.show()

**10. Display the Count of paid and free Courses**

In [None]:
# Number of rows for each distinct value of is_paid.
df['is_paid'].value_counts()

In [None]:
# Bar chart of the row count for each is_paid value.
sns.countplot(data=df, x='is_paid', palette='bright')

**11. Which Courses Have More Lectures: Free or Paid?**

In [None]:
# Total num_lectures summed over each is_paid group.
# NOTE(review): a sum grows with the number of courses in each group, while the
# bar plot in the next cell passes no explicit estimator — confirm which
# statistic the question is meant to compare.
df.groupby('is_paid')['num_lectures'].sum()

In [None]:
# Bar plot of num_lectures grouped by is_paid (seaborn's default estimator).
sns.barplot(
    x='is_paid',
    y='num_lectures',
    data=df,
    palette='bright',
)
plt.show()

**12. Which Courses have a higher Number of Subscribers Free or Paid ?**

In [None]:
# Average num_subscribers within each is_paid group.
df.groupby('is_paid')['num_subscribers'].mean()

In [None]:
# Bar plot of num_subscribers grouped by is_paid (seaborn's default estimator).
sns.barplot(
    x='is_paid',
    y='num_subscribers',
    data=df,
    palette='bright',
)
plt.show()

**13. Which Level Has the Highest Number of Subscribers?**

In [None]:
# Average num_subscribers within each level group.
df.groupby('level')['num_subscribers'].mean()

In [None]:
# Bar plot of num_subscribers grouped by level (seaborn's default estimator).
sns.barplot(
    x='level',
    y='num_subscribers',
    data=df,
    palette='bright',
)
plt.show()

**14. Find the Most Popular Course Title**

In [None]:
# Select the row(s) whose subscriber count equals the column maximum.
subscribers = df['num_subscribers']
at_max = subscribers.max() == subscribers
df[at_max][['course_title', 'num_subscribers']]

In [None]:
# Alternative: highest subscriber count per course title, largest first.
# `gp` is reused by later cells.
gp = (
    df.groupby('course_title')['num_subscribers']
    .max()
    .sort_values(ascending=False)
)

gp.head(1)

**15. Display the 10 Most Popular Courses by Number of Subscribers**

In [None]:
# The ten rows with the most subscribers; `indexes` holds those full rows and
# is reused by the plotting cell below.
indexes = df.sort_values(by='num_subscribers', ascending=False).head(10)

# The rows are already in hand, so project the two columns of interest directly.
indexes[['course_title', 'num_subscribers']]

In [None]:
# Horizontal bars: one per course in `indexes`, bar length = subscriber count.
sns.barplot(
    x='num_subscribers',
    y='course_title',
    data=indexes,
    palette='bright',
)
plt.xticks(rotation=90)
plt.show()

Another Method

In [None]:
# First ten entries of gp (defined in an earlier cell).
gp.head(10)

In [None]:
# Horizontal bars for the first ten entries of gp: titles on y, counts on x.
top10 = gp.head(10)
sns.barplot(x=top10.values, y=top10.index, palette='bright')
plt.xticks(rotation=90)
plt.show()

**16. Find the Course Which is Having the Highest Number of Reviews**

In [None]:
# Select the row(s) whose review count equals the column maximum.
reviews = df['num_reviews']
at_max = reviews.max() == reviews
df[at_max][['course_title', 'num_reviews']]

**17. Does Price Affect Number of Reviews?**

In [None]:
# Scatter of course price against review count, to eyeball any relationship.
# `palette` was dropped: no `hue` variable is assigned here, so there was
# nothing for it to colour.
ax = sns.scatterplot(data=df, x='price', y='num_reviews')
ax.set_title('Number of Reviews vs. Price')  # so the figure stands alone when skimmed
plt.show()

# Yes

**18. Total Number of Courses Related to python**

In [None]:
# Boolean mask of titles mentioning "python" in any letter case.
is_python = df['course_title'].str.contains('Python', case=False)

# The question asks for a total, so count the True entries rather than
# displaying the mask itself.
is_python.sum()

In [None]:
# List the titles of the Python-related courses.
# The match is case-insensitive so that it agrees with the other Python filters
# in this notebook; a case-sensitive match would drop titles such as "python".
df[df['course_title'].str.contains('Python', case=False)]['course_title']

**19. Display the 10 Most Popular Python Courses by Number of Subscribers**

In [None]:
# Ten most-subscribed courses whose title mentions "python" (any letter case).
# `py` is reused by the plotting cell below.
python_mask = df['course_title'].str.contains('Python', case=False)
py = (
    df[python_mask]
    .sort_values(by='num_subscribers', ascending=False)  # sort key passed via explicit `by=`
    .head(10)[['course_title', 'num_subscribers']]
)
py

In [None]:
# Horizontal bars: one per course in `py`, bar length = subscriber count.
sns.barplot(
    x='num_subscribers',
    y='course_title',
    data=py,
    palette='bright',
)
plt.xticks(rotation=90)
plt.show()

**20. In Which Year Were the Highest Number of Courses Posted?**

In [None]:
# Add a `Year` column holding the year component of each publication timestamp,
# then display the new column.
published = df['published_timestamp']
df['Year'] = published.dt.year
df['Year']

In [None]:
# Count the non-null course_id values within each Year group.
df.groupby('Year')['course_id'].count()

In [None]:
# Bar chart of the number of rows (courses) per publication year.
sns.countplot(x='Year', data=df, palette='bright')
plt.show()

**21. Display the Subject-wise Count of Posted Courses (Year-wise)**

In [None]:
# Frequency table: one row per Year, one column per subject.
pd.crosstab(index=df['Year'], columns=df['subject'])

In [None]:
# Same breakdown in long form: subject value counts within each Year group.
df.groupby('Year')['subject'].value_counts()

In [None]:
# Grouped bar chart: course count per year, with one bar colour per subject.
sns.countplot(
    x='Year',
    hue='subject',
    data=df,
    palette='bright',
)
plt.show()