In [66]:
#importing libraries
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
# import seaborn as sns
import plotly.express as px
from wordcloud import WordCloud, STOPWORDS


In [4]:
# Load the TED talk dataset from its CSV file and preview the first five rows
df = pd.read_csv(filepath_or_buffer="tedtalk_data.csv")
df.head(n=5)

Unnamed: 0,title,author,date,views,likes,link
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...
3,Why is China appointing judges to combat clima...,James K. Thornton,October 2021,427000,12000,https://ted.com/talks/james_k_thornton_why_is_...
4,Cement's carbon problem — and 2 ways to fix it,Mahendra Singhi,October 2021,2400,72,https://ted.com/talks/mahendra_singhi_cement_s...


Questions to explore:
- top 10 viewed/liked videos ✅

- top 10 popular videos (views, likes, and watch time) ✅

- The authors with the most number of talks ✅

- graph each author with the titles of their TED talks

- which month most of the ted talks were released 

- The newest & oldest ted talk in the dataset

- Visualization of the number of TED talks every year



In [14]:
# Dimensions of the dataset, returned as a (rows, columns) tuple
df.shape

(5440, 6)

In [6]:
# Names of the columns in the dataset (returned as a pandas Index)
df.columns

Index(['title', 'author', 'date', 'views', 'likes', 'link'], dtype='object')

In [8]:
# Per-column summary: non-null count and dtype of every column,
# plus the frame's index range and memory usage
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5440 entries, 0 to 5439
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   title   5440 non-null   object
 1   author  5439 non-null   object
 2   date    5440 non-null   object
 3   views   5440 non-null   int64 
 4   likes   5440 non-null   int64 
 5   link    5440 non-null   object
dtypes: int64(2), object(4)
memory usage: 255.1+ KB


In [12]:
# Count the missing values in each column
df.isna().sum()

title     0
author    1
date      0
views     0
likes     0
link      0
dtype: int64

Dealing with Missing values:

1- Delete the row with missing value

2- Imputing the Missing Value 

    - Replacing With Arbitrary Value

    - Replacing With Mean (not appropriate if there are outliers)

    - Replacing With Mode ( categorical features)

    - Replacing With Median (appropriate when there are outliers)

    - Replacing with previous value – Forward fill (timeseries)

    - Replacing with next value – Backward fill (timeseries)

In [16]:
# Display every row whose author value is missing
missing_author = df['author'].isna()
df.loc[missing_author]

Unnamed: 0,title,author,date,views,likes,link
3039,Year In Ideas 2015,,December 2015,532,15,https://ted.com/talks/year_in_ideas_2015


I followed the talk's link to try to find its author (the missing value). It turned out to be a compilation of TED talks from the year 2015 rather than a talk by a single speaker, which is why I decided to deal with the NA by **deleting the row**.

In [17]:
# Drop, in place, every row that contains at least one missing value
df.dropna(axis=0, how='any', inplace=True)

In [19]:
# Checking for duplicates: returns one boolean per row, True when the row
# repeats an earlier row across all columns.
# NOTE(review): the displayed Series is truncated, so it does not by itself
# show whether any duplicate exists; `df.duplicated().sum()` would give a
# single count.
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
5435    False
5436    False
5437    False
5438    False
5439    False
Length: 5439, dtype: bool

In [20]:
# Split the date column into separate numeric year and month columns.
# The date strings are parsed once and both parts are derived from that
# single result, instead of parsing the whole column twice.
parsed_dates = pd.to_datetime(df['date'])
df['year'] = parsed_dates.dt.year
df['month'] = parsed_dates.dt.month
# Preview the first three rows to confirm the two new columns
df.head(3)

Unnamed: 0,title,author,date,views,likes,link,year,month
0,Climate action needs new frontline leadership,Ozawa Bineshi Albert,December 2021,404000,12000,https://ted.com/talks/ozawa_bineshi_albert_cli...,2021,12
1,The dark history of the overthrow of Hawaii,Sydney Iaukea,February 2022,214000,6400,https://ted.com/talks/sydney_iaukea_the_dark_h...,2022,2
2,How play can spark new ideas for your business,Martin Reeves,September 2021,412000,12000,https://ted.com/talks/martin_reeves_how_play_c...,2021,9


In [25]:
# Summary statistics of the view counts, cast to integers for readability
views_stats = df['views'].describe()
views_stats.astype(int)

count        5439
mean      2061954
std       3567316
min          1200
25%        671000
50%       1300000
75%       2100000
max      72000000
Name: views, dtype: int64

In our Ted Talk dataset there's a **minimum of 1200** views, **maximum of 72,000,000** views, and the **average** number of views is **2,061,954**

---



In [26]:
# Summary statistics of the like counts, cast to integers for readability
likes_stats = df['likes'].describe()
likes_stats.astype(int)

count       5439
mean       62619
std       107653
min           37
25%        20000
50%        41000
75%        65000
max      2100000
Name: likes, dtype: int64

In our Ted Talk dataset there's a **minimum of 37** likes, **maximum of 2,100,000** likes, and the **average** number of likes is **62,619**

In [44]:
# Order all talks by view count, highest first, and keep the first ten
talks_by_views = df.sort_values('views', ascending=False)
top10views = talks_by_views.iloc[:10]
top10views

Unnamed: 0,title,author,date,views,likes,link,year,month
5436,Do schools kill creativity?,Sir Ken Robinson,February 2006,72000000,2100000,https://ted.com/talks/sir_ken_robinson_do_scho...,2006,2
4084,Your body language may shape who you are,Amy Cuddy,June 2012,64000000,1900000,https://ted.com/talks/amy_cuddy_your_body_lang...,2012,6
2958,Inside the mind of a master procrastinator,Tim Urban,February 2016,60000000,1800000,https://ted.com/talks/tim_urban_inside_the_min...,2016,2
4765,How great leaders inspire action,Simon Sinek,September 2009,57000000,1700000,https://ted.com/talks/simon_sinek_how_great_le...,2009,9
4605,The power of vulnerability,Brené Brown,June 2010,56000000,1700000,https://ted.com/talks/brene_brown_the_power_of...,2010,6
3504,How to speak so that people want to listen,Julian Treasure,June 2013,49000000,1400000,https://ted.com/talks/julian_treasure_how_to_s...,2013,6
2168,My philosophy for a happy life,Sam Berns,October 2013,43000000,1300000,https://ted.com/talks/sam_berns_my_philosophy_...,2013,10
3251,The next outbreak? We're not ready,Bill Gates,March 2015,43000000,1300000,https://ted.com/talks/bill_gates_the_next_outb...,2015,3
3017,What makes a good life? Lessons from the longe...,Robert Waldinger,November 2015,41000000,1200000,https://ted.com/talks/robert_waldinger_what_ma...,2015,11
3994,"Looks aren't everything. Believe me, I'm a model.",Cameron Russell,October 2012,38000000,1100000,https://ted.com/talks/cameron_russell_looks_ar...,2012,10


In [52]:
# Order all talks by like count, highest first, and keep the first ten
talks_by_likes = df.sort_values('likes', ascending=False)
top10liked = talks_by_likes.iloc[:10]
top10liked

Unnamed: 0,title,author,date,views,likes,link,year,month
5436,Do schools kill creativity?,Sir Ken Robinson,February 2006,72000000,2100000,https://ted.com/talks/sir_ken_robinson_do_scho...,2006,2
4084,Your body language may shape who you are,Amy Cuddy,June 2012,64000000,1900000,https://ted.com/talks/amy_cuddy_your_body_lang...,2012,6
2958,Inside the mind of a master procrastinator,Tim Urban,February 2016,60000000,1800000,https://ted.com/talks/tim_urban_inside_the_min...,2016,2
4765,How great leaders inspire action,Simon Sinek,September 2009,57000000,1700000,https://ted.com/talks/simon_sinek_how_great_le...,2009,9
4605,The power of vulnerability,Brené Brown,June 2010,56000000,1700000,https://ted.com/talks/brene_brown_the_power_of...,2010,6
3504,How to speak so that people want to listen,Julian Treasure,June 2013,49000000,1400000,https://ted.com/talks/julian_treasure_how_to_s...,2013,6
2168,My philosophy for a happy life,Sam Berns,October 2013,43000000,1300000,https://ted.com/talks/sam_berns_my_philosophy_...,2013,10
3251,The next outbreak? We're not ready,Bill Gates,March 2015,43000000,1300000,https://ted.com/talks/bill_gates_the_next_outb...,2015,3
3017,What makes a good life? Lessons from the longe...,Robert Waldinger,November 2015,41000000,1200000,https://ted.com/talks/robert_waldinger_what_ma...,2015,11
3994,"Looks aren't everything. Believe me, I'm a model.",Cameron Russell,October 2012,38000000,1100000,https://ted.com/talks/cameron_russell_looks_ar...,2012,10


In [56]:
# Bar chart of views and likes for the ten most-viewed talks.
# With several y columns, px.bar stacks the traces by default, so each bar's
# total height would be views + likes, which is not a meaningful quantity.
# barmode="group" draws the two measures side by side instead.
fig = px.bar(
    top10views,
    x="title",
    y=["views", "likes"],
    barmode="group",
    title="Top 10 Popular videos based on Views & likes",
)
fig.show()

In [63]:
# Ten authors with the highest number of talks, as an (author, counts) table
top10Authors = (
    df['author']
    .value_counts()
    .rename_axis('author')
    .reset_index(name='counts')
    .head(10)
)
top10Authors

Unnamed: 0,author,counts
0,Alex Gendler,45
1,Iseult Gillespie,33
2,Matt Walker,18
3,Alex Rosenthal,15
4,Elizabeth Cox,13
5,Emma Bryce,12
6,Juan Enriquez,11
7,Daniel Finkel,11
8,Jen Gunter,9
9,Greg Gage,9


In [65]:
# Bar chart: number of talks for each of the ten most prolific authors
fig = px.bar(
    data_frame=top10Authors,
    x="author",
    y="counts",
    title="Top 10 authors with the most number of talks",
)
fig.show()

In [67]:
# Number of talks released in each calendar month, most frequent month first
month_counts = df.value_counts('month')
topMonth = month_counts.rename_axis('month').reset_index(name='counts')
topMonth

Unnamed: 0,month,counts
0,2,725
1,11,682
2,10,585
3,3,580
4,4,576
5,6,493
6,7,446
7,9,349
8,12,334
9,5,322


In [68]:
# Bar chart of how many talks were released in each calendar month.
# The month column is numeric, so plotly draws a continuous x axis and picks
# its own tick spacing, which can leave some months unlabelled.
fig = px.bar(
    topMonth,
    x="month",
    y="counts",
    title="Number of Ted Talk releases each month",
    labels={"month": "Month (1 = January)", "counts": "Number of talks"},
)
# Force one tick per integer so every month number is shown on the axis
fig.update_xaxes(dtick=1)
fig.show()