In [1]:
import numpy as np
import pandas as pd 


import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

In [2]:
# Read the TED talks CSV into the notebook-wide `data` frame and preview its first rows.
data = pd.read_csv(filepath_or_buffer='../input/ted-talks/data.csv')
data.head(n=5)

In [3]:
# Print the frame's structural summary (columns, non-null counts, dtypes).
data.info()

In [4]:
# Split the 'date' column ("<full month name> <4-digit year>", per the '%B %Y' format)
# into separate 'month' and 'year' columns, then discard the original column.
#
# The guard keeps the cell re-runnable: once 'date' has been dropped, running the
# cell a second time would otherwise raise KeyError on data['date'].
if 'date' in data.columns:
    # Parse once and reuse the result for both derived columns.
    parsed_dates = pd.to_datetime(data['date'], format = '%B %Y')
    data['month'] = parsed_dates.dt.month_name()
    data['year'] = parsed_dates.dt.year
    data.drop('date', axis = 1, inplace = True)

In [5]:
# Show five randomly chosen rows; the fixed seed makes the preview identical on every run.
data.sample(5, random_state = 42)

In [6]:
# Descriptive statistics with pandas' default column selection.
data.describe()

In [7]:
# Descriptive statistics restricted to object-dtype columns.
data.describe(include = 'object')

In [8]:
# Count the missing values in each column.
data.isna().sum()

In [9]:
# Remove, in place, every row that has at least one missing value.
data.dropna(axis = 'index', how = 'any', inplace = True)

## Exploratory Data Analysis

* **Finding the most popular TED talks**

In [10]:
# Ten highest-ranked talks, once by view count and once by like count.
# Each frame is sorted in descending order and truncated to its first ten rows.

talks_by_views_desc = data.sort_values(by='views', ascending=False)
most_popular_talks_views = talks_by_views_desc.head(10)

talks_by_likes_desc = data.sort_values(by='likes', ascending=False)
most_popular_talks_likes = talks_by_likes_desc.head(10)

In [11]:
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
# Draw the two top-10 rankings as the left and right panels of ONE figure.
# Previously each chart called plt.figure() and then plt.subplot(1, 2, k), so every
# chart landed on its own canvas with the other half left blank.
fig, (ax_views, ax_likes) = plt.subplots(1, 2, figsize=(20, 6))

# NOTE(review): a line plot joins unrelated talks with a line; a horizontal bar chart
# may communicate a ranking better — chart type left unchanged here.

# plot 1
sns.lineplot(x = 'views', y ='title', data = most_popular_talks_views, color = 'red', ax = ax_views)
ax_views.set_title('Most Popular Ted Talks by Views')
ax_views.set_xlabel('Popularity by Views')
ax_views.set_ylabel('Title')

# plot 2
sns.lineplot(x = 'likes', y ='title', data = most_popular_talks_likes, ax = ax_likes)
ax_likes.set_title('Most Popular Ted Talks by Likes')
ax_likes.set_xlabel('Popularity by Likes')
ax_likes.set_ylabel('Title')

# Make room for the talk titles used as y tick labels so the panels do not overlap.
fig.tight_layout()
plt.show()


* **Finding the most popular TED talk speakers (in terms of number of talks)**

In [13]:
# Count talks per speaker and keep the ten most frequent names; they set the bar order.
talks_per_author = data['author'].value_counts()
author = talks_per_author.sort_values(ascending = False).index[:10]

plt.figure(figsize=(15,10))
speaker_ax = sns.countplot(x = 'author', data = data, order = author)
speaker_ax.set_title('Popularity of top 10 TED Talk Speakers based on their number of speeches')
speaker_ax.set_xlabel('Authors')
speaker_ax.set_ylabel('No. of TED Talks')
plt.show()

* **Month-wise Analysis of TED talk frequency**

In [14]:
# Number of talks per month, with bars ordered from the most to the least frequent month.
plt.figure(figsize=(15,10))
order = data['month'].value_counts().sort_values(ascending = False).index
sns.countplot(x = 'month', data = data, order = order)
# Title and axis labels added so the figure stands on its own, like the other charts here.
plt.title('Month-wise Analysis of TED talk frequency')
plt.xlabel('Month')
plt.ylabel('No. of TED Talks')
plt.show()


* **Year-wise Analysis of TED talk frequency**

In [15]:
# Tally talks per year once, then reuse the tally for both the bar positions and heights.
talks_per_year = data['year'].value_counts()
index = talks_per_year.index
values = talks_per_year.values

plt.figure(figsize=(15,10))
year_ax = sns.barplot(x = index, y = values, palette="deep")
year_ax.set_title('Year-wise Analysis of TED talk frequency')
year_ax.set_xlabel('Year')
year_ax.set_ylabel('No. of TED Talks')
# Vertical tick labels keep the year labels from overlapping.
plt.xticks(rotation = 90)
plt.show()

* **Finding TED talks of your favorite Author**

In [16]:
# My favourite TED Talk Speaker is Bill Gates so I am going with him.
favourite_speaker = 'Bill Gates'

# Keep only the rows whose author matches the chosen speaker.
bill_gates = data[data['author'] == favourite_speaker]

# One bar per talk, bar height = view count.
plt.figure(figsize =(15,10))
sns.barplot(x = 'title', y = 'views',data = bill_gates)
# The title is built from the same name used for the filter; it previously read
# "Steve Jobs" although the frame holds Bill Gates' talks.
plt.title(f'TED Talks by {favourite_speaker}')
plt.xlabel('TED Talks')
plt.ylabel('No. of Views')
plt.xticks(rotation = 45)
plt.show()

* **Finding TED talks with the best like to view ratio**

In [17]:
# Likes per view for every talk, stored as a new column on the shared frame.
data['like_to_view'] = data['likes'].div(data['views'])

In [18]:
# Ten talks with the highest likes-per-view ratio.
top_like_to_view = data.sort_values(by = 'like_to_view', ascending = False)[:10]
plt.figure(figsize = (15,10))
sns.lineplot(y = 'title', x = 'like_to_view', data = top_like_to_view)
# Labels now match the plotted variables: the ratio is on x and the talk titles on y
# (they were previously swapped).
plt.xlabel('Like_To_View_Ratio')
plt.ylabel('TED Talks')
plt.title('TED talks with the best like to view ratio')
plt.xticks(rotation = 45)
plt.show()

* **Finding TED talks by topic keyword in the title (e.g. climate)**


In [19]:
# Talks whose title mentions "climate".
# case=False also matches capitalised forms such as "Climate", which the original
# case-sensitive search skipped; na=False treats a missing title as "no match".
climate_tag = data[data['title'].str.contains('climate', case = False, na = False)]
climate_tag

* **Finding the most popular TED talk speakers (in terms of number of views)**

In [20]:
# Views per author, built from the ten most-viewed talks only.
# NOTE(review): if an author appears more than once in that top-10 frame, barplot
# aggregates their rows with its default estimator (the mean) — confirm that a mean,
# rather than a total, is what this chart should show.
plt.figure(figsize=(15,5))
sns.barplot(x ='author', y = 'views', data = most_popular_talks_views)
plt.title('Most Popular Author by Views')
# Axis labels now match the plotted variables (x = author, y = views); the earlier
# labels were carried over from the talk-title chart.
plt.xlabel('Author')
plt.ylabel('Views')
plt.show()

In [21]:
import plotly.express as px

# Treemap of the top-10 frame: tiles are grouped by author and sized by views.
# hover_data receives the frame itself, exactly as before.
treemap_margin = {'t': 50, 'l': 25, 'r': 25, 'b': 25}
fig = px.treemap(
    most_popular_talks_views,
    path=['author'],
    values='views',
    width=800,
    height=400,
    hover_data = most_popular_talks_views,
)
fig.update_layout(margin = treemap_margin)
fig.show()