In [None]:
# Data manipulation libraries
import pandas as pd
import numpy as np

# Data visualization libraries
import matplotlib.pyplot as plt
%matplotlib inline
import matplotlib 
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
import mediapipe as mp

# setting style and rcparams
sns.set_style('darkgrid')
matplotlib.rcParams['font.size'] = 14
matplotlib.rcParams['figure.figsize'] = (7,4)
matplotlib.rcParams['figure.facecolor'] = '#00000000'

# Datetime library for date columns
from datetime import datetime
import datetime as dt

# For detecting multicollinearity (variance inflation factor)
from statsmodels.stats.outliers_influence import variance_inflation_factor

# Preprocessing libraries
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import PowerTransformer

# For build pipeline
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline


# Machine learning models
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Lasso
from sklearn.linear_model import Ridge
from sklearn.linear_model import ElasticNet
from sklearn.tree import DecisionTreeRegressor

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor, AdaBoostRegressor
from sklearn.ensemble import VotingRegressor,StackingRegressor


# for plot decision tree
from sklearn import tree

# Model selection libraries
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import RandomizedSearchCV

# importing XGB regressor
from xgboost import XGBRegressor

# Metrics libraries for model evaluation
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_absolute_error

# Warnings module handles warnings in Python
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Load the raw data from sheet 'Sheet1' of the Excel workbook into `df`.
# NOTE(review): the file name contains a space after "ted" - confirm it matches the file on disk.
excel_file = 'ted _data.xlsx'
sheet_name = 'Sheet1'
df = pd.read_excel(excel_file, sheet_name=sheet_name)

In [None]:
# checking first 3 rows

df.head(3)

In [None]:
# checking last 3 rows

df.tail(3)

In [None]:
# checking random sample of rows

df.sample(3)

In [None]:
#check information about all columns

df.info()

In [None]:
len(df)

In [None]:
# check null values

df.isnull().sum()

In [None]:
#checking duplicated values

df.duplicated().sum()

In [None]:
# describe the numerical dataset

df.describe().T

In [None]:
df.describe(percentiles=[.25,.50,.75,.80,.85,.90,.95,.96,.97,.98,.99])

In [None]:
tedtalks_df = df.copy()

In [None]:
tedtalks_df.head(3)

In [None]:
# Replace missing values in the 'transcript' and 'likes' columns with the
# placeholder string 'no data'; all other columns are left untouched.

values = {'transcript':'no data' , 'likes' : 'no data'}

tedtalks_df = tedtalks_df.fillna(value=values)


In [None]:

tedtalks_df.isnull().sum()

In [None]:
# Cast the count columns to 32-bit integers, then parse both date columns and
# strip any timezone information so they are timezone-naive datetimes.

tedtalks_df = tedtalks_df.astype({'views': 'int32', 'duration': 'int32'})
for date_col in ('published_date', 'recorded_date'):
    parsed_dates = pd.to_datetime(tedtalks_df[date_col])
    tedtalks_df[date_col] = parsed_dates
    tedtalks_df[date_col] = tedtalks_df[date_col].dt.tz_localize(None)

In [None]:
def convert_likes(likes_str):
    """Convert a raw 'likes' value into an integer count.

    Accepted inputs:
      * the placeholder ``'no data'``, an empty string, ``None`` or NaN -> 0
      * integers (including NumPy integers) -> returned as ``int``
      * floats -> truncated towards zero
      * strings such as ``'340'``, ``'1,234'``, ``'72K'`` or ``'8.2M'``; the
        K/M suffix is case-insensitive and surrounding whitespace is ignored.

    Returns:
        int: the number of likes.

    Raises:
        ValueError: if a string cannot be interpreted as a number.
    """
    # Local imports keep this cell self-contained.
    from decimal import Decimal, InvalidOperation
    import numbers

    if likes_str is None:
        return 0
    if isinstance(likes_str, numbers.Integral):
        return int(likes_str)
    if isinstance(likes_str, numbers.Real):
        # NaN (the only value unequal to itself) marks a missing cell.
        return 0 if likes_str != likes_str else int(likes_str)

    text = str(likes_str).strip().replace(',', '')
    if not text or text.lower() == 'no data':
        return 0

    multipliers = {'K': 1000, 'M': 1000000}
    multiplier = multipliers.get(text[-1].upper(), 1)
    if multiplier != 1:
        text = text[:-1].strip()

    try:
        # Decimal arithmetic avoids binary-float artefacts such as
        # int(float('8.2') * 1000000) == 8199999.
        return int(Decimal(text) * multiplier)
    except (InvalidOperation, ValueError, OverflowError):
        raise ValueError(f"cannot convert likes value {likes_str!r} to int") from None

# Apply the function to the 'likes' column
tedtalks_df['likes'] = tedtalks_df['likes'].apply(convert_likes)

In [None]:
tedtalks_df.info()
tedtalks_df.to_excel('tedtalks_df_output_excel_file_unremoved.xlsx', index=False)

In [None]:
tedtalks_df.head(3)

In [None]:
tedtalks_df.shape

In [None]:
# drop the URL, related-videos, summary, topics and transcript columns
tedtalks_df = tedtalks_df.drop(columns=['page_url', 'related_videos', 'summary', 'topics', 'transcript'])

In [None]:
tedtalks_df.head(3)

In [None]:
# Function to extract speaker name from the JSON-like string
import json
def extract_speaker_name(speakers_data):
    """Return the name of the first speaker described by ``speakers_data``.

    Args:
        speakers_data: either an already-parsed sequence of speaker dicts, or a
            string holding that sequence as JSON or as a Python literal
            (single-quoted "JSON-like" text).

    Returns:
        The value stored under the ``'name'`` key of the first entry, or
        ``None`` when the value is missing, empty, unparseable or not shaped
        like a sequence of dicts.
    """
    import ast  # local import: only needed for the non-JSON fallback

    speakers_list = speakers_data
    if isinstance(speakers_data, str):
        try:
            speakers_list = json.loads(speakers_data)
        except json.JSONDecodeError:
            # Not strict JSON: fall back to safe Python-literal parsing so that
            # single-quoted dict/list text is not silently turned into None.
            try:
                speakers_list = ast.literal_eval(speakers_data)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return None

    try:
        if not speakers_list:
            return None
        # Only the first listed speaker is used.
        return speakers_list[0].get('name')
    except (IndexError, TypeError, AttributeError, KeyError):
        # Wrong shape (NaN, scalar, dict without index 0, non-dict entry, ...).
        return None


# Apply the function to the 'speakers' column, replacing each raw value with the first speaker's name.
# NOTE: the column is overwritten in place, so re-running this cell feeds the plain names back
# into extract_speaker_name, which cannot parse them and returns None.
tedtalks_df['speakers'] = tedtalks_df['speakers'].apply(extract_speaker_name)


In [None]:
tedtalks_df.sample(1)

In [None]:
tedtalks_df.to_excel('tedtalks_df_output_excel_file.xlsx', index=False)

In [None]:
# the 15 most-viewed talks, with their title and (first) speaker

pop_talk = (
    tedtalks_df[['title', 'speakers', 'views']]
    .sort_values('views', ascending=False)
    .head(15)
)
pop_talk

### **`speaker`**

In [None]:
# total views per speaker, keeping the 15 largest totals
top15_views = tedtalks_df.groupby('speakers')['views'].sum().nlargest(15).reset_index()

# one figure with a single axes for the chart
fig, axs = plt.subplots(figsize=(18,12))

# one bar per speaker, bar length = summed views
sns.barplot(data=top15_views, x='views', y='speakers', ax=axs)
axs.set_title('Top 15 Speakers by Views')

plt.tight_layout()
plt.show()


In [None]:
# number of distinct speakers; nunique() ignores missing names, whereas
# len(unique()) would count the missing value as one extra "speaker"

tedtalks_df['speakers'].nunique()

In [None]:
plt.figure(figsize=(10,6))
sns.scatterplot(x='likes', y='views', data=tedtalks_df)

In [None]:
# check distribution of views column
# (histplot with a density-scaled KDE replaces the deprecated sns.distplot)

plt.figure(figsize=(10,5))
sns.histplot(tedtalks_df['views'], color='green', kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# check distribution of duration column
# (histplot with a density-scaled KDE replaces the deprecated sns.distplot)

plt.figure(figsize=(10,5))
sns.histplot(tedtalks_df['duration'], color='red', kde=True, stat='density', kde_kws={'cut': 3})

In [None]:
# check correlation of duration and views using scatter plot

plt.figure(figsize=(10,6))
sns.scatterplot(x='duration', y='views', data=tedtalks_df)

- **Observations:**

- The scatter plot above shows no clear correlation between the duration and views columns.
- The duration column also contains some outliers.

In [None]:
# checking duration and views columns Outliers using boxplot

columns = ['views', 'duration']

# One panel per column, side by side (squeeze=False keeps `axes` two-dimensional
# even if `columns` ever holds a single name).
fig, axes = plt.subplots(1, len(columns), figsize=(18, 6), squeeze=False)

for ax, i in zip(axes[0], columns):
  # Pass the values by keyword: the meaning of the first positional argument
  # of sns.boxplot is not the same in every seaborn release.
  sns.boxplot(x=tedtalks_df[i], ax=ax)
  ax.set_title(i)

# Lay out once, after every panel has been drawn.
plt.tight_layout()

In [None]:
# Replace upper-tail outliers (values above Q3 + 1.5 * IQR) with the column mean.

columns = ['views', 'duration']

for i in columns:
  q1 = tedtalks_df[i].quantile(0.25)
  q3 = tedtalks_df[i].quantile(0.75)
  iqr = q3 - q1
  upper_fence = q3 + 1.5*iqr
  is_outlier = tedtalks_df[i] > upper_fence
  # the mean is taken over the column as it is now, outliers included
  tedtalks_df[i] = tedtalks_df[i].mask(is_outlier, tedtalks_df[i].mean())

In [None]:
#let's see the numerical column again after treating outliers

columns = ['views', 'duration']

# One panel per column, side by side (squeeze=False keeps `axes` two-dimensional
# even if `columns` ever holds a single name).
fig, axes = plt.subplots(1, len(columns), figsize=(18, 6), squeeze=False)

for ax, i in zip(axes[0], columns):
  # Pass the values by keyword: the meaning of the first positional argument
  # of sns.boxplot is not the same in every seaborn release.
  sns.boxplot(x=tedtalks_df[i], ax=ax)
  ax.set_title(i)

# Lay out once, after every panel has been drawn.
plt.tight_layout()

In [None]:
# after fill outliers with mean... distribution graph of columns views and duration
# (histplot with a density-scaled KDE replaces the deprecated sns.distplot)

fig, axs = plt.subplots(1, 2, figsize=(15, 5))

sns.histplot(tedtalks_df['views'], color='green', kde=True, stat='density', kde_kws={'cut': 3}, ax=axs[0])
axs[0].set_title('Distribution of Views')

sns.histplot(tedtalks_df['duration'], color='red', kde=True, stat='density', kde_kws={'cut': 3}, ax=axs[1])
axs[1].set_title('Distribution of Duration')


plt.tight_layout()
plt.show()

In [None]:
# change duration in sec. to min.
# NOTE: the column is rescaled in place, so running this cell more than once
# divides the values by 60 again each time.

tedtalks_df['duration'] = tedtalks_df['duration'] / 60

In [None]:
# plot 3D scatter plot of duration (x), likes (y) and our target column views (z) using plotly library.

fig = px.scatter_3d(tedtalks_df, x=tedtalks_df['duration'], y=tedtalks_df['likes'], z=tedtalks_df['views'])

fig.show()

In [None]:
# Bucket each talk by its view count and store the label in 'speaker_popularity'.
# Rows matching none of the rules keep the empty-string default.

talk_views = tedtalks_df['views']
popularity_rules = [
    (talk_views <= 500000, 'not_popular'),
    ((talk_views > 500000) & (talk_views <= 1500000), 'avg_popular'),
    ((talk_views > 1500000) & (talk_views <= 2500000), 'popular'),
    ((talk_views > 2500000) & (talk_views <= 3500000), 'high_popular'),
    (talk_views > 3500000, 'extreme_popular'),
]

tedtalks_df['speaker_popularity'] = ""
for rule_mask, popularity_label in popularity_rules:
    tedtalks_df.loc[rule_mask, 'speaker_popularity'] = popularity_label

# look at two random rows to verify the new column

tedtalks_df.sample(2)

In [None]:
plt.figure(figsize=(18,6))
sns.barplot(data=tedtalks_df, x='speaker_popularity', y='views', 
            order=['not_popular', 'avg_popular', 'popular', 'high_popular', 'extreme_popular'])


### **`subtitle_languages`**

In [None]:
tedtalks_df.head(4)

In [None]:
# Function to extract subtitle language number from the JSON-like string
import json
def extract_languages_count(lang_data):
    """Return how many subtitle languages a raw ``subtitle_languages`` value lists.

    Args:
        lang_data: either an already-parsed container of languages, or a string
            holding that container as JSON or as a Python literal.

    Returns:
        int: the number of entries. Missing, empty or unparseable values give
        0, so the result is always an integer (never ``None``).
    """
    import ast  # local import: only needed for the non-JSON fallback

    languages = lang_data
    if isinstance(lang_data, str):
        try:
            languages = json.loads(lang_data)
        except json.JSONDecodeError:
            # Not strict JSON: try safe Python-literal parsing before giving up.
            try:
                languages = ast.literal_eval(lang_data)
            except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
                return 0

    try:
        # Empty/None inputs count as zero languages instead of falling through
        # and implicitly returning None.
        return len(languages) if languages else 0
    except (TypeError, AttributeError):
        # Values without a length (NaN, plain numbers, ...) carry no languages.
        return 0


# Apply the function to the 'subtitle_languages' column, replacing each raw value with its language count.
# NOTE: the column is overwritten in place; the function expects the raw list/string values,
# so re-running this cell on the integer counts will not reproduce them.
tedtalks_df['subtitle_languages'] = tedtalks_df['subtitle_languages'].apply(extract_languages_count)


In [None]:
tedtalks_df.head(3)

In [None]:
tedtalks_df.info()

In [None]:
tedtalks_df.to_excel('tedtalks_df_output_excel_file_processed.xlsx', index=False)

In [None]:
# display the 'subtitle_languages' column (converted to language counts above) as a
# one-column DataFrame; no new column is added here

pd.DataFrame(tedtalks_df['subtitle_languages'])

In [None]:
# check the distribution of the subtitle_languages counts
# (histplot with a density-scaled KDE replaces the deprecated sns.distplot)

plt.figure(figsize=(8,6))
sns.histplot(tedtalks_df['subtitle_languages'], color='darkblue', kde=True, stat='density', kde_kws={'cut': 3})
plt.show()

In [None]:
# Split the publication date into year, month number and weekday name

published = tedtalks_df['published_date']
tedtalks_df['published_year'] = published.dt.year
tedtalks_df['published_month'] = published.dt.month
tedtalks_df['published_day'] = published.dt.day_name()

# weekday name -> number, counting from Sunday = 0 up to Saturday = 6

daydict = {
    day_name: day_number
    for day_number, day_name in enumerate(
        ['Sunday', 'Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday']
    )
}

# numeric weekday column derived from the weekday name

tedtalks_df['published_daynumber'] = tedtalks_df['published_day'].map(daydict)

In [None]:
# add one more column published_months_ago

# Months elapsed between the publication month and a fixed reference month.
# The month must be SUBTRACTED: adding it (as before) made a talk from December
# look 11 months older than one from January of the same year.
# The reference year stays at the notebook's 2024; December is used as the
# reference month so talks published any time in 2024 get a non-negative age.
reference_year, reference_month = 2024, 12
tedtalks_df['published_months_ago'] = (
    (reference_year - tedtalks_df['published_year']) * 12
    + (reference_month - tedtalks_df['published_month'])
)

In [None]:
tedtalks_df.head(3)

In [None]:
# there are lot of TED events

print(tedtalks_df['event'].value_counts().head(10))

In [None]:
tedtalks_df = tedtalks_df.astype({
    'speaker_popularity': 'category',
    'published_day': 'category',
    'event': 'category'
})

In [None]:
tedtalks_df.info()

In [None]:
tedtalks_df.to_excel('tedtalks_df_output_excel_file_processed_last.xlsx', index=False)

In [None]:
# drop the speaker, video-code and title columns

tedtalks_df = tedtalks_df.drop(columns=["speakers", "youtube_video_code", "title"])

In [None]:
# drop the two raw date columns ("recorded_date", "published_date")

tedtalks_df = tedtalks_df.drop(columns=["recorded_date", "published_date"])

In [None]:
tedtalks_df = tedtalks_df.drop(columns=["event"])  # remove the event column

In [None]:
tedtalks_df = tedtalks_df.drop(columns=["published_day"])  # remove the weekday-name column

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


# Keep only the float and int columns; other dtypes are left out of the correlation
numeric_df = tedtalks_df.select_dtypes(include=[float, int])

# Alternatively, if you want to include categorical data, use one-hot encoding
# processed_df = pd.get_dummies(tedtalks_df)
# correlmap = processed_df.corr()

# Calculate the pairwise correlation matrix of the numeric columns
correlmap = numeric_df.corr()

# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(10, 8))

# Draw the full heatmap (no mask is applied), annotating each cell to two decimals
sns.heatmap(correlmap, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)

# Show the plot
plt.show()