In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

df = pd.read_csv('data.csv')

# df.info()

In [2]:
df_copy = df.copy()
df_copy = df.drop(['Video Link','Channel URL','Date of Video Upload','Date of the Last Comment',
                   'No of Playlist','Subtitle','Creator Name','No of Comments','Duration of Video','Intern Who Collected the Data'], axis=1)

## Handling Missing Values

In [3]:
df_copy.isnull().sum()

Video Views                                0
Video Title                                0
Creator Gender                           314
Total Channel Subcribers                   0
Total Chanel Views                         0
Duration in Seconds                        0
No of Likes                                1
Language of the Video                     21
Video Description                          0
Hashtags                                   0
Maximum Quality of the Video               0
No of Videos the Channel                   0
Premiered or Not                           0
Community Engagement (Posts per week)      0
dtype: int64

In [4]:
# Replacing NaN with Orgnization (Org) in Creater Name column
df_copy['Creator Gender'] = df_copy['Creator Gender'].replace(np.nan, 'Org')

In [5]:
df_copy.head()

Unnamed: 0,Video Views,Video Title,Creator Gender,Total Channel Subcribers,Total Chanel Views,Duration in Seconds,No of Likes,Language of the Video,Video Description,Hashtags,Maximum Quality of the Video,No of Videos the Channel,Premiered or Not,Community Engagement (Posts per week)
0,146059,Samse - Didupe Off-road Royal Enfield Himalayan,Male,40200,7964284,1841.0,6700.0,Kannada,Yes,0,1080,462,No,1
1,231969,Bhoom Baddhal Web Series,Male,920000,137535681,864.0,7800.0,Telugu,Yes,1,1080,251,Yes,2
2,878,"Customer Psychology, Four Views Of Consumer De...",Female,458,72945,492.0,858.0,Hindi,Yes,0,720,158,No,0
3,9965,Top countries with proven natural gas reserves,Org,1680000,541653219,66.0,188.0,English,Yes,0,1080,46509,No,20
4,39780561,73 Questions With Selena Gomez,Org,12100000,3600496171,460.0,901000.0,English,Yes,0,1080,2872,No,1


In [6]:
# Dropping instances with null values
df_copy = df_copy.dropna(how='any',axis=0)

In [7]:
df_copy.isnull().sum()

Video Views                              0
Video Title                              0
Creator Gender                           0
Total Channel Subcribers                 0
Total Chanel Views                       0
Duration in Seconds                      0
No of Likes                              0
Language of the Video                    0
Video Description                        0
Hashtags                                 0
Maximum Quality of the Video             0
No of Videos the Channel                 0
Premiered or Not                         0
Community Engagement (Posts per week)    0
dtype: int64

## Handling Outliers

In [9]:
continuous_features = ['Video Views', 'Total Channel Subcribers', 'Total Chanel Views', 'Duration in Seconds', 
                      'No of Likes', 'No of Videos the Channel', 'Hashtags', 'Community Engagement (Posts per week)']

categorical_features = ['Video Title', 'Creator Gender', 'Language of the Video', 'Video Description',  
                        'Maximum Quality of the Video','Premiered or Not']

In [10]:
df_copy = df_copy.replace(',','', regex=True)

In [11]:
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 883 entries, 0 to 904
Data columns (total 14 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   Video Views                            883 non-null    object 
 1   Video Title                            883 non-null    object 
 2   Creator Gender                         883 non-null    object 
 3   Total Channel Subcribers               883 non-null    int64  
 4   Total Chanel Views                     883 non-null    object 
 5   Duration in Seconds                    883 non-null    object 
 6   No of Likes                            883 non-null    float64
 7   Language of the Video                  883 non-null    object 
 8   Video Description                      883 non-null    object 
 9   Hashtags                               883 non-null    int64  
 10  Maximum Quality of the Video           883 non-null    int64  
 11  No of 

In [12]:
def clamping(col, i):
    if i== 0 or i == 2 or i == 3:
        data = df_copy[col]
        data = pd.Series(data)
        data = pd.to_numeric(data)    
    else: 
        data = df_copy[col]
    q1 = np.percentile(data,25)
    q3 = np.percentile(data, 75)
    iqr = q3 - q1
    upperLimit = q3 + (1.5*iqr)
    lowerLimit = q1 - (1.5*iqr)
    for index, val in enumerate(data):
        if val > upperLimit:
            data.loc[index] = upperLimit
            continue
        if val < lowerLimit:
            data.loc[index] = lowerLimit
            continue
    i = i + 1
   
         
for col in continuous_features:
    clamping(col, 0)

## Normalizing Data

In [13]:
from sklearn.preprocessing import MinMaxScaler

In [22]:
data = df_copy.drop(['Video Title', 'Creator Gender', 'Language of the Video', 'Video Description', 
                        'Maximum Quality of the Video','Premiered or Not', 'Total Chanel Views'], axis='columns')
scaler = MinMaxScaler()
data = scaler.fit_transform(data)

continuous_df = pd.DataFrame(data, columns=['Total Channel Subcribers',
                                        'Total Chanel Views','Duration in Seconds','No of Likes', 'Hashtags',
                                        'No of Videos the Channel','Community Engagement (Posts per week)'  ])
continuous_df.head()

Unnamed: 0,Total Channel Subcribers,Total Chanel Views,Duration in Seconds,No of Likes,Hashtags,No of Videos the Channel,Community Engagement (Posts per week)
0,1.827508e-05,0.000132,0.029362,0.000137,0.0,0.001098,0.002933
1,2.902594e-05,0.003014,0.01378,0.000159,0.035714,0.000595,0.005865
2,1.069955e-07,1e-06,0.007847,1.8e-05,0.0,0.000374,0.0
3,1.244151e-06,0.005504,0.001053,4e-06,0.0,0.110734,0.058651
4,0.004978174,0.039646,0.007337,0.009277,0.0,0.006836,0.002933


## Updated Dataset