We are going to createa a dummy dataset for some Youtubers regarding their channals. We will create data including:

Decide the size of the dataset by using a SIZE constant variable
ChannelID (Y0001, Y0002, ...)
Number of subscribers
Number of videos
Number of total views
Category (['music','news','gaming','food','travel'])
Language (['English','Spanish','Japanese','Franch','Russia','Chinese'])
Type(['Corperate','Goverment','NGO','Individual'])
Then we will play with the dummy dataset with our knowledge of pandas, including:

Access
Sampling
Filtering
Aggregation
Manipulation

#Imports

In [1]:
import numpy as np
import pandas as pd
import random


Dummy dataset generation

Setup SIZE

In [2]:
#We make this variable SIZE because it supposed to be constant for the runtime.

SIZE = 1000

Generate ChannelID

In [8]:
#ids = [f'Y{i:04d}' for i in range(1,SIZE+1)]
ids = ['Y{0:04d}'.format(x) for x in range(SIZE)]
ids[:5], ids[-5:]
#type(ids)

(['Y0000', 'Y0001', 'Y0002', 'Y0003', 'Y0004'],
 ['Y0995', 'Y0996', 'Y0997', 'Y0998', 'Y0999'])

Generate # of subscribers

In [9]:
#We first simulate a normal distribution

subs = np.random.normal(100000, 50000, (SIZE))
subs[:5], subs[-5:]

(array([122460.11861087,  64265.95201075,  85150.14715035,  49281.25824157,
        113275.89434124]),
 array([ 43526.50581876,  73596.30054517, 156953.94144878, 115637.63309297,
        141901.25568107]))

In [10]:
#We convert the float numbers to integer

subs = subs.astype(int)
subs[:5], subs[-5:]

(array([122460,  64265,  85150,  49281, 113275]),
 array([ 43526,  73596, 156953, 115637, 141901]))

In [16]:
#We check if there are non-positive numbers of subscribers
subs[subs <= 0 ]

array([ -4311, -21680, -33247, -11042,  -4250, -11239,  -9382,  -4434,
       -14128, -34651, -13649, -10313, -31065, -11881, -40340,  -4615,
        -5711, -20210,  -2492, -27948])

In [17]:
#We set them to be 1
subs[subs <= 0 ]=1
subs[subs <= 0 ]


array([], dtype=int64)

Generate number of videos

In [19]:
#We use a uniform distribution this time (just for practice)



nvideos = [np.random.randint(1, 100) for i in range(SIZE)]
nvideos[:5], nvideos[-5:]

([93, 41, 40, 13, 50], [48, 20, 65, 36, 60])

We simulate the number of views

In [20]:
#To make it real, we will use the # of subscribers and # of videos as factors to get the # of total views


views = [ int(x*1.5*np.random.random() + y*2*np.random.random() + np.random.randint(-1000, 1000))
for x, y in zip(subs, nvideos)]
views[:5], views[-5:]

([18074, 17181, 110777, 28008, 111935], [36761, 73421, 193939, 122799, 149907])

In [21]:
views = np.array(views)
views

array([ 18074,  17181, 110777,  28008, 111935,  31964,  48303, 110287,
        26484, 190713,  67351,  44381,  97442,   2480, 170126, 183387,
        19662,  90387,  74943,  34812,  49072, 126143,  27158, 102907,
        91345, 213399,  26778,   9756,  46109,    505,  54608,  92141,
        20244,  67147,  57997,   1295, 140670,  29017,  45711,  86940,
           64,   -683,   3456, 133857, 165503, 280191,  81773,  61508,
        49786,   3045,  80240, 166498,  32421,  73964,  42161,   8024,
        10081, 104226, 158459,  37955,  16085, 123456, 109114,   4072,
        34475,  50094,  10692,  16480,  74738, 196988,  24111,  74759,
       145346,   4914, 171694, 156144,  97251,  23609,   -921,  77831,
       107167, 162948,  64337,  95748, 166540, 228265, 100255,  17259,
       116710,  62931,  95338,  46639,  19017,  32894,  16909,  93672,
        32875,  26701,  29305,  92737,  62227,   7947,  76487, 103209,
       108609,  56695,  79283, 111024,  21879, 105337,  50141,  60530,
      

In [22]:
views[views < 0]

array([-683, -921, -692,  -85, -254, -536, -497, -371, -748, -506, -748])

In [23]:
views[views < 0] = 0
views[views < 0]

array([], dtype=int64)

Category
Category (['music','news','gaming','food','travel'])

In [24]:
category = np.random.choice(['music','news','gaming','food','travel'], SIZE)
category[:20]

array(['music', 'news', 'food', 'music', 'news', 'news', 'news', 'travel',
       'food', 'music', 'news', 'news', 'travel', 'music', 'travel',
       'food', 'music', 'travel', 'travel', 'music'], dtype='<U6')

Language
Language (['English','Spanish','Japanese','Franch','Russia','Chinese'])

In [25]:
language = np.random.choice(['English','Spanish','Japanese','Franch','Russia','Chinese'],
                            SIZE, p=[0.5, 0.1, 0.1, 0.1, 0.1, 0.1])
language[:20]

array(['Russia', 'Chinese', 'English', 'Chinese', 'Spanish', 'Spanish',
       'English', 'English', 'Chinese', 'English', 'English', 'Franch',
       'Franch', 'Franch', 'Japanese', 'Japanese', 'English', 'English',
       'English', 'English'], dtype='<U8')

Type
Type(['Corperate','Goverment','NGO','Individual'])

In [27]:
tp = np.random.choice(['Corperate','Goverment','NGO','Individual'], SIZE, p=[0.1, 0.2, 0.3, 0.4])
tp[:20]



numpy.ndarray

#Now we have all attributes, let's put them into a dataframe

In [28]:
df = pd.DataFrame({'ChannelID': ids,
                   'subs': subs,
                   'nvideos': nvideos,
                   'views': views,
                   'Category': category,
                   'Language':language,
                   'Type': tp},
                  index = ids)
df.head()

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,122460,93,18074,music,Russia,NGO
Y0001,Y0001,64265,41,17181,news,Chinese,Goverment
Y0002,Y0002,85150,40,110777,food,English,NGO
Y0003,Y0003,49281,13,28008,music,Chinese,Individual
Y0004,Y0004,113275,50,111935,news,Spanish,NGO


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1000 entries, Y0000 to Y0999
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   ChannelID  1000 non-null   object
 1   subs       1000 non-null   int64 
 2   nvideos    1000 non-null   int64 
 3   views      1000 non-null   int64 
 4   Category   1000 non-null   object
 5   Language   1000 non-null   object
 6   Type       1000 non-null   object
dtypes: int64(3), object(4)
memory usage: 62.5+ KB


Let's save the dummy dataset to youtube_channels.csv

In [30]:
df.to_csv('/content/youtube_channels.csv', index=False)

Let's play with the dataframe a little bit

Sampling

In [32]:
#Select certain rows
df_sub1 = df.loc[:'Y0100']
df_sub1

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,122460,93,18074,music,Russia,NGO
Y0001,Y0001,64265,41,17181,news,Chinese,Goverment
Y0002,Y0002,85150,40,110777,food,English,NGO
Y0003,Y0003,49281,13,28008,music,Chinese,Individual
Y0004,Y0004,113275,50,111935,news,Spanish,NGO
...,...,...,...,...,...,...,...
Y0096,Y0096,114297,23,32875,news,English,Goverment
Y0097,Y0097,22654,31,26701,music,Spanish,Individual
Y0098,Y0098,58042,11,29305,food,English,Goverment
Y0099,Y0099,145340,64,92737,music,Russia,Individual


In [36]:
#Select certain rows and columns


df_sub2 = df.loc[:'Y1000', ['subs','views']]
df_sub2

Unnamed: 0,subs,views
Y0000,122460,18074
Y0001,64265,17181
Y0002,85150,110777
Y0003,49281,28008
Y0004,113275,111935
...,...,...
Y0995,43526,36761
Y0996,73596,73421
Y0997,156953,193939
Y0998,115637,122799


#### Select random rows

In [38]:
df_sub3 = df.iloc[random.sample(range(0, SIZE), 100)]
df_sub3

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0729,Y0729,132893,7,71571,music,Chinese,Individual
Y0150,Y0150,1,74,385,food,Spanish,NGO
Y0339,Y0339,231220,85,332943,gaming,Franch,NGO
Y0531,Y0531,23218,57,11225,travel,English,Goverment
Y0047,Y0047,70677,74,61508,music,Franch,Individual
...,...,...,...,...,...,...,...
Y0093,Y0093,174777,12,32894,gaming,Russia,Corperate
Y0650,Y0650,7350,87,8205,music,English,Individual
Y0209,Y0209,148931,18,35911,music,English,Individual
Y0191,Y0191,61867,90,87740,food,English,Individual


Select random rows with selected columns

In [39]:
df_sub4 = df.iloc[random.sample(range(0, SIZE), 100)][['subs','views']]
df_sub4

Unnamed: 0,subs,views
Y0947,179917,266672
Y0423,212423,228203
Y0982,139324,102623
Y0528,56970,9676
Y0029,1,505
...,...,...
Y0422,141306,81981
Y0193,103390,105849
Y0160,59061,62121
Y0735,108645,37720


Filtering

In [42]:
#Super Popular Channel
df_superp = df[df['subs'] > 300000]
df_superp

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type


In [43]:
#Popular Channel
df_p = df[df['subs'] > 100000]
df_p

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,122460,93,18074,music,Russia,NGO
Y0004,Y0004,113275,50,111935,news,Spanish,NGO
Y0007,Y0007,122158,96,110287,travel,English,NGO
Y0009,Y0009,186979,65,190713,music,English,Goverment
Y0010,Y0010,138048,24,67351,news,English,Individual
...,...,...,...,...,...,...,...
Y0990,Y0990,165827,30,236227,news,Russia,NGO
Y0994,Y0994,179903,20,237590,travel,English,Individual
Y0997,Y0997,156953,65,193939,travel,Russia,Goverment
Y0998,Y0998,115637,36,122799,gaming,English,Individual


In [44]:
#Start up channel
df_begin = df[df['subs'] < 100]
df_begin

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0029,Y0029,1,81,505,gaming,Franch,Individual
Y0040,Y0040,1,35,64,travel,English,Goverment
Y0078,Y0078,1,21,0,food,Spanish,Goverment
Y0144,Y0144,1,70,133,gaming,English,NGO
Y0150,Y0150,1,74,385,food,Spanish,NGO
Y0156,Y0156,1,36,733,food,Spanish,Corperate
Y0185,Y0185,1,38,696,news,English,NGO
Y0226,Y0226,1,73,227,gaming,Russia,Goverment
Y0420,Y0420,1,78,0,news,Japanese,NGO
Y0429,Y0429,1,93,703,travel,Spanish,Goverment


In [45]:
#Popular English Channel

df_EnglishP = df[(df['subs'] > 100000) & (df['Language'] == 'English')]
df_EnglishP

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0007,Y0007,122158,96,110287,travel,English,NGO
Y0009,Y0009,186979,65,190713,music,English,Goverment
Y0010,Y0010,138048,24,67351,news,English,Individual
Y0018,Y0018,107374,58,74943,travel,English,NGO
Y0028,Y0028,107841,63,46109,food,English,Goverment
...,...,...,...,...,...,...,...
Y0985,Y0985,151016,26,170639,gaming,English,Goverment
Y0986,Y0986,121113,77,75065,news,English,NGO
Y0994,Y0994,179903,20,237590,travel,English,Individual
Y0998,Y0998,115637,36,122799,gaming,English,Individual


In [46]:
#Gaming channel with many videos

df_gaming_nv = df[(df['Category'] == 'gaming') & (df['nvideos'] > 90)]
df_gaming_nv

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0167,Y0167,136453,91,133661,gaming,Franch,NGO
Y0169,Y0169,45870,96,56074,gaming,English,Individual
Y0183,Y0183,45351,91,46862,gaming,English,Individual
Y0231,Y0231,73754,94,13274,gaming,English,NGO
Y0346,Y0346,121423,97,119164,gaming,Russia,Goverment
Y0378,Y0378,146558,99,128450,gaming,Russia,Corperate
Y0602,Y0602,77263,93,103259,gaming,Franch,Individual
Y0676,Y0676,174013,94,121914,gaming,English,NGO
Y0677,Y0677,94314,95,86264,gaming,English,NGO
Y0687,Y0687,44296,91,32548,gaming,English,Goverment


In [51]:
#Non-Corperate and News channel

df_nc_news = df[(df['Category'] == 'news') & (df['Type'] != 'Corperate')]
df_nc_news

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0001,Y0001,64265,41,17181,news,Chinese,Goverment
Y0004,Y0004,113275,50,111935,news,Spanish,NGO
Y0005,Y0005,68234,95,31964,news,Spanish,NGO
Y0010,Y0010,138048,24,67351,news,English,Individual
Y0011,Y0011,118663,62,44381,news,Franch,NGO
...,...,...,...,...,...,...,...
Y0987,Y0987,95033,28,141984,news,Franch,Goverment
Y0990,Y0990,165827,30,236227,news,Russia,NGO
Y0991,Y0991,69035,34,32472,news,Franch,Individual
Y0995,Y0995,43526,48,36761,news,Chinese,Individual


In [55]:
#Top Subs and Top nvideos

df_subs_and_nvideos = df[(df['subs'] >100000) & (df['nvideos'] > 90)]
df_subs_and_nvideos

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,122460,93,18074,music,Russia,NGO
Y0007,Y0007,122158,96,110287,travel,English,NGO
Y0088,Y0088,138064,97,116710,travel,English,Individual
Y0117,Y0117,111517,93,74162,food,Franch,Individual
Y0162,Y0162,120231,95,79540,music,Franch,Goverment
Y0164,Y0164,146395,91,106909,news,Spanish,Goverment
Y0167,Y0167,136453,91,133661,gaming,Franch,NGO
Y0203,Y0203,123361,96,72530,travel,Russia,Individual
Y0215,Y0215,198104,93,12242,music,Japanese,Goverment
Y0243,Y0243,151046,97,163828,news,Franch,Corperate


In [56]:
#Top Subs or Top nvideos

df_subs_or_nvideos = df[(df['subs'] > 200000) | (df['nvideos'] > 90)]
df_subs_or_nvideos

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type
Y0000,Y0000,122460,93,18074,music,Russia,NGO
Y0005,Y0005,68234,95,31964,news,Spanish,NGO
Y0007,Y0007,122158,96,110287,travel,English,NGO
Y0012,Y0012,76694,92,97442,travel,Franch,Individual
Y0017,Y0017,69740,99,90387,travel,English,Individual
...,...,...,...,...,...,...,...
Y0966,Y0966,68798,93,68929,gaming,Franch,Individual
Y0972,Y0972,96335,93,142660,travel,Spanish,NGO
Y0975,Y0975,111152,93,52768,news,English,Individual
Y0981,Y0981,217864,9,6993,travel,Chinese,Individual


Aggregation

In [58]:
byCategory = df.groupby('Category')
byCategory.sum(numeric_only=True)

Unnamed: 0_level_0,subs,nvideos,views
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
food,19149592,8871,14079360
gaming,19301728,9704,15309444
music,23022639,10984,16402174
news,18392500,9578,12595556
travel,19424211,9466,15100245


In [60]:
byCategory.mean(numeric_only=True)

Unnamed: 0_level_0,subs,nvideos,views
Category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
food,105217.538462,48.741758,77359.120879
gaming,96028.497512,48.278607,76166.38806
music,100976.486842,48.175439,71939.359649
news,95297.927461,49.626943,65261.948187
travel,99103.117347,48.295918,77042.066327


In [62]:
byLanguage = df.groupby('Language')
byLanguage.sum(numeric_only=True)

Unnamed: 0_level_0,subs,nvideos,views
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chinese,10711714,4872,7473201
English,45210230,21700,32066935
Franch,11823147,6559,9901118
Japanese,10965271,5401,8471395
Russia,10369327,5339,7814154
Spanish,10210981,4732,7759976


In [63]:
byLanguage.mean(numeric_only=True)

Unnamed: 0_level_0,subs,nvideos,views
Language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chinese,107117.14,48.72,74732.01
English,97646.285097,46.868251,69259.038877
Franch,96911.040984,53.762295,81156.704918
Japanese,98786.225225,48.657658,76318.873874
Russia,96909.598131,49.897196,73029.476636
Spanish,105267.845361,48.783505,79999.752577


In [64]:
byLanType = df.groupby(['Language', 'Type'])
byLanType.describe()

Unnamed: 0_level_0,Unnamed: 1_level_0,subs,subs,subs,subs,subs,subs,subs,subs,nvideos,nvideos,nvideos,nvideos,nvideos,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Language,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Chinese,Corperate,15.0,126819.066667,41413.796377,72127.0,91369.0,132241.0,145155.5,205449.0,15.0,62.333333,...,82.5,95.0,15.0,90431.6,49567.727179,1443.0,74033.0,88050.0,104123.0,191112.0
Chinese,Goverment,15.0,102198.133333,49648.719885,9239.0,75222.5,88438.0,135980.5,181514.0,15.0,54.6,...,75.5,97.0,15.0,77803.0,61098.970398,6775.0,43759.5,60419.0,107982.5,216932.0
Chinese,Individual,46.0,105836.347826,46338.677201,15749.0,72558.0,101335.5,140466.75,217864.0,46.0,39.630435,...,57.0,93.0,46.0,71397.304348,59692.536774,3693.0,26726.75,48228.5,105063.5,213399.0
Chinese,NGO,24.0,100332.666667,47273.2974,1.0,68430.5,87909.0,135331.5,191458.0,24.0,53.958333,...,78.25,92.0,24.0,69391.916667,54421.309269,475.0,22302.0,61417.0,95581.0,191188.0
English,Corperate,38.0,100731.263158,51007.076046,12722.0,66188.75,99043.0,121197.75,213123.0,38.0,44.789474,...,68.5,97.0,38.0,77702.026316,53722.469991,3781.0,33335.75,78049.5,108813.75,216185.0
English,Goverment,99.0,99979.171717,47252.836589,1.0,66997.5,96216.0,139444.5,196079.0,99.0,46.474747,...,62.0,97.0,99.0,70682.707071,51772.829966,0.0,30014.0,60742.0,103017.5,212767.0
English,Individual,179.0,100489.385475,47097.652594,1918.0,63322.5,97116.0,137661.0,215402.0,179.0,47.826816,...,69.5,99.0,179.0,72584.324022,61462.828792,367.0,20098.5,59022.0,103556.0,266672.0
English,NGO,147.0,91815.673469,47077.074768,1.0,57658.5,92360.0,129148.5,201254.0,147.0,46.503401,...,70.0,99.0,147.0,62068.544218,53603.487108,0.0,17991.5,45258.0,102698.0,237781.0
Franch,Corperate,12.0,109716.5,44591.129149,53494.0,77868.25,91681.5,145076.75,187366.0,12.0,47.666667,...,75.0,97.0,12.0,105529.416667,63010.024913,19662.0,56536.25,100317.5,148983.25,200945.0
Franch,Goverment,17.0,103998.705882,48133.043254,13553.0,77194.0,99876.0,129875.0,188560.0,17.0,53.0,...,71.0,95.0,17.0,87208.352941,70447.093148,6082.0,47154.0,64337.0,123509.0,280981.0


In [65]:
byLanType.describe()[['subs','views']]

Unnamed: 0_level_0,Unnamed: 1_level_0,subs,subs,subs,subs,subs,subs,subs,subs,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Language,Type,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2
Chinese,Corperate,15.0,126819.066667,41413.796377,72127.0,91369.0,132241.0,145155.5,205449.0,15.0,90431.6,49567.727179,1443.0,74033.0,88050.0,104123.0,191112.0
Chinese,Goverment,15.0,102198.133333,49648.719885,9239.0,75222.5,88438.0,135980.5,181514.0,15.0,77803.0,61098.970398,6775.0,43759.5,60419.0,107982.5,216932.0
Chinese,Individual,46.0,105836.347826,46338.677201,15749.0,72558.0,101335.5,140466.75,217864.0,46.0,71397.304348,59692.536774,3693.0,26726.75,48228.5,105063.5,213399.0
Chinese,NGO,24.0,100332.666667,47273.2974,1.0,68430.5,87909.0,135331.5,191458.0,24.0,69391.916667,54421.309269,475.0,22302.0,61417.0,95581.0,191188.0
English,Corperate,38.0,100731.263158,51007.076046,12722.0,66188.75,99043.0,121197.75,213123.0,38.0,77702.026316,53722.469991,3781.0,33335.75,78049.5,108813.75,216185.0
English,Goverment,99.0,99979.171717,47252.836589,1.0,66997.5,96216.0,139444.5,196079.0,99.0,70682.707071,51772.829966,0.0,30014.0,60742.0,103017.5,212767.0
English,Individual,179.0,100489.385475,47097.652594,1918.0,63322.5,97116.0,137661.0,215402.0,179.0,72584.324022,61462.828792,367.0,20098.5,59022.0,103556.0,266672.0
English,NGO,147.0,91815.673469,47077.074768,1.0,57658.5,92360.0,129148.5,201254.0,147.0,62068.544218,53603.487108,0.0,17991.5,45258.0,102698.0,237781.0
Franch,Corperate,12.0,109716.5,44591.129149,53494.0,77868.25,91681.5,145076.75,187366.0,12.0,105529.416667,63010.024913,19662.0,56536.25,100317.5,148983.25,200945.0
Franch,Goverment,17.0,103998.705882,48133.043254,13553.0,77194.0,99876.0,129875.0,188560.0,17.0,87208.352941,70447.093148,6082.0,47154.0,64337.0,123509.0,280981.0


In [66]:
byLanType.describe().loc['English',['subs','views']]

Unnamed: 0_level_0,subs,subs,subs,subs,subs,subs,subs,subs,views,views,views,views,views,views,views,views
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,std,min,25%,50%,75%,max
Type,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2
Corperate,38.0,100731.263158,51007.076046,12722.0,66188.75,99043.0,121197.75,213123.0,38.0,77702.026316,53722.469991,3781.0,33335.75,78049.5,108813.75,216185.0
Goverment,99.0,99979.171717,47252.836589,1.0,66997.5,96216.0,139444.5,196079.0,99.0,70682.707071,51772.829966,0.0,30014.0,60742.0,103017.5,212767.0
Individual,179.0,100489.385475,47097.652594,1918.0,63322.5,97116.0,137661.0,215402.0,179.0,72584.324022,61462.828792,367.0,20098.5,59022.0,103556.0,266672.0
NGO,147.0,91815.673469,47077.074768,1.0,57658.5,92360.0,129148.5,201254.0,147.0,62068.544218,53603.487108,0.0,17991.5,45258.0,102698.0,237781.0


Manipulation

In [67]:
df['subpervideo'] = df['subs']/df['nvideos']
df['subpervideo'].describe()

count      1000.000000
mean       5323.918235
std       12452.576225
min           0.010309
25%        1213.776703
50%        2076.065934
75%        4187.259317
max      160787.000000
Name: subpervideo, dtype: float64

In [68]:
df['viewspervideo'] = df['views']/df['nvideos']
df['viewspervideo'].describe()

count      1000.000000
mean       3919.555840
std       11023.600920
min           0.000000
25%         565.376586
50%        1349.142473
75%        3054.051637
max      163042.000000
Name: viewspervideo, dtype: float64

In [69]:
df['viewspersub'] = df['views']/df['subs']
df['viewspersub'].describe()

count    1000.000000
mean        6.178630
std        59.464689
min         0.000000
25%         0.354516
50%         0.724784
75%         1.094784
max      1059.000000
Name: viewspersub, dtype: float64

In [70]:
df.head(10)

Unnamed: 0,ChannelID,subs,nvideos,views,Category,Language,Type,subpervideo,viewspervideo,viewspersub
Y0000,Y0000,122460,93,18074,music,Russia,NGO,1316.774194,194.344086,0.147591
Y0001,Y0001,64265,41,17181,news,Chinese,Goverment,1567.439024,419.04878,0.267346
Y0002,Y0002,85150,40,110777,food,English,NGO,2128.75,2769.425,1.300963
Y0003,Y0003,49281,13,28008,music,Chinese,Individual,3790.846154,2154.461538,0.568333
Y0004,Y0004,113275,50,111935,news,Spanish,NGO,2265.5,2238.7,0.98817
Y0005,Y0005,68234,95,31964,news,Spanish,NGO,718.252632,336.463158,0.468447
Y0006,Y0006,57018,7,48303,news,English,Corperate,8145.428571,6900.428571,0.847154
Y0007,Y0007,122158,96,110287,travel,English,NGO,1272.479167,1148.822917,0.902823
Y0008,Y0008,51753,28,26484,food,Chinese,Individual,1848.321429,945.857143,0.511738
Y0009,Y0009,186979,65,190713,music,English,Goverment,2876.6,2934.046154,1.01997
