In [None]:
# https://www.kaggle.com/rsrishav/youtube-trending-video-dataset

import numpy as np
import pandas as pd

import os

In [None]:
# Load every CSV found under ./archive1/ into one frame. The first two
# characters of each file name are stored in a 'country' column
# (presumably a country code such as 'US' -- confirm against the file names).
data = []
for dirname, dirnames, filenames in os.walk('./archive1/'):
    # os.walk yields entries in arbitrary, filesystem-dependent order. Sorting
    # keeps the row order -- and the countryId values derived from the order of
    # first appearance -- identical across machines and re-runs.
    dirnames.sort()
    for filename in sorted(filenames):
        if filename.endswith('csv'):
            # read_csv already returns a DataFrame; no extra wrapping needed.
            dft = pd.read_csv(os.path.join(dirname, filename), header=0)
            dft['country'] = filename[:2]
            data.append(dft)

if not data:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise FileNotFoundError("No CSV files found under './archive1/'")

df = pd.concat(data, axis=0, ignore_index=True)

In [None]:
# Distinct 'country' values in order of first appearance; a value's position
# in this list is used as its integer id.
country = df['country'].unique().tolist()
df['countryId'] = df['country'].map({code: pos for pos, code in enumerate(country)})

In [None]:
country

In [None]:
df.head(2)

In [None]:
df.describe()

In [None]:
# Column labels of the combined frame as a plain Python list.
columns = df.columns.tolist()
columns

In [None]:
# Finding duplicate values if any
def eda(dfA, all=False, desc='EDA '):
    """Print a quick exploratory summary of ``dfA``.

    Always prints the heading, the shape, the total number and per-column
    fraction of nulls, the shape of the rows that repeat an earlier row, and
    a random sample of up to four rows that have at least one exact duplicate.

    Parameters
    ----------
    dfA : pandas.DataFrame
        Frame to summarise; it is not modified.
    all : bool, default False
        When true, additionally print ``describe()`` for numeric and for
        object columns, the head, a random sample of up to two rows, and the
        tail. (The name shadows the builtin ``all``; it is kept so existing
        keyword callers keep working.)
    desc : str, default 'EDA '
        Heading printed before everything else.

    Returns
    -------
    None
    """
    print(desc)
    print(f'\nShape:\n{dfA.shape}')
    print(f'\nIs Null: {dfA.isnull().sum().sum()}')
    print(f'{dfA.isnull().mean().sort_values(ascending=False)}')
    print(f'\nDuplicated: \n{dfA[dfA.duplicated()].shape}\n')

    # Show a few of the duplicated rows. Capping the sample size at the number
    # of available rows replaces the former bare ``except: pass``, which hid
    # the duplicates entirely whenever fewer than four existed.
    dup_rows = dfA[dfA.duplicated(keep=False)]
    if not dup_rows.empty:
        print(dup_rows.sample(min(4, len(dup_rows))))

    if all:
        print(f'\nDTypes - Numerics')
        print(dfA.describe(include=[np.number]))
        print(f'\nDTypes - Categoricals')
        print(dfA.describe(include=['object']))

        print(f'\nHead:\n{dfA.head()}')
        # Cap the sample size so frames with fewer than two rows do not raise.
        print(f'\nSamples:\n{dfA.sample(min(2, len(dfA)))}')
        print(f'\nTail:\n{dfA.tail()}')

In [None]:
eda(df)

In [None]:
# Number of rows whose description is missing.
int(df['description'].isna().sum())

In [None]:
df.description = df.description.fillna('no-discription')

In [None]:
eda(df)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Per-country row counts (g) and per-country engagement totals (a).
by_country = df.groupby('country')
g = by_country[['countryId']].count()
a = by_country[['likes', 'dislikes', 'view_count']].sum()

In [None]:
g

# Comparing India and US

In [None]:
# Keep only the rows whose 'country' value is 'IN' or 'US'.
country = df[df['country'].isin(['IN', 'US'])]

In [None]:
# Splits a frame's column labels into non-object and object-dtype groups
def sepColumns(dataset):
    """Partition the column labels of ``dataset`` by dtype.

    Returns a ``(num, cat)`` tuple of lists: ``cat`` holds the labels of
    columns whose dtype is ``object``, ``num`` holds every other label. Both
    lists keep the original column order.
    """
    buckets = {False: [], True: []}
    for label in dataset.columns:
        is_object = dataset[label].dtype == 'object'
        buckets[is_object].append(label)
    return buckets[False], buckets[True]

In [None]:
num, categ = sepColumns(country)

In [None]:
country[num].describe()

In [None]:
# Five titles with the largest summed dislikes in the IN/US subset.
(
    country.groupby('title')[['dislikes']]
    .sum()
    .sort_values('dislikes', ascending=False)
    .head()
)

In [None]:
# Five titles with the largest summed likes in the IN/US subset.
(
    country.groupby('title')[['likes']]
    .sum()
    .sort_values('likes', ascending=False)
    .head()
)

In [None]:
# Five titles with the largest summed view counts in the IN/US subset.
(
    country.groupby('title')[['view_count']]
    .sum()
    .sort_values('view_count', ascending=False)
    .head()
)

In [None]:
# Views, likes and dislikes summed per title, top five titles by views.
dashbr = (
    country.groupby('title')[['view_count', 'likes', 'dislikes']]
    .sum()
    .sort_values('view_count', ascending=False)
    .head()
)
dashbr

In [None]:
country.sample()

In [None]:
def correlation(df, varT, xpoint=-0.5, showGraph=True):
    """List the columns whose correlation with ``varT`` exceeds ``xpoint``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. Only numeric and boolean columns take part in the
        correlation matrix; other columns are ignored.
    varT : str
        Target column. It must be numeric or boolean, otherwise the lookup
        in the correlation matrix raises ``KeyError``.
    xpoint : float, default -0.5
        Threshold; a column is selected when its correlation with ``varT``
        is strictly greater than this value.
    showGraph : bool, default True
        When true, draw an annotated heatmap of the whole correlation matrix.

    Returns
    -------
    list
        Selected column labels in matrix order; ``varT`` itself is excluded.
    """
    # Select the numeric/bool columns explicitly. pandas >= 2.0 no longer
    # drops non-numeric columns inside corr() and raises on string data;
    # select_dtypes behaves the same on old and new versions.
    corr = df.select_dtypes(include=['number', 'bool']).corr()
    print(f'\nFeatures correlation:\n'
          f'Target: {varT}\n'
          f'Reference.: {xpoint}\n'
          f'\nMain features:')
    corrs = corr[varT]
    features = []
    # Iterate label/value pairs rather than corrs[i]: integer keys on a
    # label-indexed Series are deprecated as positional access.
    for name, value in corrs.items():
        if value > xpoint and name != varT:
            print(name, f'{value:.2f}')
            features.append(name)
    if showGraph:
        sns.heatmap(corr,
                    annot=True, fmt='.2f', vmin=-1, vmax=1, linewidths=0.01,
                    linecolor='black', cmap='RdBu_r'
                    )
        plt.title('Correlations between features w/ target')
        plt.show()
    return features

In [None]:
varTarget = 'likes'

In [None]:
varFeatures = correlation(country, varTarget, 0.5)

In [None]:
def removeOutliers(out, varTarget):
    """Return the rows of ``out`` whose ``varTarget`` value is not an IQR outlier.

    A value is kept when it lies inside ``[Q1 - 1.5*IQR, Q3 + 1.5*IQR]``
    (both ends inclusive), where Q1/Q3 are the 25th/75th percentiles of the
    column and ``IQR = Q3 - Q1``. Rows with a missing target value are dropped
    because they fall outside any interval.

    Parameters
    ----------
    out : pandas.DataFrame
        Input frame; it is not modified.
    varTarget : str
        Name of the numeric column to filter on.

    Returns
    -------
    pandas.DataFrame
        A new frame with the retained rows, original index preserved.
    """
    print('\nOutliers\nRemoving ...', end='')
    target = out[varTarget]

    # Quartiles and interquartile range of the target column.
    q1 = target.quantile(.25)
    q3 = target.quantile(.75)
    iqr = q3 - q1

    # Fences beyond which a value counts as an outlier.
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr

    # between() includes both bounds by default. The former ``inclusive=True``
    # is rejected by pandas >= 2.0, which only accepts string values there.
    within = target.between(left=lower, right=upper)
    filtered = out[within].copy()

    # Same progress text as before, emitted in a single call.
    print('.' * 10 + '.....Done')

    return filtered

In [None]:
noOut = removeOutliers(country, varTarget)

In [None]:
# Two subplots
fig, (ax1, ax2) = plt.subplots(ncols=2, sharey=True, figsize=(15,5))
sns.boxplot(x=country[varTarget], ax=ax1).set_title('Original')
sns.boxplot(x=noOut[varTarget], ax=ax2).set_title('Original No outliers')

In [None]:
# Plotting dataset information
print(country[varTarget].describe())
sns.barplot(x=country[varTarget].describe().index[1:], y=country[varTarget].describe().values[1:])

In [None]:
print(noOut[varTarget].describe())
sns.barplot(x=noOut[varTarget].describe().index[1:], y=noOut[varTarget].describe().values[1:])

**Predictions**

In [None]:
varFeatures = correlation(noOut, varTarget, 0.5)

In [None]:
# ML Algorithms
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression, PoissonRegressor
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
from sklearn.dummy import DummyRegressor

# ML training and testing selection
from sklearn.model_selection import train_test_split
# Mean absolute error between the true and the predicted values
from sklearn.metrics import mean_absolute_error

In [None]:
regressors = [
#         DecisionTreeRegressor(),
#         RandomForestRegressor(),
        SVR(),
#         LinearRegression(),
#         GradientBoostingRegressor(),
#         PoissonRegressor(),
#         DummyRegressor(),
#         LogisticRegression(),
#         GaussianNB()
    ]

In [None]:
X = noOut[varFeatures]
y = noOut[varTarget]

In [None]:
type(X)

In [None]:
type(y)

In [None]:
y = pd.DataFrame(y)

In [None]:
y1 = y

In [None]:
type(y)

In [None]:
y = y['likes']

In [None]:
y

In [None]:
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.3, random_state=42)

In [None]:
Xtrain

In [None]:
ytrain

In [None]:
ytrain.shape

In [None]:
y1

In [None]:
type(y1)

In [None]:
y1.shape

In [None]:
y1.reset_index(drop=True)

In [None]:
y1

In [None]:
ytrain

In [None]:
Xtest

In [None]:
ytest

In [None]:
# Fit every candidate regressor on the training split and record its
# performance on the held-out split.
reg = []
mae = []
sco = []
for regressor in regressors:
    model = regressor
    model.fit(Xtrain, np.array(ytrain))
    # `previous` holds the predictions for the held-out rows.
    previous = model.predict(Xtest)
    # Score on the held-out split, like the MAE below. Scoring on the rows the
    # model was fitted on rewards overfitting, and this value is what the
    # results table is sorted by to pick the model.
    sco.append(model.score(Xtest, ytest))
    mae.append(round(mean_absolute_error(ytest, previous), 2))
    reg.append(regressor)

In [None]:
# Generating mean absolute error and score for the model
# One row per fitted regressor with its mean absolute error and its score.
meuMae = pd.DataFrame({'Regressor': reg, 'mae': mae, 'score': sco})

In [None]:
meuMae = meuMae.sort_values(by='score', ascending=False)
meuMae

In [None]:
meuMae["Regressor"].values[0]

In [None]:
# Take the first regressor of the sorted results table and refit it using
# view_count as the only feature.
model = meuMae['Regressor'].iloc[0]
x = noOut['view_count']
y = noOut[varTarget]
# The estimator expects a 2-D feature matrix, hence the single-column reshape.
model.fit(x.to_numpy().reshape(-1, 1), y)

In [None]:
# what is the prediction to 1 million views?
valFeatures = [1000000]
predict = float(model.predict([valFeatures]))

In [None]:
print(f'Summary:\n'
          f'Regs analyzed: {len(noOut)}\n'
          f'ML applied: {meuMae["Regressor"].values[0]}\n'
          f'Features analyzed:')

print(f' - {varFeatures[0]}: {valFeatures[0]}')

print(f"Predicted likes: {predict:.0f} ")

In [None]:
noOut[noOut.view_count > 1000000][['view_count', 'likes']].describe()

In [None]:
# Per-country row counts (go) and engagement totals (ao) after outlier removal.
by_country_filtered = noOut.groupby('country')
go = by_country_filtered[['countryId']].count()
ao = by_country_filtered[['likes', 'dislikes', 'view_count']].sum()

In [None]:
sns.barplot(x=go.index, y=go.countryId).set_title('Number videos WW')

In [None]:
sns.barplot(x=ao.index, y=ao.view_count).set_title('Sum of Views by country')

In [None]:
ao.view_count

In [None]:
ao.index

In [None]:
sns.barplot(x=ao.index, y=ao.likes).set_title('Sum of likes by country')

In [None]:
sns.barplot(x=ao.index, y=ao.dislikes).set_title('Sum of dislikes by country')

In [None]:
ao.index,ao.dislikes