In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# read csv
df_CA = pd.read_csv("archive/CAvideos.csv")
df_US = pd.read_csv("archive/USvideos.csv")
df_GB = pd.read_csv("archive/GBvideos.csv")

In [3]:
df_CA

In [4]:
print(list(df_CA.columns.values))

In [5]:
print(len(df_CA), len(df_US), len(df_GB))

In [6]:
df = pd.concat([df_CA , df_US, df_GB])
len(df)

In [7]:
# only keep ['video_id', 'title', 'channel_title', 'category_id', 'description']
df = df[['video_id', 'title', 'category_id', 'tags','description']]

In [8]:
a = df['video_id'].isna().sum()
b = df['title'].isna().sum()
c = df['category_id'].isna().sum()
d = df['tags'].isna().sum()
e = df['description'].isna().sum()

print(a, b, c, d, e)

In [9]:
df = df.dropna()
len(df)

Identify a dataset to study, and perform an exploratory analysis of the data. Describe the dataset,
including its basic statistics and properties, and report any interesting findings. This exploratory analysis
should motivate the design of your model in the following sections

In [10]:
df['category_id'].plot.hist(bins = 60, figsize=(8,5))
plt.xlabel('Category IDs')
plt.ylabel('Frequency')
plt.title('Distribution of Category IDs')

In [11]:
cat_list = df['category_id'].to_list()

In [12]:
from collections import Counter

In [13]:
cat_counter = Counter(cat_list)
len(cat_counter.keys())

In [14]:
cat_counter.most_common(len(cat_counter.keys()))

In [15]:
# unique vals
cat_counter.keys()

In [16]:
print((cat_counter.most_common(1))[0][1]/len(cat_list))
print((cat_counter.most_common(2))[1][1]/len(cat_list))

In [17]:
cats = df['category_id'].value_counts()
cat_chart = [cats]


my_labels = [str(key) for key in cat_counter.keys()]

plt.pie(cat_chart, labels=my_labels)

plt.title('Youtube Categories')
plt.axis('equal')

plt.show()

In [18]:
# getting rid of all ids after 17, the top 10
top_cats = cat_counter.keys()
top_cats = list(top_cats)[0:10]
top_cats = set(top_cats)
top_cats

In [19]:
# df containing only the top 10
new_df = pd.DataFrame()

for cat in top_cats:
    new_df = pd.concat([new_df , df[df['category_id'] == cat] ]) 

In [20]:
len(new_df)

In [21]:
cat_list = new_df['category_id'].to_list()
cat_counter = Counter(cat_list)
cat_counter.most_common(len(cat_counter.keys()))

**Model Testing**

In [22]:
import gzip
from collections import defaultdict
import random
import string
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer

In [23]:
# the data to work with
df_all = new_df

In [24]:
df_all.set_index('video_id', inplace=True)

In [25]:
df_all

In [26]:
df_all.iloc[0]

In [27]:
y_data = df_all["category_id"].tolist()

In [28]:
df = df_all.drop('category_id', axis=1)

In [29]:
df

**Transformed Text**

In [30]:
import re
def preprocess_text(text_data_arr):
    
#     for i,data in enumerate(text_data_arr):
#         text_data_arr[i] = re.sub(r"http\S+", "", data, flags=re.MULTILINE)

    tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')
    
    features = tfidf.fit_transform(text_data_arr)
    
    return features, tfidf

In [31]:
X_train, X_test, y_train, y_test = train_test_split(df['description'].tolist(), y_data, test_size=0.20, train_size = 0.8, random_state=42)

In [32]:
X_train[0]

In [33]:
X_train, tfidf = preprocess_text(X_train)

In [34]:
X_test = tfidf.transform(X_test)

In [35]:
X_train.shape, X_test.shape

((89008, 279440), (22252, 279440))

In [36]:
#Calculate accuracy
def val_accuracy(predictions, truths):
    correct = 0
    for prediction, actual in zip(predictions, truths):
        if prediction == actual:
            correct += 1
    return correct/len(truths)

**Logistic Regression**

In [37]:
from sklearn.linear_model import LogisticRegression

In [38]:
lr_model = LogisticRegression(random_state=0)
lr_model.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=0, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [39]:
predictions = lr_model.predict(X_test)

val_acc = val_accuracy(predictions, y_test)
print("Val Acc = {}".format(val_acc))

Val Acc = 0.9170411648391156


**Support Vector Machine (SVM)**

In [40]:
from sklearn.svm import LinearSVC, SVC
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from scipy.stats import reciprocal, uniform

In [41]:
grid = LinearSVC(C=0.3)
grid.fit(X_train, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1, loss='squared_hinge', max_iter=1000,
          multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
          verbose=0)

In [42]:
predictions = grid.predict(X_test)
val_acc = val_accuracy(predictions, y_test)
print("Val Acc = {}".format(val_acc))

Val Acc = 0.9621607046557613


**Random Forest**

In [43]:
from sklearn.ensemble import RandomForestClassifier

In [44]:
rfc_model = RandomForestClassifier(random_state=0)
rfc_model.fit(X_train, y_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [45]:
predictions = rfc_model.predict(X_test)

val_acc = val_accuracy(predictions, y_test)
print("Val Acc = {}".format(val_acc))

Val Acc = 0.8757864461621427


**Custom Testing**

In [46]:
def predict(model, tfidf, entry):
    Z_test = tfidf.transform([entry])
    return model.predict(Z_test)[0]

In [47]:
df_all.iloc[80890]

title                             SMALLFOOT - Official Trailer 1
category_id                                                   24
tags           smallfoot|"smallfoot trailer"|"zendaya"|"chann...
description    Only In Theaters September 28, 2018--www.small...
Name: 34cHO5_LX9g, dtype: object

In [48]:
predict(grid, tfidf, df_all.iloc[80890]['description'])

24

In [49]:
df_all.iloc[11111]

title          Pitbull, Fifth Harmony - Por Favor (Official V...
category_id                                                   10
tags           Becky G|"Daddy Yankee"|"Ozuna"|"Wisin"|"Fifth ...
description    Listen now @ all your favorite platforms click...
Name: uZ1M3DkKCEI, dtype: object

In [50]:
predict(grid, tfidf, df_all.iloc[11111]['description'])

10