In [12]:
from utils import load_csv_data
import datasets
import torch, spacy
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import classification_report

In [4]:
# this part is the same as in "calculating_embeddings" to load the datasets

# get device proper device to run on
if torch.cuda.is_available():
    device = "cuda"
else:
    # calculating the embeddings on a cpu can take hours, not recommended
    device = "cpu"
all_records_df = load_csv_data("../data/all_records_csvs")

# find and delete null values
print("counting null values")
print(all_records_df.isnull().sum())

all_records_df.dropna(axis=0, inplace=True)

# also we want a df of individual videos that are identified by title and description
videos_df = all_records_df.drop_duplicates(["title", "description"], keep="last")
# note: if you want the first record in videos_df, enter vidoes_df.iloc[0] not videos_df[0]
# as the later will try to return the record that has index or id = 0  wich is the index in the all_records_df

counting null values
title                             1
views                             0
time-stamp-upload-milliseconds    0
time-stamp                        0
date-time-hr                      0
upload-time-hr                    0
upload-time-stamp                 0
description                       0
video-length                      0
video-length-milliseconds         0
channel                           0
dtype: int64


In [5]:
print(videos_df)

                                                    title   views   
28      Twenty years on John Bolton is still defending...   20438  \
29      Paul Bremer on Iraq occupation mistakes and ac...    2829   
88      Indian opposition leader Rahul Gandhi disquali...   29224   
89      India’s Rahul Gandhi disqualified from parliament   64930   
179     TikTok CEO Shou Zi Chew faces off with US legi...    7052   
...                                                   ...     ...   
228295  How does Bill Clinton's tapes-in-sock-drawer c...  218547   
228296  Judge Jeanine: Biden knows he has a lot to hid...  146345   
228297  What about the 17 recordings implicating Presi...   41617   
228298  Jesse Watters: Biden banned a trans flasher fr...  438730   
228299    JD Vance: The DOJ going after Trump is ‘insane’   37934   

        time-stamp-upload-milliseconds     time-stamp   
28                            87102000  1679754890966  \
29                            87119000  1679754890967   


# Labelling dataset for supervised learning
- There are no labels provided in the dataset, nevertheless, we would like to try out supervised learning.
- 10% of the videos are randomly selected below for training.

In [6]:
# randomly select 10% of the dataset and save as csv file
videos_df.sample(n=round(videos_df.shape[0]*0.1), random_state = 100).to_csv(r'..\data\supervised_training.csv', index=False)


- Now the labels have to be done manually and the labelled data is saved in another csv file as `..\data\supervised_training_labelled.csv`.
- After that, this labelled file is read as a dataframe again.
- This labelled data is split into train set (80%) and test set (20%).

In [9]:
# get the labelled data for training
df = pd.read_csv('..\data\supervised_training_labelled.csv')

# split the dataset into training and testing sets
train_df, test_df = train_test_split(df, test_size=0.2, random_state=123)


# Features
- 2 features are implemented here: TF-IDF and Bag-of-Words.

In [10]:
# feature extraction using TF-IDF
vectorizer_TfIdf = TfidfVectorizer()
X_train_TfIdf = vectorizer_TfIdf.fit_transform(train_df['title'] + ' ' + train_df['description'])
Y_train_TfIdf = train_df['topic']
X_test_TfIdf = vectorizer_TfIdf.transform(test_df['title'] + ' ' + test_df['description'])
Y_test_TfIdf = test_df['topic']

In [13]:
# feature extraction using Bag-of-Words (BoW)
vectorizer_BoW = CountVectorizer()
X_train_BoW = vectorizer_BoW.fit_transform(train_df['title'] + ' ' + train_df['description'])
Y_train_BoW = train_df['topic']
X_test_BoW = vectorizer_BoW.transform(test_df['title'] + ' ' + test_df['description'])
Y_test_BoW = test_df['topic']

# Learning
- SVM is applied separately on TF-IDF and BoW.
- The results are then compared.

In [14]:
# apply Support Vector Machine (SVM) Classifier
# on TF-IDF
svm_classifier_TfIdf = SVC()
svm_classifier_TfIdf.fit(X_train_TfIdf, Y_train_TfIdf)
svm_predictions_TfIdf = svm_classifier_TfIdf.predict(X_test_TfIdf)

print('SVM Classifier Results using TF-IDF:')
print(classification_report(Y_test_TfIdf, svm_predictions_TfIdf))
print()

# on Bag-of-Words
svm_classifier_BoW = SVC()
svm_classifier_BoW.fit(X_train_BoW, Y_train_BoW)
svm_predictions_BoW = svm_classifier_BoW.predict(X_test_BoW)

print('SVM Classifier Results using Bag-of-Words:')
print(classification_report(Y_test_BoW, svm_predictions_BoW))
print()


SVM Classifier Results using TF-IDF:
                      precision    recall  f1-score   support

                   ?       0.00      0.00      0.00         1
            Business       0.00      0.00      0.00         7
   Culture & Leisure       0.00      0.00      0.00         4
           Editorial       0.86      0.67      0.75        18
         Environment       0.00      0.00      0.00         3
              Health       0.00      0.00      0.00         1
       International       0.48      0.78      0.60        46
               Local       0.73      0.36      0.48        22
            Politics       0.59      0.76      0.67        46
Science & Technology       0.00      0.00      0.00         9
              Sports       0.00      0.00      0.00         2

            accuracy                           0.57       159
           macro avg       0.24      0.23      0.23       159
        weighted avg       0.51      0.57      0.52       159




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


SVM Classifier Results using Bag-of-Words:
                      precision    recall  f1-score   support

                   ?       0.00      0.00      0.00         1
            Business       0.00      0.00      0.00         7
   Culture & Leisure       0.00      0.00      0.00         4
           Editorial       0.74      0.78      0.76        18
         Environment       0.00      0.00      0.00         3
              Health       0.00      0.00      0.00         1
       International       0.51      0.76      0.61        46
               Local       0.50      0.32      0.39        22
            Politics       0.64      0.80      0.71        46
Science & Technology       0.00      0.00      0.00         9
              Sports       0.00      0.00      0.00         2

            accuracy                           0.58       159
           macro avg       0.22      0.24      0.22       159
        weighted avg       0.49      0.58      0.52       159




  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


- TF-IDF has an accuracy of 57% whereas BoW 58%. So these two features performed similarly and not well on the dataset.

# Clustering on the entire dataset
- Although the performance metrics are not good, they are still applied to our dataset.
- Classes labelled by SVM, using TF-IDF and BoW separately, are stored into the dataframe and also a csv file.

In [16]:
# extract features from the entire dataset
X_allVideos_TfIdf = vectorizer_TfIdf.transform(videos_df['title'] + ' ' + videos_df['description'])
X_allVideos_BoW = vectorizer_BoW.transform(videos_df['title'] + ' ' + videos_df['description'])

# store the classes into new columns
videos_df['topic(TF-IDF)'] = svm_classifier_TfIdf.predict(X_allVideos_TfIdf)
videos_df['topic(BoW)'] = svm_classifier_BoW.predict(X_allVideos_BoW)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  videos_df['topic(TF-IDF)'] = svm_classifier_TfIdf.predict(X_allVideos_TfIdf)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  videos_df['topic(BoW)'] = svm_classifier_BoW.predict(X_allVideos_BoW)


In [17]:
# export to csv file
videos_df.to_csv(r'..\data\supervised_results.csv', index=False)