In [1]:
import pandas as pd
import numpy as np
import re
import nltk
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess
import torch.optim as optim
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, f1_score

In [2]:
df = pd.read_csv('C:\\Users\cheek\\ML-7641-Team14\\dataset\\output\\combined.csv')

In [3]:
df.drop_duplicates(subset='title', keep='first', inplace=True)

In [4]:
df.columns

Index(['video_id', 'title', 'publishedAt', 'channelId', 'channelTitle',
       'categoryId', 'trending_date', 'tags', 'views per day', 'likes per day',
       'dislikes per day', 'comments per day', 'thumbnail_link', 'rating',
       'description', 'trending'],
      dtype='object')

In [5]:
df.drop(['publishedAt', 'trending_date', 'views per day', 'likes per day', 
       'dislikes per day', 'comments per day', 'thumbnail_link', 'rating',], axis=1, inplace=True)

In [6]:
df.drop('video_id', axis=1, inplace=True)

In [7]:
df.columns

Index(['title', 'channelId', 'channelTitle', 'categoryId', 'tags',
       'description', 'trending'],
      dtype='object')

In [8]:
#nltk.download('popular', download_dir='C:\\Users\cheek\\ML-7641-Team14\\shravan\\nltk_data')

In [9]:
nltk.data.path.append('C:/Users/cheek/ML-7641-Team14/shravan/nltk_data')

In [10]:
stopwords = nltk.corpus.stopwords.words('english')

In [11]:
def clean_text(text):
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    text = text.lower()
    words = nltk.word_tokenize(text)
    filtered = []
    for word in words:
        if word not in stopwords:
            filtered.append(word)
    text = ' '.join(filtered)
    return text

In [12]:
df['lang'] = df['title'] + ' ' + df['tags'] + ' ' + df['description']

In [13]:
try:
    df['lang'] = df['lang'].apply(lambda x: clean_text(x) if isinstance(x, (str, bytes)) else x)
except LookupError as e:
    print(f"Error occurred: {e}")


In [14]:
from collections import Counter
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from typing import List

In [15]:
df = df.dropna(subset=['lang'])


In [16]:
sentences = [text.split() for text in df['lang']]
model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

df['embeddings'] = df['lang'].apply(lambda x: [model.wv[word] for word in x.split() if word in model.wv])
df['embeddings'] = df['embeddings'].apply(lambda x: ' '.join([' '.join(map(str, arr)) for arr in x]))

In [17]:
class Net(nn.Module):
    def __init__(self, embedding_dim):
        super(Net, self).__init__()
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding.from_pretrained(torch.Tensor(model.wv.vectors))
        self.fc1 = nn.Linear(embedding_dim, 32)
        self.fc2 = nn.Linear(32, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        embedded = self.embedding(x).mean(dim=1)
        out = self.fc1(embedded)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

In [18]:
num_epochs = 10
batch_size = 32
learning_rate = 0.01

In [22]:
X = df['embeddings'].tolist()
y = df['trending'].tolist()
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
# X_train = [torch.tensor(x) for x in X_train]
# y_train = torch.tensor(y_train).unsqueeze(1).float()
# X_val = [torch.tensor(x) for x in X_val]
# y_val = torch.tensor(y_val).unsqueeze(1).float()
# train_dataset = [(x, y) for x, y in zip(X_train, y_train)]
# train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# val_dataset = [(x, y) for x, y in zip(X_val, y_val)]
# val_loader = DataLoader(val_dataset, batch_size=batch_size)


In [20]:
pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('classifier', SVC(kernel='linear', C=1.0))
])

pipeline.fit(X_train, y_train)

y_pred = pipeline.predict(X_val)

accuracy = accuracy_score(y_val, y_pred)
f1 = f1_score(y_val, y_pred, average='weighted')

print('Accuracy:', accuracy)
print('F1 Score:', f1)

Accuracy: 0.4838709677419355
F1 Score: 0.48323512551426223


In [21]:
# model = Net(model.vector_size)
# criterion = nn.BCELoss()
# optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# for epoch in range(num_epochs):
#     model.train()
#     train_loss = 0
#     for X_batch, y_batch in train_loader:
#         optimizer.zero_grad()
#         outputs = model(X_batch)
#         loss = criterion(outputs, y_batch)
#         loss.backward()
#         optimizer.step()
#         train_loss += loss.item() * X_batch.size(0)
#     train_loss /= len(train_loader.dataset)

#     model.eval()
#     val_loss = 0
#     val_acc = 0
#     with torch.no_grad():
#         for X_batch, y_batch in val_loader:
#             outputs = model(X_batch)
#             loss = criterion(outputs, y_batch)
#             val_loss += loss.item() * X_batch.size(0)
#             predictions = (outputs >= 0.5).float()
#             val_acc += torch.sum(predictions == y_batch).item()
#     val_loss /= len(val_loader.dataset)
#     val_acc /= len(val_loader.dataset)

#     print(f'Epoch {epoch+1}/{num_epochs}, train_loss: {train_loss:.4f}, val_loss: {val_loss:.4f}, val_acc: {val_acc:.4f}')