Data Collection

In [2]:
#install the google api client
!pip install google-api-python-client



In [3]:
#Import required libraries
from googleapiclient.discovery import build
import pandas as pd

In [4]:
API_KEY = "AIzaSyAdPWudQYO3JrK8JDMLoe2hTXHGpY6fRFA"
youtube = build('youtube','v3',developerKey=API_KEY)

#function to fetch comments from a Youtube video
def get_video_comments(video_id,max_comments=100):
  comments = []
  request = youtube.commentThreads().list(
      part = "snippet",
      videoId = video_id,
      maxResults = max_comments
  )
  response = request.execute()
  while response and len(comments) < max_comments:
    for item in response['items']:
      comment = item['snippet']['topLevelComment']['snippet']['textDisplay']
      comments.append(comment)

    if 'nextPageToken' in response:
      request = youtube.commentThreads().list(
          part = "snippet",
          videoId = video_id,
          pageToken = response['nextPageToken'],
          maxResults = max_comments
      )
      response = request.execute()
    else:
      break
  return comments[:max_comments]

In [5]:
#Example usage
video_id = "ZwviinaA7co"
comments = get_video_comments(video_id)

In [6]:
# Save comments to a CSV file
df = pd.DataFrame(comments, columns=['Comment'])
df.to_csv("youtube_comments.csv", index=False)
print(f"Saved {len(comments)} comments to youtube_comments.csv")


Saved 100 comments to youtube_comments.csv


Data Preprocessing

In [7]:
# Install necessary libraries
!pip install nltk spacy
!python -m spacy download en_core_web_sm

Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m29.0 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [8]:
import pandas as pd
import re
import spacy
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [9]:
nlp = spacy.load("en_core_web_sm")

In [10]:
df = pd.read_csv("youtube_comments.csv")

In [11]:
# Preprocessing functions
def clean_text(text):
    text = re.sub(r"http\S+", "", text)  # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)  # Remove non-alphabetic characters
    return text.lower()

def lemmatize_text(text):
    doc = nlp(text)
    return " ".join([token.lemma_ for token in doc if not token.is_stop])


In [12]:
# Apply preprocessing
df['Cleaned_Comment'] = df['Comment'].apply(clean_text).apply(lemmatize_text)
df.to_csv("cleaned_comments.csv", index=False)
print("Preprocessed data saved to cleaned_comments.csv")


Preprocessed data saved to cleaned_comments.csv


Data Annotation (Sentiment Analysis)

In [13]:
# Install VADER sentiment library
!pip install vaderSentiment


Collecting vaderSentiment
  Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl.metadata (572 bytes)
Downloading vaderSentiment-3.3.2-py2.py3-none-any.whl (125 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m126.0/126.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: vaderSentiment
Successfully installed vaderSentiment-3.3.2


In [14]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer


In [15]:
analyzer = SentimentIntensityAnalyzer()


In [16]:
def get_sentiment(comment):
    score = analyzer.polarity_scores(comment)
    if score['compound'] > 0.05:
        return "Positive"
    elif score['compound'] < -0.05:
        return "Negative"
    else:
        return "Neutral"


In [17]:
# Annotate sentiments
df['Sentiment'] = df['Cleaned_Comment'].apply(get_sentiment)
df.to_csv("annotated_comments.csv", index=False)
print("Annotated data saved to annotated_comments.csv")


Annotated data saved to annotated_comments.csv


Model Training (ML)

In [18]:
# Install scikit-learn
!pip install scikit-learn




In [19]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report


In [20]:
df = pd.read_csv("annotated_comments.csv")


In [21]:
# Prepare data
X = df['Cleaned_Comment']
y = df['Sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [22]:
# Check for NaN values in the dataset
print(X_train.isna().sum())  # This will print the number of NaN values in each column


3


In [23]:
# Remove rows with NaN values
X_train = X_train.dropna()
y_train = y_train[X_train.index]
X_test = X_test.dropna()
y_test = y_test[X_test.index]


In [24]:
print(X_train.isna().sum())

0


In [25]:
# Vectorize text using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)


In [26]:
# Train a Random Forest model
model = RandomForestClassifier()
model.fit(X_train_tfidf, y_train)


In [27]:
# Evaluate model
y_pred = model.predict(X_test_tfidf)
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

    Negative       1.00      0.56      0.71         9
     Neutral       0.67      1.00      0.80        10
    Positive       0.00      0.00      0.00         1

    accuracy                           0.75        20
   macro avg       0.56      0.52      0.50        20
weighted avg       0.78      0.75      0.72        20



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [28]:
# Evaluate model accuracy, precision, recall, F1-score
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.75
