## Load model and vectorizer

In [10]:
import os
import mlflow
import dagshub
from dotenv import load_dotenv
load_dotenv()

True

In [3]:
dagshub_token = os.getenv("DAGSHUB_PAT")
if not dagshub_token:
    raise EnvironmentError("DAGSHUB_PAT environment variable is not set")
os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_token
os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token

dagshub_url = "https://dagshub.com"
repo_owner = "Pranay5519"
repo_name = "yt-comment-sentiment-analysis"

# Set up MLflow tracking URI
mlflow.set_tracking_uri(f'{dagshub_url}/{repo_owner}/{repo_name}.mlflow')

In [5]:
model_uri = f"models:/yt_chrome_plugin_model/staging"
model = mlflow.pyfunc.load_model(model_uri)

 - mlflow (current: 3.8.1, required: mlflow==2.16.2)
 - cloudpickle (current: 3.1.2, required: cloudpickle==3.1.1)
 - matplotlib (current: 3.10.8, required: matplotlib==3.10.5)
 - numpy (current: 2.4.1, required: numpy==2.3.2)
 - pandas (current: 2.3.3, required: pandas==2.3.1)
 - pyarrow (current: 22.0.0, required: pyarrow==17.0.0)
 - scikit-learn (current: 1.8.0, required: scikit-learn==1.7.1)
 - scipy (current: 1.17.0, required: scipy==1.16.1)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [6]:
model

mlflow.pyfunc.loaded_model:
  artifact_path: lgbm_model
  flavor: mlflow.sklearn
  run_id: 8f046e5f6ad640859a094ad43af9bd8b

In [21]:
import joblib
tfidf_vectorizer = joblib.load(r"D:\yt-comment-sentiment-analysis2\tfidf_vectorizer.pkl")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


## Load Comments Function

In [4]:

import os
import joblib
import mlflow
import dagshub
from mlflow.tracking import MlflowClient
from dotenv import load_dotenv
load_dotenv()
def load_model_and_vectorizer(model_name, model_version, vectorizer_path):
    dagshub_token = os.getenv("DAGSHUB_PAT")
    if not dagshub_token:
        raise EnvironmentError("DAGSHUB_PAT environment variable is not set")

    os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_token
    os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token

    dagshub_url = "https://dagshub.com"
    repo_owner = "Pranay5519"
    repo_name = "yt-comment-sentiment-analysis"

    # Set up MLflow tracking URI
    mlflow.set_tracking_uri(f'{dagshub_url}/{repo_owner}/{repo_name}.mlflow')
    client = MlflowClient()
    model_uri = f"models:/{model_name}/{model_version}"
    model = mlflow.pyfunc.load_model(model_uri)
    vectorizer = joblib.load(vectorizer_path)  # Load the vectorizer
    return model, vectorizer
# Initialize the model and vectorizer
vec_path = r"D:\yt-comment-sentiment-analysis2\tfidf_vectorizer.pkl"
model, vectorizer = load_model_and_vectorizer("yt_chrome_plugin_model", "2", vec_path)  # Update paths and versions as needed


Downloading artifacts: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 7/7 [00:04<00:00,  1.44it/s]
 - mlflow (current: 3.8.1, required: mlflow==2.15.0)
 - cffi (current: 2.0.0, required: cffi==1.16.0)
 - cloudpickle (current: 3.1.2, required: cloudpickle==3.0.0)
 - lightgbm (current: 4.6.0, required: lightgbm==4.5.0)
 - matplotlib (current: 3.10.8, required: matplotlib==3.9.1)
 - numpy (current: 2.4.1, required: numpy==1.26.4)
 - pandas (current: 2.3.3, required: pandas==2.2.2)
 - psutil (current: 7.2.1, required: psutil==6.0.0)
 - pyarrow (current: 22.0.0, required: pyarrow==15.0.2)
 - scikit-learn (current: 1.8.0, required: scikit-learn==1.5.1)
 - scipy (current: 1.17.0, required: scipy==1.14.0)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model

In [5]:
import requests

def fetch_comments(video_id: str, api_key: str, max_comments: int = 500):
    comments = []
    page_token = ""

    try:
        while len(comments) < max_comments:
            url = "https://www.googleapis.com/youtube/v3/commentThreads"
            params = {
                "part": "snippet",
                "videoId": video_id,
                "maxResults": 100,
                "pageToken": page_token,
                "key": api_key
            }

            response = requests.get(url, params=params)
            response.raise_for_status()
            data = response.json()

            if "items" in data:
                for item in data["items"]:
                    snippet = item["snippet"]["topLevelComment"]["snippet"]

                    comment_text = snippet.get("textOriginal")
                    timestamp = snippet.get("publishedAt")
                    author_id = (
                        snippet.get("authorChannelId", {}).get("value", "Unknown")
                    )

                    comments.append({
                        "text": comment_text,
                        "timestamp": timestamp,
                        "authorId": author_id
                    })

                    if len(comments) >= max_comments:
                        break

            page_token = data.get("nextPageToken")
            if not page_token:
                break

    except requests.exceptions.RequestException as e:
        print("Error fetching comments:", e)

    return comments


In [6]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
def preprocess_comment(comment):
    """Apply preprocessing transformations to a comment."""
    try:
        # Convert to lowercase
        comment = comment.lower()

        # Remove trailing and leading whitespaces
        comment = comment.strip()

        # Remove newline characters
        comment = re.sub(r'\n', ' ', comment)

        # Remove non-alphanumeric characters, except punctuation
        comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

        # Remove stopwords but retain important ones for sentiment analysis
        stop_words = set(stopwords.words('english')) - {'not', 'but', 'however', 'no', 'yet'}
        comment = ' '.join([word for word in comment.split() if word not in stop_words])

        # Lemmatize the words
        lemmatizer = WordNetLemmatizer()
        comment = ' '.join([lemmatizer.lemmatize(word) for word in comment.split()])

        return comment
    except Exception as e:
        print(f"Error in preprocessing comment: {e}")
        return comment

In [8]:
API_KEY = os.getenv("API_KEY")
VIDEO_ID = "RmwbNdyrilk"

comments = fetch_comments(VIDEO_ID, API_KEY)
print(len(comments))
print(comments[0])


96
{'text': 'Thank you for watching. Please click the subscribe tab and then the "bell" icon to subscribe to our channel here on YouTube and get notified when new content is posted... And thank you for your interest in science!   -- Andrew', 'timestamp': '2026-01-15T13:23:20Z', 'authorId': 'UC2D2CMWXMOVWx7giW1n3LIg'}


In [9]:
import pandas as pd

def predict_sentiment(comments):
    if not comments:
        return {"error": "No comments provided"}

    try:
        # 1. Preprocess comments
        preprocessed_comments = [
            preprocess_comment(comment["text"]) for comment in comments
        ]

        # 2. Vectorize comments (sparse matrix)
        transformed_comments = vectorizer.transform(preprocessed_comments)

        # 3. Get expected schema columns from MLflow model
        input_schema = model.metadata.get_input_schema()
        expected_columns = input_schema.input_names()

        # 4. Convert sparse matrix to DataFrame
        feature_names = vectorizer.get_feature_names_out()
        df = pd.DataFrame(
            transformed_comments.toarray(),
            columns=feature_names
        )

        # 5. Add missing expected columns AT ONCE (fixes fragmentation)
        missing_cols = set(expected_columns) - set(df.columns)
        if missing_cols:
            missing_df = pd.DataFrame(
                0.0,
                index=df.index,
                columns=list(missing_cols)
            )
            df = pd.concat([df, missing_df], axis=1)

        # 6. Reorder columns exactly as model expects
        df = df[expected_columns]

        # 7. Predict
        predictions = model.predict(df).astype(str).tolist()

    except Exception as e:
        return {"error": str(e)}

    # 8. Build response
    return [
        {"comment": comment, "sentiment": sentiment}
        for comment, sentiment in zip(comments, predictions)
    ]
import pandas as pd

def predict_with_timestamps(comments_data):
    if not comments_data:
        return {"error": "No comments provided"}

    try:
        # 1. Extract text and timestamps
        comments = [item["text"] for item in comments_data]
        timestamps = [item["timestamp"] for item in comments_data]

        # 2. Preprocess comments
        preprocessed_comments = [
            preprocess_comment(comment) for comment in comments
        ]

        # 3. Vectorize comments
        transformed_comments = vectorizer.transform(preprocessed_comments)

        # 4. Get expected schema columns from MLflow model
        input_schema = model.metadata.get_input_schema()
        expected_columns = input_schema.input_names()

        # 5. Convert sparse matrix to DataFrame
        feature_names = vectorizer.get_feature_names_out()
        df = pd.DataFrame(
            transformed_comments.toarray(),
            columns=feature_names
        )

        # 6. Add missing columns in ONE operation (fixes warning)
        missing_cols = set(expected_columns) - set(df.columns)
        if missing_cols:
            missing_df = pd.DataFrame(
                0.0,
                index=df.index,
                columns=list(missing_cols)
            )
            df = pd.concat([df, missing_df], axis=1)

        # 7. Reorder columns exactly as model expects
        df = df[expected_columns]

        # 8. Predict
        predictions = model.predict(df).astype(str).tolist()

    except Exception as e:
        return {"error": f"Prediction failed: {str(e)}"}

    # 9. Build response with timestamps
    return [
        {
            "comment": comment,
            "sentiment": sentiment,
            "timestamp": timestamp
        }
        for comment, sentiment, timestamp in zip(
            comments, predictions, timestamps
        )
    ]


In [11]:
predict_sentiment(comments=comments)

[{'comment': {'text': 'Thank you for watching. Please click the subscribe tab and then the "bell" icon to subscribe to our channel here on YouTube and get notified when new content is posted... And thank you for your interest in science!   -- Andrew',
   'timestamp': '2026-01-15T13:23:20Z',
   'authorId': 'UC2D2CMWXMOVWx7giW1n3LIg'},
  'sentiment': '1'},
 {'comment': {'text': 'To make a better world‚úåüèº',
   'timestamp': '2026-01-18T05:46:32Z',
   'authorId': 'UCD9dMGvb-NXnTl-qLge4vTA'},
  'sentiment': '1'},
 {'comment': {'text': 'Can we get a collaboration with cam hanes and goggins with other ultra triathlon/marathon athletes and other high intense athletes and have Dr. Andrew Huberman, Dr. Wendy Suzuki, and Robert Sapolsky dissect the correlation between exercise, will power, and building new neural processes in the brain and how they interact with each other?',
   'timestamp': '2026-01-18T04:34:25Z',
   'authorId': 'UC217yaW7WVxsDdaXeQPDIhg'},
  'sentiment': '1'},
 {'comment': {

In [None]:
import pandas as pd
transformed_comments = tfidf_vectorizer.transform([comment])
input_schema = model.metadata.get_input_schema()
expected_columns = input_schema.input_names()

# Convert sparse matrix to DataFrame with vectorizer features
feature_names = tfidf_vectorizer.get_feature_names_out()
df = pd.DataFrame(transformed_comments.toarray(), columns=feature_names)
print(df.shape)
 # Add missing expected columns with zeros
for col in expected_columns:
    if col not in df.columns:
        df[col] = 0.0
# Reorder columns exactly as model expects
df = df[expected_columns]
predictions = model.predict(df).tolist()
predictions = [str(pred) for pred in predictions]
predictions

(1, 10000)


['1']

In [37]:
print(transformed_comments)

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 10000)>
  Coords	Values
  (0, 575)	0.38752454417944343
  (0, 2715)	0.33319391100353063
  (0, 2857)	0.3886141199705799
  (0, 5231)	0.25132656002531284
  (0, 5485)	0.4152417565499547
  (0, 6732)	0.41003993171670844
  (0, 6929)	0.35366413515959577
  (0, 8319)	0.2428657835737687
