In [13]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
import re
import os
# Fetch the NLTK data used further down: the stopword corpus and WordNet
# (required by WordNetLemmatizer). The cell displays the last call's result.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [14]:
# Location of the labelled Reddit comments CSV.
DATA_URL = "https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv"

df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [15]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
clean_comment,100
category,0


In [16]:
# Discard rows that contain any missing value.
df = df.dropna()

In [17]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [18]:
# Drop comments that contain nothing but whitespace (not only the single-space
# case). `.copy()` makes the result an independent frame so that adding columns
# later does not act on a slice of the original.
df = df[df['clean_comment'].str.strip() != ''].copy()

# Preprocessor Function
* Remove URLs
* Remove unnecessary stopwords
* Remove `\n` and `\t`
* Apply lemmatization



In [19]:
# Words removed from the stopword list so that they survive filtering
# (negations and connectives that can change the sentiment of a sentence).
_KEPT_STOPWORDS = frozenset({"not", "but", "because", "never", "no", "about"})

# Matches http(s):// links and bare www. links up to the next whitespace.
_URL_PATTERN = re.compile(r'http[s]?://\S+|www\.\S+')

# Filled on first use so the corpus is read once instead of once per comment.
_PREPROCESS_CACHE = {}


def _preprocess_resources():
  """Return the (stopword set, lemmatizer) pair, building it on first use."""
  if not _PREPROCESS_CACHE:
    _PREPROCESS_CACHE.update(
      stopwords=set(stopwords.words('english')) - _KEPT_STOPWORDS,
      lemmatizer=WordNetLemmatizer(),
    )
  return _PREPROCESS_CACHE["stopwords"], _PREPROCESS_CACHE["lemmatizer"]


def preprocess(text):
  """
  Clean one comment: lower-case, strip URLs and newlines/tabs, drop stopwords
  and lemmatize the remaining tokens.

  Args:
    text: The raw comment. Anything that is not a ``str`` (e.g. NaN/None) is
      treated as an empty comment.

  Returns:
    The cleaned tokens joined by single spaces ("" for non-string input).

  Raises:
    Errors from NLTK (such as a missing corpus) are propagated instead of
    being returned as text, so a broken setup cannot end up in the features.
  """
  # Non-text values have nothing to clean; never return an error message as data.
  if not isinstance(text, str):
    return ""

  # convert the text into lower case and also remove the unnecessary spaces
  text = text.lower().strip()

  # remove the urls
  text = _URL_PATTERN.sub('', text)

  # Replace \n and \t with a space so split() sees word boundaries
  text = text.replace("\n", " ").replace("\t", " ")

  stopword, lemmatizer = _preprocess_resources()

  # Remove the unnecessary stopwords, then lemmatize what is left
  tokens = [word for word in text.split() if word not in stopword]
  return " ".join(lemmatizer.lemmatize(token) for token in tokens)

# Test the function

In [20]:
# Build a noisy sample: upper-cased first comment + a URL + trailing whitespace.
sample_text = df['clean_comment'][0].upper()
sample_text += "https://github.com/Sami606713/youtube_comment_analysis/tree/main"
sample_text += "\n\n\t\t\t"
preprocess(sample_text)

'family mormon never tried explain still stare puzzled time time like kind strange creature nonetheless come admire patience calmness equanimity acceptance compassion developed thing buddhism teach'

In [21]:
# Show the raw input that was fed to preprocess() above, for comparison.
"".join([
    df['clean_comment'][0].upper(),
    "https://github.com/Sami606713/youtube_comment_analysis/tree/main",
    "\n\n\t\t\t",
])

' FAMILY MORMON HAVE NEVER TRIED EXPLAIN THEM THEY STILL STARE PUZZLED FROM TIME TIME LIKE SOME KIND STRANGE CREATURE NONETHELESS THEY HAVE COME ADMIRE FOR THE PATIENCE CALMNESS EQUANIMITY ACCEPTANCE AND COMPASSION HAVE DEVELOPED ALL THE THINGS BUDDHISM TEACHES https://github.com/Sami606713/youtube_comment_analysis/tree/main\n\n\t\t\t'

# Apply the function

In [22]:
%%time
df["final_comments"]=df['clean_comment'].apply(preprocess)

CPU times: user 13.8 s, sys: 1.03 s, total: 14.9 s
Wall time: 23.5 s


# Connect Colab to MLflow via DagsHub

In [None]:
!pip install mlflow dagshub

In [23]:
import mlflow
import dagshub
# Connect to DagsHub with a token read from the environment. A token must never
# be committed in the notebook; any token that was hard-coded here (or that
# shows up in saved MLflow log output through the tracking URI) has to be
# revoked and regenerated.
dagshub_token=os.environ.get("DAGSHUB_TOKEN")
if dagshub_token:
    # MLflow looks up these exact upper-case names; environment variable names
    # are case-sensitive, so 'MLFlow_...' would be ignored.
    os.environ['MLFLOW_TRACKING_USERNAME']=dagshub_token
    os.environ['MLFLOW_TRACKING_PASSWORD']=dagshub_token
    # Keep the credentials out of the URI so they are not echoed in run logs.
    mlflow.set_tracking_uri('https://dagshub.com/Sami606713/youtube_comment_analysis.mlflow')

    print("DagsHub login successful!")
else:
    print("DagsHub token not found. Please set the DAGSHUB_TOKEN environment variable.")

DagsHub login successful!


# What we can log
- Train test split
- log the data
- convert the text into vector
- train the model
- log the model
- log the metrics

In [24]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score,classification_report

In [25]:
# Keep only the cleaned text and its label for modelling.
final_df = df.loc[:, ['final_comments', 'category']]
final_df.head()

Unnamed: 0,final_comments,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [90]:
# Separate the feature (kept as a one-column DataFrame) from the label Series
feature, target = final_df[['final_comments']], final_df['category']

In [68]:
# Peek at the first two feature rows.
feature.iloc[:2]

Unnamed: 0,final_comments
0,family mormon never tried explain still stare ...
1,buddhism much lot compatible christianity espe...


In [91]:
# Re-encode the labels as 0 = negative, 1 = neutral, 2 = positive.
# Mapping from the raw column (instead of `target` itself) keeps this cell
# idempotent: running it twice no longer turns the labels into NaN.
target=final_df['category'].map({
    -1:0,
    0:1,
    1:2
})
target

Unnamed: 0,category
0,2
1,2
2,0
3,1
4,2
...,...
37244,1
37245,2
37246,1
37247,2


In [92]:
# target.value_counts()
# Label encoding: 0 = negative, 1 = neutral, 2 = positive

In [93]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    feature,
    target,
    test_size=0.2,
    random_state=42,
)

In [94]:
# Sanity-check that features and labels have the same number of rows.
(x_train.shape, y_train.shape)

((29438, 1), (29438,))

In [65]:
# Build a pipeline for vectorize the data and train the model
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

In [66]:
# Build a transformer: bigram TF-IDF on the text column, other columns passed through
bigram_tfidf = TfidfVectorizer(ngram_range=(2, 2))
transformer = ColumnTransformer(
    transformers=[("Encode", bigram_tfidf, 'final_comments')],
    remainder="passthrough",
)
transformer

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [33]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [96]:
# Build the final pipeline: vectorize the text, then classify with XGBoost
pipeline_steps = [
    ("transformers", transformer),
    ("Model", XGBClassifier()),
]
final = Pipeline(steps=pipeline_steps)
final

In [97]:
# Fit the vectorizer and the classifier on the training split.
final.fit(x_train, y=y_train)

In [98]:
# Predict labels for the held-out split.
y_pred = final.predict(X=x_test)

In [99]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix

# average='weighted' weights each class's score by its number of true samples.
accuracy = accuracy_score(y_test, y_pred)
precision, recall = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (precision_score, recall_score)
)
confusion_mat = confusion_matrix(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"Confusion Matrix:\n{confusion_mat}")


Accuracy: 0.5023097826086956
Precision: 0.6209203401550892
Recall: 0.5023097826086956
Confusion Matrix:
[[ 184 1122  284]
 [   9 2494   55]
 [  87 2106 1019]]


In [100]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out split.
report_text = classification_report(y_test, y_pred)
print(report_text)


              precision    recall  f1-score   support

           0       0.66      0.12      0.20      1590
           1       0.44      0.97      0.60      2558
           2       0.75      0.32      0.45      3212

    accuracy                           0.50      7360
   macro avg       0.61      0.47      0.42      7360
weighted avg       0.62      0.50      0.45      7360



In [31]:
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test, y_pred)

# Tick labels follow the encoded class order 0, 1, 2.
class_names = ['Negative', 'Neutral', 'Positive']

fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=class_names, yticklabels=class_names, ax=ax)
ax.set_xlabel('Predicted Labels')
ax.set_ylabel('True Labels')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# Label encoding: 0 = negative, 1 = neutral, 2 = positive

# Log all the things using mlflow

In [36]:
import io
from matplotlib.figure import Figure
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer

# Experiments
- In this stage we perform different experiments:
    * `TF-IDF` vs `BOW`
    * `unigram` vs `bigram` vs `trigram`
    * `Max Features`

In [47]:
# Set experiment name
def perform_experiment(vectorizers: list, n_grams: list):
    """Compare text vectorizers and n-gram ranges using logistic regression.

    One MLflow run is recorded in the "TF-IDF vs BOW" experiment for every
    (vectorizer, n-gram range) pair. Each run logs its parameters, the overall
    and per-class metrics, the fitted pipeline, a confusion-matrix image and a
    CSV snapshot of the dataset.

    Reads the notebook-level frames ``final_df`` (text + label) and ``df``
    (full dataset logged as an artifact).

    Args:
        vectorizers: Vectorizer names; each must be "TfidfVectorizer" or
            "CountVectorizer".
        n_grams: ``ngram_range`` tuples, e.g. ``(1, 2)``.

    Raises:
        ValueError: If a vectorizer name is not recognised (previously any
            unknown name silently fell back to CountVectorizer).
    """
    mlflow.set_experiment("TF-IDF vs BOW")

    vectorizer_classes = {
        "TfidfVectorizer": TfidfVectorizer,
        "CountVectorizer": CountVectorizer,
    }
    unknown = [vec for vec in vectorizers if vec not in vectorizer_classes]
    if unknown:
        raise ValueError(
            f"Unknown vectorizer(s) {unknown}; expected one of {sorted(vectorizer_classes)}"
        )

    # Build features and labels here instead of relying on the notebook globals
    # `feature` / `target`, which other cells rebind. Labels are encoded as
    # 0 = negative, 1 = neutral, 2 = positive.
    features = final_df[['final_comments']]
    labels = final_df['category'].map({-1: 0, 0: 1, 1: 2})
    class_ids = [0, 1, 2]
    class_names = ['Negative', 'Neutral', 'Positive']

    # Split the data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    # The dataset does not change between runs, so write the snapshot once.
    df.to_csv("reddit.csv", index=False)

    for vec in vectorizers:
        for n_gram in n_grams:
            transformer = ColumnTransformer(transformers=[
                ("Encode", vectorizer_classes[vec](ngram_range=n_gram), 'final_comments')
            ], remainder="passthrough")

            with mlflow.start_run(run_name=f"LR-{vec}-{n_gram}"):
                # Log the parameters
                mlflow.log_params({"Vectorizer": vec, "N-gram": str(n_gram)})

                # Vectorizer + model in one pipeline
                final = Pipeline(steps=[
                    ("transformers", transformer),
                    ("Model", LogisticRegression())
                ])
                final.fit(x_train, y_train)
                y_pred = final.predict(x_test)

                # Overall metrics
                mlflow.log_metric("Accuracy", accuracy_score(y_test, y_pred))
                mlflow.log_metric("Precision", precision_score(y_test, y_pred, average='weighted'))
                mlflow.log_metric("Recall", recall_score(y_test, y_pred, average='weighted'))

                # Per-class and averaged metrics from the classification report
                classification_rep = classification_report(y_test, y_pred, output_dict=True)
                for label, metrics in classification_rep.items():
                    if isinstance(metrics, dict):  # skips the scalar "accuracy" entry
                        for metric_name, metric_value in metrics.items():
                            mlflow.log_metric(f"{label} {metric_name}", metric_value)

                # Log the whole pipeline: the classifier alone cannot score raw
                # text because it needs the fitted vectorizer's vocabulary.
                mlflow.sklearn.log_model(final, "Model")

                # Confusion matrix; explicit labels keep it 3x3 so it always
                # lines up with the tick labels.
                confusion_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
                fig, ax = plt.subplots(figsize=(8, 6))
                sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues',
                            xticklabels=class_names, yticklabels=class_names, ax=ax)
                ax.set_xlabel('Predicted Labels')
                ax.set_ylabel('True Labels')
                ax.set_title('Confusion Matrix')

                # Save the plot to a file and log it
                fig.savefig("confusion_matrix.png")
                plt.close(fig)
                mlflow.log_artifact("confusion_matrix.png")

                # Log the data as an artifact
                mlflow.log_artifact("reddit.csv")

In [49]:
# Grid for experiment 1: both vectorizers x unigram / uni+bigram / uni+bi+trigram.
vectorizer = ['TfidfVectorizer', 'CountVectorizer']
n_grams = [(1, upper) for upper in (1, 2, 3)]
perform_experiment(vectorizer, n_grams)

# Observation
- With the `LR` model, the combination of `BOW` and `(1,1)` n-grams gives the best result.

# Experiment2 (max feature)

In [73]:
# Set experiment name
def perform_experiment2(max_feature:list):
    """Compare CountVectorizer vocabulary sizes using logistic regression.

    One MLflow run is recorded in the "CV-max_feature_experiment" experiment
    for every value in ``max_feature``. Each run logs its parameters, the
    overall and per-class metrics, the fitted pipeline, a confusion-matrix
    image and a CSV snapshot of the dataset.

    Reads the notebook-level frames ``final_df`` (text + label) and ``df``
    (full dataset logged as an artifact).

    Args:
        max_feature: Values to try for CountVectorizer's ``max_features``.
    """
    mlflow.set_experiment("CV-max_feature_experiment")

    # Separate feature and label. Labels are encoded as 0 = negative,
    # 1 = neutral, 2 = positive so per-class metric names match the first
    # experiment ("0 precision", ...) instead of using the raw -1/0/1 values.
    features = final_df[['final_comments']]
    labels = final_df['category'].map({-1: 0, 0: 1, 1: 2})
    class_ids = [0, 1, 2]
    class_names = ['Negative', 'Neutral', 'Positive']

    # Split the data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    # The dataset does not change between runs, so write the snapshot once.
    df.to_csv("reddit.csv", index=False)

    for fea in max_feature:
        transformer = ColumnTransformer(transformers=[
            ("Encode", CountVectorizer(ngram_range=(1, 1), max_features=fea), 'final_comments')
        ], remainder="passthrough")

        with mlflow.start_run(run_name=f"LR-(CV)-{fea}"):
            # Log the parameters
            mlflow.log_params({"Vectorizer": "CV", "Max-Feature": str(fea)})

            # Vectorizer + model in one pipeline
            final = Pipeline(steps=[
                ("transformers", transformer),
                ("Model", LogisticRegression())
            ])
            final.fit(x_train, y_train)
            y_pred = final.predict(x_test)

            # Overall metrics
            mlflow.log_metric("Accuracy", accuracy_score(y_test, y_pred))
            mlflow.log_metric("Precision", precision_score(y_test, y_pred, average='weighted'))
            mlflow.log_metric("Recall", recall_score(y_test, y_pred, average='weighted'))

            # Per-class and averaged metrics from the classification report
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, metrics in classification_rep.items():
                if isinstance(metrics, dict):  # skips the scalar "accuracy" entry
                    for metric_name, metric_value in metrics.items():
                        mlflow.log_metric(f"{label} {metric_name}", metric_value)

            # Log the whole pipeline: the classifier alone cannot score raw
            # text because it needs the fitted vectorizer's vocabulary.
            mlflow.sklearn.log_model(final, "Model")

            # Confusion matrix; explicit labels keep it 3x3 so it always
            # lines up with the tick labels.
            confusion_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_names, yticklabels=class_names, ax=ax)
            ax.set_xlabel('Predicted Labels')
            ax.set_ylabel('True Labels')
            ax.set_title('Confusion Matrix')

            # Save the plot to a file and log it
            fig.savefig("confusion_matrix.png")
            plt.close(fig)
            mlflow.log_artifact("confusion_matrix.png")

            # Log the data as an artifact
            mlflow.log_artifact("reddit.csv")

In [77]:
# Vocabulary sizes 1000, 2000, ..., 15000. A dedicated name is used so the
# notebook-level `feature` DataFrame is not overwritten by a list of ints.
max_feature_grid = list(range(1000, 16000, 1000))
perform_experiment2(max_feature=max_feature_grid)

2024/10/12 17:36:29 INFO mlflow.tracking._tracking_service.client: 🏃 View run LR-(CV)-1000 at: https://1052d14a61dff596b35e5ac72ec8bc5a57613415:@dagshub.com/Sami606713/youtube_comment_analysis.mlflow/#/experiments/2/runs/2045047e766c44c29618600beee5ebeb.
2024/10/12 17:36:29 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://1052d14a61dff596b35e5ac72ec8bc5a57613415:@dagshub.com/Sami606713/youtube_comment_analysis.mlflow/#/experiments/2.
2024/10/12 17:36:52 INFO mlflow.tracking._tracking_service.client: 🏃 View run LR-(CV)-2000 at: https://1052d14a61dff596b35e5ac72ec8bc5a57613415:@dagshub.com/Sami606713/youtube_comment_analysis.mlflow/#/experiments/2/runs/dd2c0815b8ee410a87b95df467b235db.
2024/10/12 17:36:52 INFO mlflow.tracking._tracking_service.client: 🧪 View experiment at: https://1052d14a61dff596b35e5ac72ec8bc5a57613415:@dagshub.com/Sami606713/youtube_comment_analysis.mlflow/#/experiments/2.
2024/10/12 17:37:09 INFO mlflow.tracking._tracking_service.client: 🏃

# Observation
- In this experiment we compared different `max_features` values.
    - `max_features=15000` gives the best result.

# Models Experiments
- In this experiment we will test different models

In [83]:
# Set experiment name
def models_experiment(models:dict):
    """Compare several classifiers on the same bag-of-words features.

    One MLflow run is recorded in the "Models Experiment" experiment for every
    entry of ``models``; the run is named after the dictionary key. Each run
    logs the model name, the overall and per-class metrics, the fitted
    pipeline, a confusion-matrix image and a CSV snapshot of the dataset.

    Reads the notebook-level frames ``final_df`` (text + label) and ``df``
    (full dataset logged as an artifact).

    Args:
        models: Mapping of a short model name to an unfitted scikit-learn
            compatible classifier. The estimators are fitted in place.
    """
    mlflow.set_experiment("Models Experiment")

    # Separate feature and label. Labels are encoded as 0 = negative,
    # 1 = neutral, 2 = positive: XGBClassifier rejects the raw -1/0/1 labels
    # because it expects classes numbered from 0.
    features = final_df[['final_comments']]
    labels = final_df['category'].map({-1: 0, 0: 1, 1: 2})
    class_ids = [0, 1, 2]
    class_names = ['Negative', 'Neutral', 'Positive']

    # Split the data into train and test set
    x_train, x_test, y_train, y_test = train_test_split(
        features, labels, test_size=0.2, random_state=42)

    # The dataset does not change between runs, so write the snapshot once.
    df.to_csv("reddit.csv", index=False)

    for model_name, model in models.items():
        transformer = ColumnTransformer(transformers=[
            ("Encode", CountVectorizer(ngram_range=(1, 1), max_features=15000), 'final_comments')
        ], remainder="passthrough")

        # Name the run after the dictionary key; the estimator repr can be a
        # long multi-line string.
        with mlflow.start_run(run_name=model_name):
            # Log the parameters
            mlflow.log_params({"model": model_name})

            # Vectorizer + model in one pipeline
            final = Pipeline(steps=[
                ("transformers", transformer),
                ("Model", model)
            ])
            final.fit(x_train, y_train)
            y_pred = final.predict(x_test)

            # Overall metrics
            mlflow.log_metric("Accuracy", accuracy_score(y_test, y_pred))
            mlflow.log_metric("Precision", precision_score(y_test, y_pred, average='weighted'))
            mlflow.log_metric("Recall", recall_score(y_test, y_pred, average='weighted'))

            # Per-class and averaged metrics from the classification report
            classification_rep = classification_report(y_test, y_pred, output_dict=True)
            for label, metrics in classification_rep.items():
                if isinstance(metrics, dict):  # skips the scalar "accuracy" entry
                    for metric_name, metric_value in metrics.items():
                        mlflow.log_metric(f"{label} {metric_name}", metric_value)

            # Log the whole pipeline: the classifier alone cannot score raw
            # text because it needs the fitted vectorizer's vocabulary.
            mlflow.sklearn.log_model(final, "Model")

            # Confusion matrix; explicit labels keep it 3x3 so it always
            # lines up with the tick labels.
            confusion_mat = confusion_matrix(y_test, y_pred, labels=class_ids)
            fig, ax = plt.subplots(figsize=(8, 6))
            sns.heatmap(confusion_mat, annot=True, fmt='d', cmap='Blues',
                        xticklabels=class_names, yticklabels=class_names, ax=ax)
            ax.set_xlabel('Predicted Labels')
            ax.set_ylabel('True Labels')
            ax.set_title('Confusion Matrix')

            # Save the plot to a file and log it
            fig.savefig("confusion_matrix.png")
            plt.close(fig)
            mlflow.log_artifact("confusion_matrix.png")

            # Log the data as an artifact
            mlflow.log_artifact("reddit.csv")

In [84]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,GradientBoostingClassifier,ExtraTreesClassifier

In [102]:
# Seed every estimator that uses randomness so repeated runs log comparable
# metrics. The keys are the model names logged to MLflow.
MODEL_SEED = 42
models={
    "LR":LogisticRegression(),
    "RF":RandomForestClassifier(random_state=MODEL_SEED),
    "AdaBoost":AdaBoostClassifier(random_state=MODEL_SEED),
    "GradientBoosting":GradientBoostingClassifier(random_state=MODEL_SEED),
    "ExtraTree":ExtraTreesClassifier(random_state=MODEL_SEED),
    "MultiNomial":MultinomialNB(),
    "Bernoulli":BernoulliNB(),
    "SVC":SVC(),
    "DecisionTree":DecisionTreeClassifier(random_state=MODEL_SEED),
    "xgb":XGBClassifier(random_state=MODEL_SEED)
}

models_experiment(models=models)

# Observation
- In this experiment we tested different models.
    - `Logistic Regression` gives the best result.