In [None]:
!pip install mlflow



In [None]:
!pip install awscli

Collecting awscli
  Using cached awscli-1.44.34-py3-none-any.whl.metadata (11 kB)
Collecting botocore==1.42.44 (from awscli)
  Downloading botocore-1.42.44-py3-none-any.whl.metadata (5.9 kB)
Collecting docutils<=0.19,>=0.18.1 (from awscli)
  Downloading docutils-0.19-py3-none-any.whl.metadata (2.7 kB)
Collecting s3transfer<0.17.0,>=0.16.0 (from awscli)
  Downloading s3transfer-0.16.0-py3-none-any.whl.metadata (1.7 kB)
Collecting colorama<0.4.7,>=0.2.5 (from awscli)
  Downloading colorama-0.4.6-py2.py3-none-any.whl.metadata (17 kB)
Collecting rsa<4.8,>=3.1.2 (from awscli)
  Downloading rsa-4.7.2-py3-none-any.whl.metadata (3.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from botocore==1.42.44->awscli)
  Downloading jmespath-1.1.0-py3-none-any.whl.metadata (7.6 kB)
Downloading awscli-1.44.34-py3-none-any.whl (4.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.6/4.6 MB[0m [31m34.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.42.44-py3-none-any.whl (14.6 MB)


In [None]:
!pip install boto3

Collecting boto3
  Downloading boto3-1.42.44-py3-none-any.whl.metadata (6.8 kB)
Downloading boto3-1.42.44-py3-none-any.whl (140 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.6/140.6 kB[0m [31m3.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: boto3
Successfully installed boto3-1.42.44


In [None]:
# 1. Install MLflow
!pip install mlflow --quiet

import mlflow
from google.colab import output

# 2. Log your experiment
# Note: In Colab, it's safer to log to local files ("file:./mlruns") rather than
# starting a background HTTP server just to log data. The UI will read these files later.
mlflow.set_tracking_uri("file:./mlruns")

with mlflow.start_run():
    mlflow.log_param("param1", 15)
    mlflow.log_metric("metric1", 0.3)
    print("Run completed and logged locally.")

# 3. Run the MLflow UI in the background
# We run this as a system command with '&' to keep it running in the background
get_ipython().system_raw("mlflow ui --port 5000 &")

# 4. Expose the UI through Google Colab's reverse proxy
print("Click the link below to view the MLflow UI:")
output.serve_kernel_port_as_window(5001)

Run completed and logged locally.
Click the link below to view the MLflow UI:
Try `serve_kernel_port_as_iframe` instead. [0m


<IPython.core.display.Javascript object>

In [None]:
import numpy as np
import pandas as pd
# Location of the raw CSV; kept in one place so it is easy to swap.
DATA_URL = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'

# Load the dataset and preview the first rows.
df = pd.read_csv(DATA_URL)
df.head()

In [None]:
# Discard every row that has a missing value in any column (df is modified in place).
df.dropna(axis=0, how="any", inplace=True)

In [None]:
# Keep only the first occurrence of each fully identical row (df is modified in place).
df.drop_duplicates(keep="first", inplace=True)

In [None]:
# Drop rows whose comment is empty or consists only of whitespace.
df = df.loc[df["clean_comment"].str.strip() != ""]

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [None]:
# Fetch the NLTK resources requested by this notebook: the stop-word lists and WordNet.
for resource in ('stopwords', 'wordnet'):
  nltk.download(resource)

In [None]:
# Stop words that are deliberately kept in the text instead of being filtered out.
_RETAINED_STOPWORDS = {'not','no','but','however','yet'}

# Lazily built resources shared by every preprocess_comment call.
_preprocess_cache = {}


def _get_stop_words_and_lemmatizer():
  """Return the (stop_words, lemmatizer) pair, building it on first use.

  Building is deferred to the first call so that defining this cell does not
  require the NLTK data to be present yet. Both objects are created before
  either is cached, so a failed build is simply retried on the next call.
  """
  if not _preprocess_cache:
    stop_words = set(stopwords.words('english')) - _RETAINED_STOPWORDS
    lemmatizer = WordNetLemmatizer()
    _preprocess_cache['stop_words'] = stop_words
    _preprocess_cache['lemmatizer'] = lemmatizer
  return _preprocess_cache['stop_words'], _preprocess_cache['lemmatizer']


def preprocess_comment(comment):
  """Normalize a single comment string for vectorization.

  Steps, in order:
    1. lowercase and strip leading/trailing whitespace;
    2. replace newline characters with spaces;
    3. delete every character that is not a letter, digit, whitespace or
       one of ``! ? . ,``;
    4. drop English stop words, except the retained ones
       (not, no, but, however, yet);
    5. lemmatize each remaining word with WordNetLemmatizer.

  Args:
    comment: the raw comment text (a ``str``).

  Returns:
    The cleaned comment as a single space-separated string.
  """
  # The stop-word set and the lemmatizer do not depend on the comment, so they
  # are built once and reused rather than being recreated for every row.
  stop_words, lemmatizer = _get_stop_words_and_lemmatizer()

  # Convert to lowercase and remove leading/trailing whitespace
  comment = comment.lower().strip()

  # Replace newline characters with spaces
  comment = re.sub('\n', ' ', comment)

  # Remove non-alphanumeric characters except basic punctuation
  comment = re.sub(r'[^a-zA-Z0-9\s!?.,]', '', comment)

  # Remove stop words (keeping the retained ones) and lemmatize what is left
  return ' '.join(
      lemmatizer.lemmatize(word)
      for word in comment.split()
      if word not in stop_words
  )

In [None]:
# Run every comment through preprocess_comment and overwrite the column with the cleaned text
df['clean_comment'] = df['clean_comment'].map(preprocess_comment)

In [None]:
# Preview the first five rows after preprocessing.
df.head(5)

In [None]:
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, recall_score, precision_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_predict, StratifiedKFold
import seaborn as sns
import matplotlib.pyplot as plt

In [None]:
# Step 1: bag-of-words vectorizer, with the vocabulary capped at 10,000 features
vectorizer = CountVectorizer(max_features=10_000)

In [None]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_comment'],
    df['category'],
    test_size=0.2,
    random_state=42,
)

# The vocabulary is learned from the training split only; the test split is
# transformed with that same fitted vectorizer.
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [None]:
# Make "RF Baseline" the active experiment (set it, or create it if needed)
mlflow.set_experiment(experiment_name="RF Baseline")

In [None]:
with mlflow.start_run() as run:
  # Tags that name and categorize this run
  mlflow.set_tag("mlflow.runName", "RandomForest_Baseline_TrainTestSplit")
  mlflow.set_tag('experiment_type','baseline')
  mlflow.set_tag('model_type','RandomForestClassifier')

  # Add a description
  mlflow.set_tag("description", "Baseline RandomForest model for sentiment analysis using Bag of Words (BoW) with a simple train-test split")

  # Log parameters for the vectorizer
  # (key spelled "vectorizer_..." so it matches "vectorizer_type")
  mlflow.log_param("vectorizer_type",'CountVectorizer')
  mlflow.log_param("vectorizer_max_features",vectorizer.max_features)

  # Log Random Forest parameters
  n_estimators = 200
  max_depth = 15

  mlflow.log_param("n_estimators", n_estimators)
  mlflow.log_param("max_depth", max_depth)

  # Initialize and train the model
  model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
  model.fit(X_train_vec, y_train)

  # Make predictions on the test set
  y_pred = model.predict(X_test_vec)

  # Log overall accuracy
  accuracy = accuracy_score(y_test, y_pred)
  mlflow.log_metric("accuracy", accuracy)

  # Log every per-label / per-average metric from the classification report.
  # Entries whose value is not a dict (such as the scalar accuracy) are skipped.
  classification_rep = classification_report(y_test, y_pred, output_dict=True)

  for label, metrics in classification_rep.items():
    if isinstance(metrics, dict):
      for metric_name, metric_value in metrics.items():
        mlflow.log_metric(f"{label}_{metric_name}",metric_value)

  # Confusion matrix plot
  conf_matrix = confusion_matrix(y_test, y_pred)
  plt.figure(figsize=(8, 6))
  sns.heatmap(conf_matrix, annot=True, fmt="d", cmap="Blues")
  plt.xlabel("Predicted")
  plt.ylabel("Actual")
  plt.title("Confusion Matrix")

  # Save and log the confusion matrix plot.
  # The same path is used for saving and for logging, so this does not depend
  # on the working directory being /content.
  conf_matrix_path = "confusion_matrix.png"
  plt.savefig(conf_matrix_path)
  mlflow.log_artifact(conf_matrix_path)

  # Log the Random Forest model
  mlflow.sklearn.log_model(model, "random_forest_model")

  # Optionally log the dataset itself (if it's small enough);
  # again saved and logged through one shared path.
  dataset_path = "dataset.csv"
  df.to_csv(dataset_path, index=False)
  mlflow.log_artifact(dataset_path)

# Display final accuracy
print(f"Accuracy: {accuracy}")

In [None]:
# Text classification report for the held-out test set.
print(classification_report(y_true=y_test, y_pred=y_pred))