In [7]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
!pip install pyspark mlflow
import kagglehub
from pyspark.ml import Pipeline
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
import pyspark.sql.functions as F


Collecting dagshub
  Downloading dagshub-0.6.3-py3-none-any.whl.metadata (12 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-4.0.0-py3-none-any.whl.metadata (10 kB)
Collecting dataclasses-json (from dagshub)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.8.0-py3-none-any.whl.metadata (3.3 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.3.1-py3-none-any.whl.metadata (12 kB)
Collecting boto3 (from dagshub)
  Downloading boto3-1.40.38-py3-none-any.whl.metadata (6.7 kB)
Collecting semver (from dagshub)
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collecting dagshub-annotation-converter>=0.1.12 (from dagshub)
  Downloading dagshub_annota

In [2]:


# Download latest version
path = kagglehub.dataset_download("bittlingmayer/amazonreviews")

print("Path to dataset files:", path)

Using Colab cache for faster access to the 'amazonreviews' dataset.
Path to dataset files: /kaggle/input/amazonreviews


In [3]:
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import mlflow

# Initialize Spark
spark = SparkSession.builder.appName("AmazonSentiment").getOrCreate()

# Load and parse the data (assuming file is at /content/train.ft.txt.bz2)
df = spark.read.option("delimiter", " ").csv("/kaggle/input/amazonreviews/train.ft.txt.bz2")
df = df.withColumn("label", F.when(F.col("_c0") == "__label__2", 1).otherwise(0)) \
       .withColumn("text", F.col("_c1")) \
       .select("label", "text")

df.show(5, truncate=50)

+-----+---------+
|label|     text|
+-----+---------+
|    1|  Stuning|
|    1|      The|
|    1|Amazing!:|
|    1|Excellent|
|    1|Remember,|
+-----+---------+
only showing top 5 rows



In [None]:


# Using the full dataset
(train_data, test_data) = df.randomSplit([0.8, 0.2], seed=42)

# --- SOLUTION 1: CACHE THE DATA ---
train_data.cache()
test_data.cache()

# Define pipeline stages
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
hashingTF = HashingTF(inputCol="filtered_words", outputCol="rawFeatures", numFeatures=20000)
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(featuresCol="features", labelCol="label")

# Create the full pipeline
full_pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, idf, lr])

with mlflow.start_run():
    mlflow.log_param("regParam", lr.getRegParam())
    mlflow.log_param("maxIter", lr.getMaxIter())

    # Train the model
    print("Fitting the model...")
    model = full_pipeline.fit(train_data)

    # Make predictions
    print("Transforming test data...")
    predictions = model.transform(test_data)

    # --- SOLUTION 1: CACHE PREDICTIONS ---
    predictions.cache()
    print("Predictions cached.")

    # --- LOG METRICS ---
    print("Evaluating metrics...")
    evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
    auc = evaluator.evaluate(predictions, {evaluator.metricName: "areaUnderROC"})

    # --- SOLUTION 2: EFFICIENT ACCURACY CALCULATION ---
    # Cast boolean to integer (true=1, false=0) and take the average
    accuracy_df = predictions.withColumn('correct', F.when(F.col('label') == F.col('prediction'), 1).otherwise(0))
    accuracy = accuracy_df.select(F.avg('correct')).first()[0]

    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("Accuracy", accuracy)

    # Unpersist the cached dataframes to free up memory
    train_data.unpersist()
    test_data.unpersist()
    predictions.unpersist()

    # --- LOG MODEL ---
    print("Logging model...")
    mlflow.spark.log_model(model, "spark-lr-model")

    print(f"Model logged! AUC: {auc:.4f}, Accuracy: {accuracy:.4f}")

In [None]:
import mlflow
from pyspark.ml import Pipeline
from pyspark.ml.feature import (
    Tokenizer, StopWordsRemover, HashingTF, IDF, NGram, VectorAssembler
)
# Make sure to import the model you are actually using
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# --- Use a small data sample for testing ---
# Ensure df and spark are defined from your earlier cells
df_sampled = df.sample(fraction=0.01, seed=42)
(train_data, test_data) = df_sampled.randomSplit([0.8, 0.2], seed=42)

# --- 1. Define Feature Engineering Stages ---
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
ngram = NGram(n=2, inputCol="filtered_words", outputCol="ngrams")
hashingTF_words = HashingTF(inputCol="filtered_words", outputCol="hashed_words", numFeatures=5000)
hashingTF_ngrams = HashingTF(inputCol="ngrams", outputCol="hashed_ngrams", numFeatures=5000)
assembler = VectorAssembler(inputCols=["hashed_words", "hashed_ngrams"], outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")

# --- 2. Define the Model (use the fast one for testing) ---
lr = LogisticRegression(featuresCol="features", labelCol="label")

# --- 3. Assemble the Full Pipeline (define it only ONCE) ---
pipeline = Pipeline(stages=[
    tokenizer, remover, ngram,
    hashingTF_words, hashingTF_ngrams,
    assembler, idf,
    lr  # <-- Use the faster model in the pipeline
])

# --- 4. Set up Hyperparameter Tuning ---
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.01])  # Add a second option to see tuning work
             .addGrid(lr.maxIter, [10])
             .build())

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label", metricName="areaUnderROC")

crossval = CrossValidator(estimator=pipeline,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=2)

# --- 5. Run Training and Log with MLflow ---
with mlflow.start_run():
    print("Starting cross-validation...")
    cvModel = crossval.fit(train_data)
    print("Cross-validation complete.")

    best_model = cvModel.bestModel
    predictions = best_model.transform(test_data)

    auc = evaluator.evaluate(predictions)
    accuracy = predictions.filter(predictions.label == predictions.prediction).count() / float(predictions.count())

    print(f"Best Model Found! AUC: {auc:.4f}, Accuracy: {accuracy:.4f}")

    # --- FIX: Log the CORRECT parameters for the LogisticRegression model ---
    best_lr_model = best_model.stages[-1]
    mlflow.log_param("best_regParam", best_lr_model.getRegParam())
    mlflow.log_param("best_maxIter", best_lr_model.getMaxIter())

    mlflow.log_metric("AUC", auc)
    mlflow.log_metric("Accuracy", accuracy)

    input_example = test_data.limit(5).toPandas()
    mlflow.spark.log_model(
        best_model,
        "spark-lr-cv-model",
        input_example=input_example
    )

In [None]:
# # Install the pyngrok library
# !pip install pyngrok

# # Kill any existing mlflow and ngrok processes to start fresh
# !killall mlflow
# !killall ngrok

# from pyngrok import ngrok
# import os

# # Terminate open tunnels if any exist
# ngrok.kill()

# # Set your ngrok authtoken (optional but recommended, get one from ngrok.com)
# ngrok.set_auth_token("338tbSnswQ3CUcZHfDrk8s0z4ra_7me8sxm24gMs6nV73Txon")

# # Set the MLFLOW_TRACKING_URI so MLflow knows where to store experiments
# # This will create an mlruns directory in your Colab environment
# os.environ["MLFLOW_TRACKING_URI"] = "mlruns"

# # Run MLflow UI in the background
# get_ipython().system_raw("mlflow ui --port 5000 &")

# # Create a public URL to the local port 5000
# public_url = ngrok.connect(5000)
# print("✅ MLflow UI is running. Access it at:")
# print(public_url)

In [None]:
# import mlflow
# import tensorflow as tf
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TextVectorization, Dropout
# from tensorflow.keras.callbacks import EarlyStopping
# import numpy as np
# import re
# import string
# # --- NLTK Imports for Lemmatization ---
# import nltk
# from nltk.stem import WordNetLemmatizer
# from nltk.tokenize import word_tokenize

# # --- Download NLTK data (only needs to be done once) ---
# nltk.download('punkt')
# nltk.download('wordnet')

# # --- Connect to DAGsHub (ensure this is configured) ---
# import dagshub
# dagshub.init(repo_owner='Praproop14_35', repo_name='sentiment-analysis', mlflow=True) # I corrected your username based on previous screenshots
# mlflow.set_experiment("Optimized-BiLSTM-with-Lemma")
# mlflow.tensorflow.autolog()

# # --- 1. Upgraded Text Cleaning with Lemmatization ---
# lemmatizer = WordNetLemmatizer()

# def clean_text_and_lemmatize(text):
#     text = text.lower()
#     text = re.sub(r'<.*?>', '', text)
#     text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)
#     text = re.sub(r'\d+', '', text)
#     text = re.sub(r'\s+', ' ', text).strip()

#     # --- NEW: Tokenize and Lemmatize ---
#     tokens = word_tokenize(text)
#     lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]
#     return " ".join(lemmatized_tokens)

# # Prepare data and apply the new cleaning function
# df_sampled = df.sample(fraction=0.02, seed=42)
# train_pdf = df_sampled.toPandas()
# train_pdf['text'] = train_pdf['text'].apply(clean_text_and_lemmatize)

# X_train = train_pdf['text'].to_numpy()
# y_train = train_pdf['label'].to_numpy()

# # --- 2. Build the Keras Model (architecture is the same) ---
# max_features = 10000
# sequence_length = 250
# vectorize_layer = TextVectorization(
#     max_tokens=max_features,
#     output_mode='int',
#     output_sequence_length=sequence_length)
# vectorize_layer.adapt(X_train)

# model = Sequential([
#     vectorize_layer,
#     Embedding(max_features + 1, 256),
#     Bidirectional(LSTM(128, return_sequences=True)),
#     Dropout(0.3),
#     Bidirectional(LSTM(64)),
#     Dropout(0.3),
#     Dense(64, activation='relu'),
#     Dense(1, activation='sigmoid')
# ])

# model.compile(optimizer='adam',
#               loss='binary_crossentropy',
#               metrics=['accuracy'])

# # --- 3. Refine the Training Process ---
# early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# with mlflow.start_run():
#     print("Starting optimized BiLSTM model training with lemmatization...")

#     model.fit(X_train, y_train,
#               epochs=15,
#               validation_split=0.2,
#               batch_size=32,
#               callbacks=[early_stopping])

#     print("\nTraining complete.")

In [21]:


import os
import mlflow
import mlflow

# Set the tracking URI to Databricks
mlflow.set_tracking_uri("databricks")

# Set the experiment by its path
mlflow.set_experiment("/Users/anandpraroop@gmail.com/BiLSTM-Colab-Experiment")





MlflowException: Reading Databricks credential configuration failed with MLflow tracking URI 'databricks'. Please ensure that the 'databricks-sdk' PyPI library is installed, the tracking URI is set correctly, and Databricks authentication is properly configured. The tracking URI can be either 'databricks' (using profile name specified by 'DATABRICKS_CONFIG_PROFILE' environment variable or using 'DEFAULT' authentication profile if 'DATABRICKS_CONFIG_PROFILE' environment variable does not exist) or 'databricks://{profile}'. You can configure Databricks authentication in several ways, for example by specifying environment variables (e.g. DATABRICKS_HOST + DATABRICKS_TOKEN) or logging in using 'databricks auth login'. 
For details on configuring Databricks authentication, please refer to 'https://docs.databricks.com/en/dev-tools/auth/index.html#unified-auth'.

In [6]:
# Google Colab Notebook
# This notebook demonstrates how to train a Bidirectional LSTM model and track
# the results in a Databricks MLflow server from a Google Colab environment.

# COMMAND ----------
# Cell 1: Install Libraries and Setup
# Install necessary libraries for Kaggle data access, MLflow, and Databricks integration.

!pip install -q kagglehub databricks-cli mlflow tensorflow
print("Libraries installed.")

# COMMAND ----------
# Cell 2: Setup Local MLflow Tracking
#
# This cell configures MLflow to log runs to the local file system.
# No Databricks host or token is needed.

import mlflow
import os
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, TextVectorization
import numpy as np
from google.colab import userdata

# MLflow will now log all runs and artifacts to a local directory called 'mlruns'.
# The tracking URI is explicitly set to the local file system to make this clear.
mlflow.set_tracking_uri("file:///mlruns")

# Set a unique experiment name for your local runs.
mlflow.set_experiment("BiLSTM-Local-Training")
print(f"Successfully connected to MLflow tracking URI: {mlflow.get_tracking_uri()}")
print(f"MLflow experiment set to: BiLSTM-Local-Training")

# COMMAND ----------
# Cell 3: Data Loading from Kaggle
#
# This cell uses kagglehub to directly download the data.

import kagglehub
import pandas as pd
import bz2
import io

print("Downloading data from Kaggle...")

# Download the dataset
path = kagglehub.dataset_download("bittlingmayer/amazonreviews")
file_path = os.path.join(path, "train.ft.txt.bz2")

# Read the bzip2 compressed file into a pandas DataFrame
with bz2.open(file_path, "rt", encoding="utf-8") as bz2_file:
    df_raw = pd.read_csv(io.StringIO(bz2_file.read()), sep=" ", header=None)

# Pre-process the raw data and clean column names
df_raw.columns = ['label', 'text']
df_raw['label'] = df_raw['label'].str.replace('__label__', '').astype(int) - 1
df_raw['text'] = df_raw['text'].fillna('')
df_raw = df_raw[['label', 'text']]

print("Data loaded successfully.")
print("Sample of data:")
print(df_raw.head())

# Sample the data to reduce training time for a quick test
df_sampled = df_raw.sample(frac=0.02, random_state=42)
X_train = df_sampled['text'].to_numpy()
y_train = df_sampled['label'].to_numpy()

print(f"Sampled {len(X_train)} records for training.")

# COMMAND ----------
# Cell 4: Build the Keras Model

# Define the text vectorization layer parameters
max_features = 10000
sequence_length = 250

# Initialize and adapt the vectorization layer
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)
vectorize_layer.adapt(X_train)

# Build the Sequential model
model = Sequential([
    vectorize_layer,
    Embedding(max_features + 1, 128),
    Bidirectional(LSTM(64)),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.summary()

# COMMAND ----------
# Cell 5: Train and Manually Log Metrics to MLflow

with mlflow.start_run(run_name="local_manual_logging_run") as run:
    print("Starting BiLSTM model training with manual logging...")

    # Train the model and get the training history
    history = model.fit(X_train, y_train,
                        epochs=3,
                        validation_split=0.2,
                        batch_size=32)

    # Manually log metrics from the history object for each epoch
    for i in range(len(history.history['accuracy'])):
        mlflow.log_metric("accuracy", history.history['accuracy'][i], step=i)
        mlflow.log_metric("loss", history.history['loss'][i], step=i)
        mlflow.log_metric("val_accuracy", history.history['val_accuracy'][i], step=i)
        mlflow.log_metric("val_loss", history.history['val_loss'][i], step=i)

    # Log the final trained model
    mlflow.keras.log_model(model, "bilstm-model")

    print("\nTraining complete and manually logged to local MLflow.")

# COMMAND ----------
# Cell 6: View the MLflow UI
#
# After this cell finishes, you need to open a terminal in Colab and run
# the following command to start the MLflow UI.
#
# !mlflow ui --backend-store-uri file:///mlruns

print("To view the MLflow UI, run `!mlflow ui` in a new Colab cell or terminal.")
print("You will need to use a tool like ngrok to expose the web page publicly, if needed.")


Libraries installed.


2025/09/25 11:12:10 INFO mlflow.tracking.fluent: Experiment with name 'BiLSTM-Local-Training' does not exist. Creating a new experiment.


Successfully connected to MLflow tracking URI: file:///mlruns
MLflow experiment set to: BiLSTM-Local-Training
Downloading data from Kaggle...
Using Colab cache for faster access to the 'amazonreviews' dataset.


ParserError: Error tokenizing data. C error: Expected 81 fields in line 2, saw 98
