## 1. Setup and Environment

### 1.1 Install Dependencies

In [1]:
%pip install -q mlflow

Note: you may need to restart the kernel to use updated packages.




In [2]:
%pip install -q boto3 awscli

Note: you may need to restart the kernel to use updated packages.




### 1.2 Configure MLflow Tracking

In [3]:
import mlflow

# Set the remote tracking server URI.
# The address can be overridden with the MLFLOW_TRACKING_URI environment variable so the
# notebook does not have to be edited when the server moves; the original address is the default.
import os

mlflow.set_tracking_uri(
    os.environ.get(
        "MLFLOW_TRACKING_URI",
        "http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/",
    )
)

In [4]:
# Activate the baseline experiment; it is created on the server if it does not exist yet
mlflow.set_experiment(experiment_name="RF Baseline Model 1")

<Experiment: artifact_location='s3://mlfow-bucket-2025/424422613538872822', creation_time=1762870848089, experiment_id='424422613538872822', last_update_time=1762870848089, lifecycle_stage='active', name='RF Baseline Model 1', tags={'mlflow.experimentKind': 'custom_model_development'}>

## 2. Data Loading, Cleaning, and Preprocessing

### 2.1 Import Libraries (Data & NLP)

In [5]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [6]:
# Download necessary NLTK data (both corpora are required by preprocess_comment below)
nltk.download('stopwords')  # word lists read via stopwords.words('english')
nltk.download('wordnet')  # corpus backing WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### 2.2 Load Raw Data

In [7]:
# Raw Reddit comments (columns: clean_comment, category) fetched directly from GitHub
raw_data_url = 'https://raw.githubusercontent.com/Himanshu-1703/reddit-sentiment-analysis/refs/heads/main/data/reddit.csv'
df = pd.read_csv(raw_data_url)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


### 2.3 Initial Data Cleaning

In [8]:
# Discard every row that has a missing value in any column
df = df.dropna()

In [9]:
# Per-column count of missing values; every count should be 0 after the dropna above
print(df.isnull().sum())

clean_comment    0
category         0
dtype: int64


In [10]:
# Keep only the first occurrence of each fully identical row
df = df.drop_duplicates()

In [11]:
# Remove rows whose comment is empty or contains only whitespace
has_text = df['clean_comment'].str.strip() != ''
df = df[has_text]

### 2.4 Define and Apply Text Preprocessing Function

In [12]:
# Define the preprocessing function
# Stopwords that are deliberately kept because they carry sentiment (negation / contrast)
_SENTIMENT_STOPWORDS_TO_KEEP = frozenset({'not', 'but', 'however', 'no', 'yet'})

# (stop_words, lemmatizer) pair, built on first use and then shared by every call
_preprocess_resources = None


def _get_preprocess_resources():
    """Return the shared ``(stop_words, lemmatizer)`` pair, building it on first use."""
    global _preprocess_resources
    if _preprocess_resources is None:
        stop_words = set(stopwords.words('english')) - _SENTIMENT_STOPWORDS_TO_KEEP
        _preprocess_resources = (stop_words, WordNetLemmatizer())
    return _preprocess_resources


def preprocess_comment(comment):
    """Normalize a single comment for Bag-of-Words sentiment modelling.

    Steps, in order: lowercase, strip surrounding whitespace, replace newlines
    with spaces, drop every character that is not a letter, digit, whitespace
    or one of ``! ? . ,``, remove English stopwords (except 'not', 'but',
    'however', 'no' and 'yet'), and lemmatize the remaining tokens.

    Parameters
    ----------
    comment : str
        Raw comment text.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; an empty string when no
        token survives.
    """
    # The stop-word set and lemmatizer do not depend on the comment, so they are
    # built once instead of on every call (this function runs once per row).
    stop_words, lemmatizer = _get_preprocess_resources()

    # Lowercase and trim, then flatten newlines
    comment = comment.lower().strip()
    comment = re.sub(r'\n', ' ', comment)

    # Remove non-alphanumeric characters, except basic punctuation
    comment = re.sub(r'[^A-Za-z0-9\s!?.,]', '', comment)

    # Filter stopwords and lemmatize in a single pass over the tokens
    return ' '.join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

In [13]:
# Run every comment through preprocess_comment and overwrite the column with the cleaned text
cleaned_comments = df['clean_comment'].map(preprocess_comment)
df['clean_comment'] = cleaned_comments

In [14]:
# Preview the first rows to confirm the comments were rewritten by the preprocessing step
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


## 3. Feature Engineering and Data Split

### 3.1 Vectorization using CountVectorizer (Bag of Words)

In [15]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# Bag-of-Words (CountVectorizer) with the vocabulary capped at 10000 features
vectorizer = CountVectorizer(
    max_features=10000,
)

In [16]:
# Fit the vocabulary and build the document-term count matrix.
# The result is kept as a SciPy sparse matrix: densifying 36,793 x 10,000 int64 counts with
# .toarray() needs roughly 3 GB, while train_test_split and RandomForestClassifier both
# accept sparse input directly.
# NOTE(review): the vocabulary is fitted on all rows before the train/test split, so words that
# occur only in test rows still shape the feature space; fit on the training split alone if a
# strict hold-out estimate is required.
X = vectorizer.fit_transform(df['clean_comment'])
y = df['category']

In [17]:
# (number of comments, vocabulary size)
X.shape

(36793, 10000)

In [18]:
# Must have the same number of rows as X
y.shape

(36793,)

In [19]:
# Sanity check: per-column count of missing values in the preprocessed frame
print(df.isnull().sum())

clean_comment    0
category         0
dtype: int64


### 3.2 Train/Test Split

In [20]:
# Hold out 20% of the rows for testing; stratifying on y keeps the class proportions the same
# in both splits, and the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

## 4. Model Training and MLflow Logging

In [21]:
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
import os
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Define and train a Random Forest baseline model and log results to MLflow

# Hyperparameters are named once so the values logged to MLflow always match the fitted model
n_estimators = 200
max_depth = 15
random_state = 42


def _log_artifact_best_effort(path, label):
    """Upload ``path`` to the active MLflow run without aborting the run on failure.

    A missing file or a failed upload is reported with a printed message so that the
    remaining logging steps of the run still execute.
    """
    if not os.path.exists(path):
        print(f"Error: {label} file not found at {path}")
        return
    try:
        mlflow.log_artifact(path)
    except Exception as e:
        print(f"Warning: failed to log {label} artifact: {e}")


with mlflow.start_run(run_name="RandomForest_Baseline_TrainTestSplit") as run:
    # Tags describing the run
    mlflow.set_tag("experiment_type", "baseline")
    mlflow.set_tag("model_type", "RandomForestClassifier")
    mlflow.set_tag("description", "Baseline RandomForest model for sentiment analysis using Bag of Words (BoW) with a simple train-test split")

    # Log parameters for the vectorizer
    mlflow.log_param("vectorizer_type", "CountVectorizer")
    mlflow.log_param("vectorizer_max_features", vectorizer.max_features)

    # Log Random Forest parameters (including the seed, so the run can be reproduced)
    mlflow.log_param("n_estimators", n_estimators)
    mlflow.log_param("max_depth", max_depth)
    mlflow.log_param("random_state", random_state)

    # Initialize and train the model
    model = RandomForestClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        random_state=random_state,
    )
    model.fit(X_train, y_train)

    # Make predictions on the test set
    y_pred = model.predict(X_test)

    # Calculate and log metrics
    accuracy = accuracy_score(y_test, y_pred)
    mlflow.log_metric("accuracy", accuracy)

    classification_rep = classification_report(y_test, y_pred, output_dict=True)

    # Per-class and averaged entries are dicts; the top-level "accuracy" entry is a
    # plain float and was already logged above, so it is skipped here.
    for label, metrics in classification_rep.items():
        if isinstance(metrics, dict):
            for metric, value in metrics.items():
                mlflow.log_metric(f"{label}_{metric}", value)

    # Confusion matrix plot. The class labels are passed explicitly so the rows/columns are in
    # a known order and the axes show the real class values (-1, 0, 1) instead of the
    # positional indices 0, 1, 2.
    class_labels = model.classes_
    conf_matrix = confusion_matrix(y_test, y_pred, labels=class_labels)
    fig, ax = plt.subplots(figsize=(8, 6))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt="d",
        cmap="Blues",
        xticklabels=class_labels,
        yticklabels=class_labels,
        ax=ax,
    )
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title("Confusion Matrix")

    # Save and log the confusion matrix plot (use absolute path to avoid cwd issues)
    confusion_path = os.path.abspath("confusion_matrix.png")
    fig.savefig(confusion_path)
    plt.close(fig)
    _log_artifact_best_effort(confusion_path, "confusion matrix")

    # Log the Random Forest model (best effort: a failed upload must not lose the metrics above)
    try:
        mlflow.sklearn.log_model(model, name="random_forest_model")
    except Exception as e:
        print(f"Warning: failed to log model: {e}")

    # Log the dataset artifact
    dataset_path = os.path.abspath("dataset.csv")
    df.to_csv(dataset_path, index=False)
    _log_artifact_best_effort(dataset_path, "dataset")

# Display final accuracy
print(f"Accuracy: {accuracy}")




🏃 View run RandomForest_Baseline_TrainTestSplit at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/424422613538872822/runs/b660663da9794d23bb503b4eac640d7f
🧪 View experiment at: http://ec2-54-211-18-166.compute-1.amazonaws.com:5000/#/experiments/424422613538872822
Accuracy: 0.6511754314444897


## 5. Evaluation and Artifacts

### 5.1 Classification Report

In [23]:
# Per-class precision / recall / F1 on the held-out test set, using the predictions from the run above
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

          -1       1.00      0.01      0.01      1650
           0       0.68      0.82      0.75      2555
           1       0.63      0.85      0.72      3154

    accuracy                           0.65      7359
   macro avg       0.77      0.56      0.49      7359
weighted avg       0.73      0.65      0.57      7359



### 5.2 Save Preprocessed Data for Next Experiments

In [24]:
# Per-column count of missing values in the frame that is about to be exported
print(df.isnull().sum())

clean_comment    0
category         0
dtype: int64


In [25]:
# (rows, columns) of the frame that is about to be exported
df.shape

(36793, 2)

In [30]:
# Save the cleaned and preprocessed DataFrame to a new CSV file.
# Preprocessing can reduce a comment to an empty string (for example when it contained only
# stopwords). to_csv writes that as an empty field, which read_csv parses back as NaN by
# default, so such rows are excluded from the export to keep the saved file free of missing
# comments. The in-memory df is left unchanged.
has_text_after_cleaning = df['clean_comment'].str.strip() != ''
df[has_text_after_cleaning].to_csv('../data/reddit_preprocessing.csv', index=False)

In [31]:
# Read the exported CSV back and preview its first rows to confirm the write succeeded
saved_preview = pd.read_csv('../data/reddit_preprocessing.csv')
saved_preview.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [32]:
# (rows, columns) of the in-memory frame after the export
df.shape

(36793, 2)

In [33]:
# Final per-column count of missing values in the in-memory frame
print(df.isnull().sum())

clean_comment    0
category         0
dtype: int64
