In [4]:
import dagshub
import mlflow
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

# Load Dataset

In [2]:
!pip install dagshub

Collecting dagshub
  Downloading dagshub-0.5.9-py3-none-any.whl.metadata (12 kB)
Collecting appdirs>=1.4.4 (from dagshub)
  Downloading appdirs-1.4.4-py2.py3-none-any.whl.metadata (9.0 kB)
Collecting dacite~=1.6.0 (from dagshub)
  Downloading dacite-1.6.0-py3-none-any.whl.metadata (14 kB)
Collecting gql[requests] (from dagshub)
  Downloading gql-3.5.2-py2.py3-none-any.whl.metadata (9.4 kB)
Collecting dataclasses-json (from dagshub)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting treelib>=1.6.4 (from dagshub)
  Downloading treelib-1.7.1-py3-none-any.whl.metadata (1.4 kB)
Collecting pathvalidate>=3.0.0 (from dagshub)
  Downloading pathvalidate-3.2.3-py3-none-any.whl.metadata (12 kB)
Collecting boto3 (from dagshub)
  Downloading boto3-1.37.34-py3-none-any.whl.metadata (6.7 kB)
Collecting semver (from dagshub)
  Downloading semver-3.0.4-py3-none-any.whl.metadata (6.8 kB)
Collecting dagshub-annotation-converter>=0.1.5 (from dagshub)
  Downloading dagshub_an

In [3]:
!pip install mlflow

Collecting mlflow
  Downloading mlflow-2.21.3-py3-none-any.whl.metadata (30 kB)
Collecting mlflow-skinny==2.21.3 (from mlflow)
  Downloading mlflow_skinny-2.21.3-py3-none-any.whl.metadata (31 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.3-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.21.3->mlflow)
  Downloading databricks_sdk-0.50.0-py3-none-any.whl.metadata (38 kB)
Collecting fastapi<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn<1 (from mlflow-skinny==2.21.3->mlflow)
  Downloading uvicorn-0.34.1-py3-none-any.whl.metadata (6.5 k

In [6]:
pwd

'/content'

In [7]:
# load the data
PATH = r"/content/cleaned_data.csv"
df = pd.read_csv(PATH)

df.head()

Unnamed: 0,comment,sentiment,stop_words_count,neg_stop_words_count,word_count,named_sentiment,sentence_length,response,nouns,verbs,adjectives
0,family mormon never tried explain still stare ...,1,13,0,39,positive,Long,0,16,4,2
1,buddhism much lot compatible christianity espe...,1,59,3,196,positive,Very Long,0,58,30,24
2,seriously say thing first get complex explain ...,-1,40,0,86,negative,Very Long,0,18,7,10
3,learned want teach different focus goal wrappi...,0,15,1,29,neutral,Long,0,7,4,2
4,benefit may want read living buddha living chr...,1,45,1,112,positive,Very Long,0,20,19,12


In [8]:
# Only use the comments column for now

final_df = df[['comment','named_sentiment']].copy()

final_df

Unnamed: 0,comment,named_sentiment
0,family mormon never tried explain still stare ...,positive
1,buddhism much lot compatible christianity espe...,positive
2,seriously say thing first get complex explain ...,negative
3,learned want teach different focus goal wrappi...,neutral
4,benefit may want read living buddha living chr...,positive
...,...,...
29746,hona hai vaccination education insurance end m...,neutral
29747,agree push make nation either pity pakistan in...,negative
29748,jesus,neutral
29749,downvote karna tha par upvote hogaya,neutral


In [9]:
# check for missing values

final_df.isna().sum()

Unnamed: 0,0
comment,180
named_sentiment,0


In [10]:
# missing values

final_df.loc[final_df['comment'].isna()]

Unnamed: 0,comment,named_sentiment
255,,neutral
620,,neutral
678,,neutral
685,,neutral
784,,neutral
...,...,...
29411,,neutral
29571,,neutral
29598,,neutral
29702,,neutral


In [11]:
# remove the rows having missing values

print("Rows in data before removing missing values", final_df.shape[0])

final_df = final_df.dropna()

print("Rows in data after removing missing values", final_df.shape[0])

Rows in data before removing missing values 29751
Rows in data after removing missing values 29571


In [12]:
# check for duplicates

final_df.loc[final_df['comment'].duplicated(keep=False)].sort_values('comment')

Unnamed: 0,comment,named_sentiment
11011,aadhar,neutral
15254,aadhar,neutral
20346,aap,neutral
2346,aap,neutral
3110,aap,neutral
...,...,...
28187,yy,neutral
28172,yy,neutral
28081,yy,neutral
2537,zor bolo,neutral


In [13]:
# delete duplicates from data

final_df.drop_duplicates(inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final_df.drop_duplicates(inplace=True)


In [14]:
final_df.isna().sum()

Unnamed: 0,0
comment,0
named_sentiment,0


# Experimentation

In [15]:
# make X and y

X = final_df['comment']
y = final_df['named_sentiment']
X

Unnamed: 0,comment
0,family mormon never tried explain still stare ...
1,buddhism much lot compatible christianity espe...
2,seriously say thing first get complex explain ...
3,learned want teach different focus goal wrappi...
4,benefit may want read living buddha living chr...
...,...
29745,let janta decide ulema cleric
29746,hona hai vaccination education insurance end m...
29747,agree push make nation either pity pakistan in...
29748,jesus


In [16]:
# do train test split

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2,
                                                    random_state=42, stratify=y)

print("The number of rows in train data are ",X_train.shape[0])
print("The number of rows in test data are ",X_test.shape[0])

The number of rows in train data are  23321
The number of rows in test data are  5831


In [17]:
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline

In [18]:
TfidfVectorizer().__class__.__name__

'TfidfVectorizer'

In [19]:
# make a function to perform experimentation

def do_experimentation(vectorizer,n_gram):
    max_features = 10000
    if vectorizer == "bow":
        vect = CountVectorizer(max_features=max_features, ngram_range=n_gram)
    elif vectorizer == "tfidf":
        vect = TfidfVectorizer(max_features=max_features, ngram_range=n_gram)

    model_pipe = Pipeline(steps=[
        ("vec",vect),
        ("clf",RandomForestClassifier(random_state=42,n_jobs=-1))
    ])

    # fit the pipeline on training data
    model_pipe.fit(X_train,y_train)
    # calculate predictions
    y_pred = model_pipe.predict(X_test)

    with mlflow.start_run(run_name=f"{vect.__class__.__name__}{n_gram}") as run:
        # accuracy score
        accuracy = accuracy_score(y_test,y_pred)
        mlflow.log_metric("accuracy",accuracy)

        # classification report
        report = classification_report(y_test,y_pred,output_dict=True)
        # log classification report
        for label, metrics in report.items():
            if isinstance(metrics,dict):
                for metric, value in metrics.items():
                    mlflow.log_metric(f"{label}_{metric}",value)

        # log the vectorizer name
        mlflow.log_param("vectorizer",vectorizer)

        # log the ngram range
        mlflow.log_param("ngram",n_gram)

    return "OK"

In [20]:
# set the tracking uri

mlflow.set_tracking_uri("http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/")

In [None]:
# set experiment name

mlflow.set_experiment("Ex-1: Choose Vectorizer and n_gram range")

2025/04/09 11:48:55 INFO mlflow.tracking.fluent: Experiment with name 'Ex-1: Choose Vectorizer and n_gram range' does not exist. Creating a new experiment.


<Experiment: artifact_location='s3://yt-senti-mlfow/625052568021143016', creation_time=1744199335519, experiment_id='625052568021143016', last_update_time=1744199335519, lifecycle_stage='active', name='Ex-1: Choose Vectorizer and n_gram range', tags={}>

In [None]:
vectorizer_types = ["bow","tfidf"]
n_gram_ranges = [(1,1),(1,2),(1,3)]

for vectorizer in vectorizer_types:
    for n_gram in n_gram_ranges:
        print(do_experimentation(vectorizer=vectorizer,
                                n_gram=n_gram))

🏃 View run CountVectorizer(1, 1) at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016/runs/8938f5b57802440d9252328c4586ea60
🧪 View experiment at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016
OK
🏃 View run CountVectorizer(1, 2) at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016/runs/81e149ef07b642dea9fa762c3f623c90
🧪 View experiment at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016
OK
🏃 View run CountVectorizer(1, 3) at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016/runs/2f2882789fed4c5da683ab1968fb189d
🧪 View experiment at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016
OK
🏃 View run TfidfVectorizer(1, 1) at: http://ec2-44-202-16-60.compute-1.amazonaws.com:5000/#/experiments/625052568021143016/runs/6a1444582a36486a9c72662b649c42d1
🧪 View experiment at: h

![image.png](attachment:efe52f48-2921-4f7a-86ae-569bb2636c7b.png)

**Experiments suggest that TfIdf with n_gram range of (1,3) works best for the combination of all the 3 classes**