In [25]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
import nltk

In [18]:
df=pd.read_csv('data.csv')

In [19]:
df.head

<bound method NDFrame.head of                                                 review sentiment
0    Every great gangster movie has under-currents ...  positive
1    I just saw this film last night, and I have to...  positive
2    This film is mildly entertaining if one neglec...  negative
3    Quentin Tarantino's partner in crime Roger Ava...  negative
4    I sat through this on TV hoping because of the...  negative
..                                                 ...       ...
495  I was really disappointed by this movie. Great...  negative
496  This is a great example of a good, dumb movie....  positive
497  Do you know that they want to escavate the Moo...  negative
498  I really wanted to like The Pillow Book. Intri...  negative
499  Steve Biko was a black activist who tried to r...  positive

[500 rows x 2 columns]>

In [22]:
def lemmatization(text):
    lemmatizer = WordNetLemmatizer()
    text=text.split()
    text=[lemmatizer.lemmatize(word) for word in text]
    return ' '.join(text)

In [1]:
def remove_stopwords(text):
    stop_words=set(stopwords.words('english'))
    text=text.split()
    text=[word for word in text if word not in stop_words]
    return ' '.join(text)


In [2]:
def removing_numbers(text):
    text="".join([i for i in text if not i.isdigit()])
    return text

In [9]:
def removing_punctuation(text):
    text=re.sub('[%s]'% re.escape(string.punctuation),' ',text)
    text=re.sub(r"\s+",' ',text).strip()
    return text

In [12]:
def removing_urls(text):
    url_pattern=re.compile(r'https?://\S+|www\.\S+')
    return url_pattern.sub(r'',text)

In [13]:
def lower_case(text):
    text=text.split()
    text=[word.lower() for word in text]
    return ' '.join(text)

In [16]:
def normalize_text(df):
    try:
        df['review']=df['review'].apply(lower_case)
        df['review']=df['review'].apply(removing_urls)
        df['review']=df['review'].apply(removing_punctuation)   
        df['review']=df['review'].apply(removing_numbers)
        df['review']=df['review'].apply(remove_stopwords)
        
        df['review']=df['review'].apply(lemmatization)
        return df
    except Exception as e:
        print(f"Error in normalize_text: {e}")
        raise
        

In [27]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pranto\AppData\Roaming\nltk_data...


True

In [28]:
df=normalize_text(df)
df.head()

Unnamed: 0,review,sentiment
0,every great gangster movie current human drama...,positive
1,saw film last night say loved every minute tak...,positive
2,film mildly entertaining one neglect acknowled...,negative
3,quentin tarantino partner crime roger avary co...,negative
4,sat tv hoping name would worth time dear gussi...,negative


In [29]:
df['sentiment'].value_counts()

sentiment
negative    269
positive    231
Name: count, dtype: int64

In [30]:
x=df['sentiment'].isin(['positive','negative'])
df=df[x]    

In [31]:
df['sentiment']=df['sentiment'].map({'positive':1,'negative':0})
df.head()

Unnamed: 0,review,sentiment
0,every great gangster movie current human drama...,1
1,saw film last night say loved every minute tak...,1
2,film mildly entertaining one neglect acknowled...,0
3,quentin tarantino partner crime roger avary co...,0
4,sat tv hoping name would worth time dear gussi...,0


In [32]:
vectorizer=CountVectorizer(max_features=50)
x=vectorizer.fit_transform(df['review'])
y=df['sentiment']


In [33]:
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2,random_state=42)

In [36]:
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/mondolpranto83/SentimentOps.mlflow')
dagshub.init(repo_owner='mondolpranto83',repo_name='SentimentOps',mlflow=True)

mlflow.set_experiment('Sentiment_Analysis_Experiment')



Open the following link in your browser to authorize the client:
https://dagshub.com/login/oauth/authorize?state=6bdaf60e-7905-4678-b44b-281989317b8a&client_id=32b60ba385aa7cecf24046d8195a71c07dd345d9657977863b52e7748e0f0f28&middleman_request_id=d3fadc62adeefb7f862537e38194524086f3acf7d49f72ca4eb001c9ed605b34




2026/01/30 12:39:10 INFO mlflow.tracking.fluent: Experiment with name 'Sentiment_Analysis_Experiment' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/06b0cbe1d1c8430da760d197216ad29c', creation_time=1769794752953, experiment_id='0', last_update_time=1769794752953, lifecycle_stage='active', name='Sentiment_Analysis_Experiment', tags={}>

In [38]:
import mlflow
import logging
import os
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score 

logging.basicConfig(level=logging.INFO,format='%(asctime)s - %(levelname)s - %(message)s')
logging.info("Starting model training and evaluation")

with mlflow.start_run():
    Start_time=time.time()
    
    try:
        logging.info("Logging preprocessing parameters...")
        mlflow.log_param('vectorizer','Bag of words')
        mlflow.log_param('vectorizer_max_features',50)
        mlflow.log_param('test_size',0.2)
        
        logging.info("Initializing and training the Logistic Regression model...")
        model=LogisticRegression(max_iter=1000)
        model.fit(x_train,y_train)
        logging.info("Model training completed.")
        mlflow.log_param('model','Logistic Regression') 
        logging.info("Making pred .......")
        
        logging.info("Evaluating model performance...")
        y_pred=model.predict(x_test)
        accuracy=accuracy_score(y_test,y_pred)
        precision=precision_score(y_test,y_pred)
        recall=recall_score(y_test,y_pred)
        f1=f1_score(y_test,y_pred)
        logging.info("Saving and logging metrics...")
        
        mlflow.log_metric('accuracy',accuracy)
        mlflow.log_metric('precision',precision)
        mlflow.log_metric('recall',recall)
        mlflow.log_metric('f1_score',f1)
        
        logging.info("Logging the trained model...")
        mlflow.sklearn.log_model(model,'sentiment_analysis_model')
        
        end_time=time.time()
        elapsed_time=end_time - Start_time
        logging.info(f"Total time taken: {elapsed_time} seconds")
               
        logging.info("Model training and evaluation completed successfully.")
        logging.info(f"Accuracy: {accuracy}, Precision: {precision}, Recall: {recall}, F1 Score: {f1}")
    except Exception as e:
        logging.error(f"An error occurred: {e}",exc_info=True)
        raise
    
        
        

2026-01-30 12:55:55,191 - INFO - Starting model training and evaluation
2026-01-30 12:55:55,515 - INFO - Logging preprocessing parameters...
2026-01-30 12:55:56,510 - INFO - Initializing and training the Logistic Regression model...
2026-01-30 12:55:56,532 - INFO - Model training completed.
2026-01-30 12:55:56,837 - INFO - Making pred .......
2026-01-30 12:55:56,839 - INFO - Evaluating model performance...
2026-01-30 12:55:56,882 - INFO - Saving and logging metrics...
2026-01-30 12:55:58,358 - INFO - Logging the trained model...
  flavor.save_model(path=local_path, mlflow_model=mlflow_model, **kwargs)
2026-01-30 12:56:13,467 - INFO - Total time taken: 17.95179533958435 seconds
2026-01-30 12:56:13,467 - INFO - Model training and evaluation completed successfully.
2026-01-30 12:56:13,477 - INFO - Accuracy: 0.68, Precision: 0.7142857142857143, Recall: 0.6, F1 Score: 0.6521739130434783


üèÉ View run aged-kit-215 at: https://dagshub.com/mondolpranto83/SentimentOps.mlflow/#/experiments/0/runs/b18afa07acf643b1973cbfe274683da4
üß™ View experiment at: https://dagshub.com/mondolpranto83/SentimentOps.mlflow/#/experiments/0
