In [6]:
import mlflow
import pandas as pd
import mlflow.sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import pandas as pd
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import numpy as np
from nltk.tokenize import word_tokenize

In [14]:
import nltk
nltk.download('wordnet')


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VAIBHAVRAI\AppData\Roaming\nltk_data...


True

In [18]:
df = pd.read_csv(r"C:\Users\VAIBHAVRAI\OneDrive\Desktop\mlops-main\MLOPS-Working-proj\Mlops-project-Main\IMDB.csv")
df = df.sample(500)
df.to_csv("real_data.csv", index= False)
df.head()

Unnamed: 0,review,sentiment
786,Oh boy! Oh boy! On the cover of worn out VHS h...,negative
294,"If I had never seen the first Road House, then...",negative
424,I thought this movie was pretty good. Some par...,positive
62,This movie is about a man who likes to blow hi...,negative
805,"Sarafina was a fun movie, and some of the song...",positive


In [7]:
def lemmitization(text):
    """
    This function is used to perform lemmitization on the text data
    """
    lemitizer = WordNetLemmatizer()
    words = word_tokenize(text)
    words = [lemitizer.lemmatize(word) for word in words]
    return " ".join(words)
    

In [8]:
def remove_the_stop_words(text):
    """ 
    This function is used to remove the stop words from the text data
    """
    words = word_tokenize(text)
    words = [word for word in words if word not in stopwords.words("english")]
    return " ".join(words)

In [10]:
def removing_the_numerical_values(text):
    """ 
    This function is used to remove the numerical values from the text data
    
    """
    words = word_tokenize(text)
    words = [word for word in words if not word.isnumeric()]
    return " ".join(words)

In [9]:
def lower_case(text):
    """
    This function is used to convert the text data into lower case
    """
    words = word_tokenize(text)
    words = [word.lower() for word in words]
    return " ".join(words)

In [11]:
def removing_the_urls(text):
    """
    This function is used to remove the urls from the text data
    """
    return re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.MULTILINE)

In [12]:
def normalize_text_of_the_dataframe(text):
    """
    This function is used to normalize the text data of the dataframe basically the review section using the above functions 
    """
    try:
        df['review'] = df['review'].apply(lower_case)
        df['review'] = df['review'].apply(removing_the_urls)
        df['review'] = df['review'].apply(removing_the_numerical_values)
        df['review'] = df['review'].apply(remove_the_stop_words)
        df['review'] = df['review'].apply(lemmitization)
        return df
    except Exception as e:
        print(f"Error during the normalization of the text data {e}")
        raise 

In [15]:
df  = normalize_text_of_the_dataframe(df)
df.head()

Unnamed: 0,review,sentiment
41,"piece crap might acclaimed year ago , one raci...",negative
976,early film flaw -- predictable plot overlong s...,positive
965,movie true reflection australian resourcefulne...,positive
370,"men , love police movie filled action , shooti...",negative
939,plenty comment already posted saying exactly f...,negative


In [19]:
df['sentiment'].value_counts()


sentiment
negative    267
positive    233
Name: count, dtype: int64

In [22]:
x = df['sentiment'].isin(['positive', 'negative'])
df = df[x]

In [23]:
df['sentiment'] = df['sentiment'].map({"positive": 1, "negative": 0})
df.head()

Unnamed: 0,review,sentiment
786,Oh boy! Oh boy! On the cover of worn out VHS h...,0
294,"If I had never seen the first Road House, then...",0
424,I thought this movie was pretty good. Some par...,1
62,This movie is about a man who likes to blow hi...,0
805,"Sarafina was a fun movie, and some of the song...",1


In [24]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [None]:
vectorizers = CountVectorizer(max_features= 200)
X = vectorizers.fit_transform(df['review'])
y = df['sentiment']

In [26]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [28]:
import dagshub
mlflow.set_tracking_uri('https://dagshub.com/VaibhavRai24/MLOPS-Working-proj.mlflow')
dagshub.init(repo_owner= 'VaibhavRai24', repo_name= 'MLOPS-Working-proj', mlflow= True)
mlflow.set_experiment("Sentimental Analysis on the IMDB DATASET")


2025/03/01 19:27:08 INFO mlflow.tracking.fluent: Experiment with name 'Sentimental Analysis on the IMDB DATASET' does not exist. Creating a new experiment.


<Experiment: artifact_location='mlflow-artifacts:/12972dfc0cb8466cb06ef7b1cec3d3ff', creation_time=1740837420214, experiment_id='0', last_update_time=1740837420214, lifecycle_stage='active', name='Sentimental Analysis on the IMDB DATASET', tags={}>

In [29]:
import mlflow
import logging
import time
import os
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression

In [30]:
logging.basicConfig(level=logging.INFO, format= "%(asctime)s - %(levelname)s - %(message)s")
logging.info("Starting the MLFLOW process and training the model")

with mlflow.start_run():
    start_time = time.time()
    logging.info("MLFLOW process has been started")

    try:
        logging.info("Logging the praameters")
        mlflow.log_param("vectorizers", "Bag of words")
        mlflow.log_param("max_features", 200)
        mlflow.log_param("test_size", 0.25)
        
        
        logging.info("Training the model using the Logistic Regression")
        model = LogisticRegression(max_iter= 1000)
        model.fit(X_train, y_train)
        logging.info("Model has been trained successfully")
        
        mlflow.log_param("model", "Logistic Regression")
        logging.info("Going to make predictions on the test data")
        y_pred = model.predict(X_test)
        
        logging.info("Calculating the metrics of the model")
        accuracy = accuracy_score(y_test, y_pred)
        precision = precision_score(y_test, y_pred)
        recall = recall_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        
        logging.info("Logging the metrics of the model")
        mlflow.log_metric("accuracy", accuracy) 
        mlflow.log_metric("precision", precision)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("f1", f1)
        
        logging.info("MODEL PREDCITIONS HAVE BEEN MADE SUCCESSFULLY")
        logging.info("MLFLOW PROCESS HAS BEEN COMPLETED")
        
        mlflow.sklearn.log_model(model, "model")
        end_time = time.time()
        logging.info(f"Total time taken to complete the process is {end_time - start_time}")
        
        logging.info(f"Accuracy: {accuracy}")
        logging.info(f"Precision: {precision}")
        logging.info(f"Recall: {recall}")
        logging.info(f"F1 Score: {f1}")
        
        
    except Exception as e:
        logging.error(f"Error during the MLFLOW process {e}", exc_info= True)
        raise

2025-03-01 19:43:26,577 - INFO - Starting the MLFLOW process and training the model
2025-03-01 19:43:27,603 - INFO - MLFLOW process has been started
2025-03-01 19:43:27,605 - INFO - Logging the praameters
2025-03-01 19:43:28,737 - INFO - Training the model using the Logistic Regression
2025-03-01 19:43:28,792 - INFO - Model has been trained successfully
2025-03-01 19:43:29,187 - INFO - Going to make predictions on the test data
2025-03-01 19:43:29,189 - INFO - Calculating the metrics of the model
2025-03-01 19:43:29,201 - INFO - Logging the metrics of the model
2025-03-01 19:43:30,692 - INFO - MODEL PREDCITIONS HAVE BEEN MADE SUCCESSFULLY
2025-03-01 19:43:30,693 - INFO - MLFLOW PROCESS HAS BEEN COMPLETED
2025-03-01 19:43:41,854 - INFO - Total time taken to complete the process is 14.250206232070923
2025-03-01 19:43:41,855 - INFO - Accuracy: 0.656
2025-03-01 19:43:41,855 - INFO - Precision: 0.7058823529411765
2025-03-01 19:43:41,856 - INFO - Recall: 0.5625
2025-03-01 19:43:41,857 - INFO

🏃 View run lyrical-owl-476 at: https://dagshub.com/VaibhavRai24/MLOPS-Working-proj.mlflow/#/experiments/0/runs/fa489ff891cc4ed28a50c693a9ad7b3b
🧪 View experiment at: https://dagshub.com/VaibhavRai24/MLOPS-Working-proj.mlflow/#/experiments/0
