In [1]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import pandas as pd
import numpy as np
import re
import pickle
import mlflow.pyfunc
import os
import mlflow

# Fetch the NLTK resources used further down: the stop-word lists
# (read via stopwords.words) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\RR\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\RR\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
def lemmatization(text: str) -> str:
    """Lemmatize each whitespace-separated token of ``text``.

    Every token is passed to ``WordNetLemmatizer.lemmatize`` with its default
    arguments, and the results are joined back together with single spaces
    (so any run of whitespace in the input collapses to one space).

    Args:
        text: The string to lemmatize.

    Returns:
        The lemmatized tokens joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    return " ".join(map(wnl.lemmatize, text.split()))

def remove_stop_words(text: str) -> str:
    """Drop English stop words from ``text``.

    The input is coerced with ``str()``, split on whitespace, and every token
    that appears in NLTK's English stop-word list is discarded. Matching is
    exact (case-sensitive); the surviving tokens are joined by single spaces.

    Args:
        text: The string to filter.

    Returns:
        The remaining tokens joined by single spaces.

    Raises:
        Exception: Whatever ``stopwords.words`` raised is re-raised after a
            hint has been printed.
    """
    try:
        english_stop_words = set(stopwords.words("english"))
    except Exception:
        print("An error has occurred. If stopwords aren't there please download.")
        raise

    kept_tokens = []
    for token in str(text).split():
        if token not in english_stop_words:
            kept_tokens.append(token)
    return " ".join(kept_tokens)

def removing_numbers(text: str) -> str:
    """Return ``text`` without any character for which ``str.isdigit`` is true.

    All other characters, including whitespace, are kept in their original
    order.

    Args:
        text: The string to strip digits from.

    Returns:
        ``text`` with digit characters removed.
    """
    kept_chars = []
    for ch in text:
        if ch.isdigit():
            continue
        kept_chars.append(ch)
    return "".join(kept_chars)

def lower_case(text: str) -> str:
    """Lower-case every whitespace-separated token of ``text``.

    Tokens are re-joined with single spaces, so leading/trailing whitespace
    is dropped and internal whitespace runs collapse to one space.

    Args:
        text: The string to lower-case.

    Returns:
        The lower-cased tokens joined by single spaces.
    """
    return " ".join(map(str.lower, text.split()))

def removing_punctuations(text: str) -> str:
    """Replace ASCII punctuation with spaces and normalize whitespace.

    Every character in the punctuation list below is replaced by a space;
    the result is then split on whitespace and re-joined with single spaces,
    which also removes leading and trailing whitespace.

    Args:
        text: The string to clean.

    Returns:
        ``text`` without punctuation, tokens separated by single spaces.
    """
    # A raw string keeps the backslash in the list as a literal character
    # instead of being parsed as an (invalid) ``\]`` string escape.
    punctuation = r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""
    text = re.sub("[%s]" % re.escape(punctuation), " ", text)

    # split()/join collapses every whitespace run and trims both ends, so no
    # separate regex substitution or strip() is required.
    return " ".join(text.split())

def removing_urls(text: str) -> str:
    """Delete ``http://`` / ``https://`` and ``www.`` links from ``text``.

    A link is matched from its prefix up to the next whitespace character
    and replaced with the empty string; surrounding whitespace is untouched.

    Args:
        text: The string to strip links from.

    Returns:
        ``text`` with matched links removed.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)

def normalize_text(content: str) -> str:
    """Run the full text-cleaning pipeline on ``content``.

    The steps are applied in this fixed order: lower-casing, stop-word
    removal, digit removal, punctuation removal, URL removal, lemmatization.

    Args:
        content: The raw text to normalize.

    Returns:
        The cleaned text produced by the last step.
    """
    # NOTE(review): removing_punctuations replaces ':', '/' and '.' with
    # spaces before removing_urls runs, so the URL pattern can no longer
    # match at that point. The order is kept as-is on purpose — presumably
    # the pickled vectorizer was built with this exact order; confirm before
    # reordering, otherwise inference features will drift from training.
    pipeline = (
        lower_case,
        remove_stop_words,
        removing_numbers,
        removing_punctuations,
        removing_urls,
        lemmatization,
    )
    for step in pipeline:
        content = step(content)
    return content

  text = re.sub("[%s]" % re.escape("""!"#$%&'()*+,.-./:;<=>?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)


In [4]:
# Run the cleaning pipeline on one sample sentence and display the result.
text = "i miss my boo  on another note im soready for this game to come on tonight...fox grill anyone???!!"
norm_text = normalize_text(text)
norm_text

'miss boo another note im soready game come tonight fox grill anyone'

In [5]:
# NOTE(review): `df` is not referenced by any later cell visible here —
# confirm this load is still needed.
df = pd.read_csv("../data/external/emotion_dataset.csv")

In [6]:
# Load the pickled vectorizer from disk.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load artifacts from a trusted source.
with open("../models/vectorizer.pkl", "rb") as file:
    vectorizer = pickle.load(file)

# Encode the normalized sample. This rebinds `text` from the raw sentence
# to the vectorizer's output.
text = vectorizer.transform([norm_text])

In [7]:
# Convert the vectorizer output to a dense DataFrame whose columns are the
# vectorizer's feature names, then display it. `text` is rebound once more.
text = pd.DataFrame(text.toarray(), columns=vectorizer.get_feature_names_out())
text

Unnamed: 0,back,day,get,go,going,good,got,happy,http,im,like,lol,miss,one,quot,really,sad,time,today,work
0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0


In [8]:
# load model from model registry:
# Read the DagsHub access token from the environment and fail fast if it is
# missing, so the credential never has to be written into the notebook.
dagshub_token = os.getenv("DAGSHUB_PAT")
if not dagshub_token:
    raise EnvironmentError("DAGSHUB_PAT environment variable is not set.")

# The same token is exported as both the MLflow tracking username and password.
os.environ["MLFLOW_TRACKING_USERNAME"] = dagshub_token
os.environ["MLFLOW_TRACKING_PASSWORD"] = dagshub_token

# Build the tracking URI from the DagsHub host, repository owner and name.
dagshub_url = "https://dagshub.com"
repo_owner = "PriyanshuMewal"
repo_name = 'mini-project'

mlflow.set_tracking_uri(f"{dagshub_url}/{repo_owner}/{repo_name}.mlflow")

# Registered model name and the alias used to select which version to load.
model_name = "emotion_detection"
alias = "champion"

# Load the model through the `models:/<name>@<alias>` URI.
model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}@{alias}")

# Print the loaded model's id so the run records which model was used.
print(model.model_id)

  from .autonotebook import tqdm as notebook_tqdm
Downloading artifacts: 100%|█████████████████████████████████████████████████████████████| 5/5 [00:02<00:00,  1.85it/s]


m-64af5c10319c43108e275487c3c61992


In [9]:
# Show the feature DataFrame as a plain NumPy array.
text.values

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0]])

In [13]:
# Predict on the feature DataFrame and show the first (only) prediction.
model.predict(text)[0]

np.int64(0)

In [None]:
# NOTE(review): this cell has no recorded execution. `model` is the object
# returned by mlflow.pyfunc.load_model — confirm that wrapper actually exposes
# `feature_names_in_`, or remove this leftover exploration cell.
model.feature_names_in_