In [37]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import LabelEncoder
import re
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import dagshub
import os
import mlflow.sklearn
import mlflow

In [38]:
df = pd.read_csv("../data/external/emotion_dataset.csv")
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [39]:
df.drop(columns=["tweet_id"], inplace=True)
df = df[df["sentiment"].isin(["happiness", "sadness"])]

In [40]:
test_size = 0.2
X_train, X_test, y_train, y_test = train_test_split(df.iloc[:, 1:], df.iloc[:, 0], test_size=test_size, random_state=42)

In [41]:
encode = LabelEncoder()
y_train = encode.fit_transform(y_train)
y_test = encode.transform(y_test)

In [42]:
def lemmatization(text: str) -> str:
    lemmatizer = WordNetLemmatizer()

    text = text.split()
    text=[lemmatizer.lemmatize(y) for y in text]

    return " ".join(text)

def remove_stop_words(text: str) -> str:
    try:
        stop_words = set(stopwords.words("english"))
    except Exception:
        print("An error has occurred. If stopwords aren't there please download.")
        raise
    else:
        text=[i for i in str(text).split() if i not in stop_words]
        return " ".join(text)

def removing_numbers(text: str) -> str:
    text = "".join([i for i in text if not i.isdigit()])
    return text

def lower_case(text: str) -> str:
    text = text.split()

    text=[y.lower() for y in text]

    return " ".join(text)

def removing_punctuations(text: str) -> str:
    ## Remove Punctuations
    text = re.sub("[%s]" % re.escape("""!"#$%&'()*+,.-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = text.replace(':', "")

    ## remove extra whitespace
    text = re.sub('\s+', ' ', text)
    text = " ".join(text.split())
    return text.strip()

def removing_urls(text: str) -> str:
    url_pattern = re.compile(r"https?://\S+|www\.\S+")
    return url_pattern.sub(r"", text)

def remove_small_sentences(df: pd.DataFrame) -> None:
    for i in range(len(df)):
        if len(df.text.iloc[i].split()) < 3:
            df.text.iloc[i] = np.nan

def normalize_text(df: pd.DataFrame) -> pd.DataFrame:
    df.content = df.content.apply(lambda content : lower_case(content))
    df.content = df.content.apply(lambda content : remove_stop_words(content))
    df.content = df.content.apply(lambda content : removing_numbers(content))
    df.content = df.content.apply(lambda content : removing_punctuations(content))
    df.content = df.content.apply(lambda content : removing_urls(content))
    df.content = df.content.apply(lambda content : lemmatization(content))
    return df

  text = re.sub("[%s]" % re.escape("""!"#$%&'()*+,.-./:;<=>?@[\]^_`{|}~"""), ' ', text)
  text = re.sub('\s+', ' ', text)


In [43]:
X_train = normalize_text(X_train)
X_test = normalize_text(X_test)

In [44]:
vectorizers = [CountVectorizer, TfidfVectorizer]
models = [LogisticRegression, GradientBoostingClassifier, RandomForestClassifier, XGBClassifier]
max_cols = 100

In [45]:
X_train

Unnamed: 0,content
23531,quot my problem miss you cause don t quot
8051,that s it done already one proof there s nothi...
11499,hungry food steal
31288,foot hurt finally bed will forget crunch over ...
18561,really ill atm
...,...
21697,chocolatesuze yes yes should especially wine m...
19445,kickzfadayz boy better get tonight
20216,tafe actually quite good
3258,minute boarding hour home window seat


In [48]:
mlflow.set_tracking_uri("https://dagshub.com/PriyanshuMewal/mini-project.mlflow")

dagshub.init(repo_owner='PriyanshuMewal', repo_name='mini-project', mlflow=True)

mlflow.set_experiment("Exp2: Best Combination of fe and model.")

with mlflow.start_run(run_name="best model") as parent:

    for vectorizer in vectorizers:
        for algo in models:
            
            with mlflow.start_run(run_name=f"{vectorizer.__name__}_and_{algo.__name__}", nested=True) as children:

                print(f"{vectorizer.__name__}_and_{algo.__name__}")
                
                vector = vectorizer(max_features=max_cols)
                X_train_trf_mat = vector.fit_transform(X_train["content"].values)
                X_test_trf_mat = vector.transform(X_test["content"].values)
            
                X_train_trf = pd.DataFrame.sparse.from_spmatrix(X_train_trf_mat, columns=vector.get_feature_names_out())
                X_test_trf = pd.DataFrame.sparse.from_spmatrix(X_test_trf_mat, columns=vector.get_feature_names_out())

                model = algo()
                model.fit(X_train_trf, y_train)
            
                y_pred = model.predict(X_test_trf)
            
                accuracy = accuracy_score(y_pred, y_test)
                precision = precision_score(y_pred, y_test)
                recall = recall_score(y_pred, y_test)
                f1 = f1_score(y_pred, y_test)
                
                # log params
                mlflow.log_param("max_features", max_cols)
                mlflow.log_param("test_size", test_size)
                mlflow.log_param("vectorizer", f"{vectorizer.__name__}")
                mlflow.log_param("model", f"{algo.__name__}")
                
                # log metrics
                mlflow.log_metric("accuracy", accuracy)
                mlflow.log_metric('precision', precision)
                mlflow.log_metric('recall', recall)
                mlflow.log_metric('f1_score', f1)
                
                # log source
                # notebook_path = "data_ingestion.ipynb"
                # os.system(f"jupyter nbconvert --to notebook --execute --inplace {notebook_path}")
                # mlflow.log_artifact(notebook_path)
                
                # log model
                mlflow.sklearn.log_model(model, name=f"{algo.__name__}")


CountVectorizer_and_LogisticRegression
üèÉ View run CountVectorizer_and_LogisticRegression at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/b6d4049bebec4812ab019245fa47d072
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
CountVectorizer_and_GradientBoostingClassifier
üèÉ View run CountVectorizer_and_GradientBoostingClassifier at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/98db260d4e0c4481a442dc38a09d2b3f
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
CountVectorizer_and_RandomForestClassifier
üèÉ View run CountVectorizer_and_RandomForestClassifier at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/9d35980decbb475bb7cf683882d36b7b
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
CountVectorizer_and_XGBClassifier




üèÉ View run CountVectorizer_and_XGBClassifier at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/f2c4786ac191461b866c8cf3b3f89a57
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
TfidfVectorizer_and_LogisticRegression
üèÉ View run TfidfVectorizer_and_LogisticRegression at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/24d30e21347d49229555c273f5154e0f
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
TfidfVectorizer_and_GradientBoostingClassifier
üèÉ View run TfidfVectorizer_and_GradientBoostingClassifier at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/fe337bfb803e4f1992db38d94a29836f
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
TfidfVectorizer_and_RandomForestClassifier
üèÉ View run TfidfVectorizer_and_RandomForestClassifier at: https://dagshub.com/P



üèÉ View run TfidfVectorizer_and_XGBClassifier at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/8c849666f91542039864bebe4dea249f
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
üèÉ View run best model at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3/runs/b2981fadb37d45ef9797befa2dd9d686
üß™ View experiment at: https://dagshub.com/PriyanshuMewal/mini-project.mlflow/#/experiments/3
