In [1]:
import pandas as pd
import re
import string
from sklearn.preprocessing import Normalizer
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords as nltk_stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Shirs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
df = pd.read_csv("../data/raw/tweet_emotions.csv")
df.head()

Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


In [3]:
df.drop(columns=["tweet_id"], inplace=True)
df.head()

Unnamed: 0,sentiment,content
0,empty,@tiffanylue i know i was listenin to bad habi...
1,sadness,Layin n bed with a headache ughhhh...waitin o...
2,sadness,Funeral ceremony...gloomy friday...
3,enthusiasm,wants to hang out with friends SOON!
4,neutral,@dannycastillo We want to trade with someone w...


In [4]:
x=df['sentiment'].isin(['happiness', 'sadness'])
df=df[x]

In [5]:
df['sentiment'] = df['sentiment'].map({'happiness': 1, 'sadness': 0})
df.head()

Unnamed: 0,sentiment,content
1,0,Layin n bed with a headache ughhhh...waitin o...
2,0,Funeral ceremony...gloomy friday...
6,0,"I should be sleep, but im not! thinking about ..."
8,0,@charviray Charlene my love. I miss you
9,0,@kelcouch I'm sorry at least it's Friday?


In [6]:
# Complete text preprocessing functions
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer

# Initialize lemmatizer and stopwords
lemmatizer = WordNetLemmatizer()
stopwords = set(nltk_stopwords.words('english'))

def lemmatize_text(text):
    """Apply lemmatization to reduce words to their base form"""
    words = text.split()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in words]
    return ' '.join(lemmatized_words)

def remove_stopwords(text):
    """Remove common English stopwords"""
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stopwords]
    return ' '.join(filtered_words)

def preprocess_text(text):
    """Complete text preprocessing pipeline"""
    if pd.isna(text):
        return ""
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    
    # Remove user mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Remove stopwords
    text = remove_stopwords(text)
    
    # Apply lemmatization
    text = lemmatize_text(text)
    
    return text

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Shirs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Shirs\AppData\Roaming\nltk_data...


[nltk_data]   Package omw-1.4 is already up-to-date!


In [7]:
# Applying preprocessing to the content column
print("Preprocessing the text data...")
df['processed_content'] = df['content'].apply(preprocess_text)

# Display comparison of original vs processed text
print("\nComparison of original vs processed text:")
for i in range(3):
    print(f"\nExample {i+1}:")
    print(f"Original: {df['content'].iloc[i]}")
    print(f"Processed: {df['processed_content'].iloc[i]}")

# Check for any empty processed content
empty_count = df['processed_content'].str.len().eq(0).sum()
print(f"\nNumber of tweets with empty processed content: {empty_count}")

# Remove rows with empty processed content
df = df[df['processed_content'].str.len() > 0]
print(f"Dataset shape after removing empty content: {df.shape}")

# Display final dataset info
print(f"\nFinal dataset shape: {df.shape}")
print(f"Sentiment distribution:")
print(df['sentiment'].value_counts())

Preprocessing the text data...

Comparison of original vs processed text:

Example 1:
Original: Layin n bed with a headache  ughhhh...waitin on your call...
Processed: layin n bed headache ughhhhwaitin call

Example 2:
Original: Funeral ceremony...gloomy friday...
Processed: funeral ceremonygloomy friday

Example 3:
Original: I should be sleep, but im not! thinking about an old friend who I want. but he's married now. damn, &amp; he wants me 2! scandalous!
Processed: sleep im thinking old friend want he married damn amp want scandalous

Number of tweets with empty processed content: 15
Dataset shape after removing empty content: (10359, 3)

Final dataset shape: (10359, 3)
Sentiment distribution:
sentiment
1    5202
0    5157
Name: count, dtype: int64


In [8]:
df.drop(columns=["content"], inplace=True)
df.head()

Unnamed: 0,sentiment,processed_content
1,0,layin n bed headache ughhhhwaitin call
2,0,funeral ceremonygloomy friday
6,0,sleep im thinking old friend want he married d...
8,0,charlene love miss
9,0,im sorry least friday


In [9]:
df.vectorizer = CountVectorizer(max_features=1000)
X=df.vectorizer.fit_transform(df['processed_content'])
y=df['sentiment']

In [10]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [11]:
import dagshub
dagshub.init(repo_owner='Pratt33', repo_name='mlops-miniproject', mlflow=True)

import mlflow
mlflow.set_tracking_uri("https://dagshub.com/Pratt33/mlops-miniproject.mlflow")
mlflow.set_experiment("logistic_regression_baseline")


<Experiment: artifact_location='mlflow-artifacts:/834f6c711f5f4a2daef13d33da39bf89', creation_time=1755198326144, experiment_id='0', last_update_time=1755198326144, lifecycle_stage='active', name='logistic_regression_baseline', tags={}>

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

with mlflow.start_run():
    mlflow.log_param('vectorizer', 'CountVectorizer/BOW')
    mlflow.log_param('max_features', 1000)
    mlflow.log_param('test_size', 0.2)

    #model building and training
    model = LogisticRegression()
    model.fit(X_train, y_train)

    #log model params
    mlflow.log_param('model', 'LogisticRegression')

    #model evaluation
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    #log eval metrics
    mlflow.log_metric('accuracy', accuracy)
    mlflow.log_metric('precision', precision)
    mlflow.log_metric('recall', recall)
    mlflow.log_metric('f1', f1)

    #log model
    mlflow.sklearn.log_model(model, "model")

    # Note: Logging the notebook as an artifact is not supported on Dagshub MLflow server.
    # If you need to save the notebook, do it manually or use a supported storage solution.
    # import os
    # notebook_path = os.path.basename(__file__)
    # os.system(f"jupyter nbconvert --to notebook --inplace {notebook_path}")
    # mlflow.log_artifact(notebook_path)



🏃 View run silent-pug-362 at: https://dagshub.com/Pratt33/mlops-miniproject.mlflow/#/experiments/0/runs/ae61fdb1b71d4563b9fe252e20bfac4f
🧪 View experiment at: https://dagshub.com/Pratt33/mlops-miniproject.mlflow/#/experiments/0


RestException: INTERNAL_ERROR: Response: {'error': 'unsupported endpoint, please contact support@dagshub.com'}