<a href="https://colab.research.google.com/github/Varun1324/Automated-Receipt-Parsing-using-AI/blob/main/RandomForest_Sentiment_Emotion.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from xgboost import XGBClassifier
import numpy as np

In [2]:
nltk.download('punkt')
nltk.download('punkt_tab')  # Download the punkt resource
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [3]:
# Load dataset
df_new = pd.read_csv('/content/expanded_dataset.csv')

In [4]:
df_new.head()

Unnamed: 0,id,text,label,sentiment,emotions
0,9536,"Cooking microwave pizzas, yummy",2,positive,happy
1,6135,Any plans of allowing sub tasks to show up in ...,1,neutral,indifferent
2,17697,"I love the humor, I just reworded it. Like sa...",2,positive,joyful
3,14182,naw idk what ur talkin about,1,neutral,curious
4,17840,That sucks to hear. I hate days like that,0,negative,upset


In [5]:
X = df_new['text']

In [6]:
X

Unnamed: 0,text
0,"Cooking microwave pizzas, yummy"
1,Any plans of allowing sub tasks to show up in ...
2,"I love the humor, I just reworded it. Like sa..."
3,naw idk what ur talkin about
4,That sucks to hear. I hate days like that
...,...
84995,Very disappointed with this purchase. It broke...
84996,"It does what it's supposed to do, but nothing ..."
84997,The product is okay. Nothing too special about...
84998,The product is okay. Nothing too special about...


In [7]:
# Drop unnecessary columns
df_new = df_new.drop(columns=["id", "label"]).reset_index(drop=True)

In [8]:
df_new.head()

Unnamed: 0,text,sentiment,emotions
0,"Cooking microwave pizzas, yummy",positive,happy
1,Any plans of allowing sub tasks to show up in ...,neutral,indifferent
2,"I love the humor, I just reworded it. Like sa...",positive,joyful
3,naw idk what ur talkin about,neutral,curious
4,That sucks to hear. I hate days like that,negative,upset


In [9]:
# Text Preprocessing Function
stop_words = set(stopwords.words('english'))

In [10]:
def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove URLs
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in stop_words]  # Remove stopwords
    return ' '.join(tokens)

In [11]:
# Apply text cleaning
df_new['text'] = df_new['text'].apply(clean_text)

In [12]:
# Encode categorical features
le_emotions = LabelEncoder()
df_new['emotions'] = le_emotions.fit_transform(df_new['emotions'])

# Encode sentiment labels (negative=0, neutral=1, positive=2)
le_sentiment = LabelEncoder()

df_new['sentiment'] = le_sentiment.fit_transform(df_new['sentiment'])

In [13]:
print(le_sentiment.classes_)

['negative' 'neutral' 'positive']


In [14]:
print(le_emotions.classes_)

['angry' 'calm' 'content' 'curious' 'disappointed' 'excited' 'frustrated'
 'grateful' 'happy' 'hopeful' 'indifferent' 'joyful' 'neutral' 'sad'
 'thoughtful' 'upset']


In [15]:
df_new.head()

Unnamed: 0,text,sentiment,emotions
0,cooking microwave pizzas yummy,2,8
1,plans allowing sub tasks show widget,1,10
2,love humor reworded like saying group therapy ...,2,11
3,naw idk ur talkin,1,3
4,sucks hear hate days like,0,15


In [16]:
# Convert text to TF-IDF features
tfidf_vectorizer = TfidfVectorizer(max_features=500, ngram_range=(1, 3))
tfidf_features = tfidf_vectorizer.fit_transform(df_new["text"]).toarray()
tfidf_df = pd.DataFrame(tfidf_features, columns=[f"tfidf_{i}" for i in range(tfidf_features.shape[1])])

In [17]:
# Drop text column and merge with TF-IDF features
df_new = df_new.drop(columns=["text"]).reset_index(drop=True)
df_new = pd.concat([df_new, tfidf_df], axis=1)

In [18]:
df_new.head()

Unnamed: 0,sentiment,emotions,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,...,tfidf_490,tfidf_491,tfidf_492,tfidf_493,tfidf_494,tfidf_495,tfidf_496,tfidf_497,tfidf_498,tfidf_499
0,2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
# Features & Labels
X = df_new.drop(columns=["sentiment", "emotions"])
y = df_new[["sentiment", "emotions"]]

In [20]:
# Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [21]:
X_train.shape

(68000, 500)

In [22]:
df_new.head()

Unnamed: 0,sentiment,emotions,tfidf_0,tfidf_1,tfidf_2,tfidf_3,tfidf_4,tfidf_5,tfidf_6,tfidf_7,...,tfidf_490,tfidf_491,tfidf_492,tfidf_493,tfidf_494,tfidf_495,tfidf_496,tfidf_497,tfidf_498,tfidf_499
0,2,8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2,11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0,15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
# Train Random Forest Model
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=50, min_samples_split=5,
                                  class_weight="balanced_subsample", n_jobs=-1, random_state=42)
multi_rf_model = MultiOutputClassifier(rf_model)

In [24]:
multi_rf_model.fit(X_train, y_train)

In [25]:
# Predictions
y_pred_rf = multi_rf_model.predict(X_test)

In [26]:
y_pred_sentiment_rf = le_sentiment.inverse_transform(np.clip(y_pred_rf[:, 0], 0, len(le_sentiment.classes_) - 1))
y_pred_emotion_rf = le_emotions.inverse_transform(np.clip(y_pred_rf[:, 1], 0, len(le_emotions.classes_) - 1))

In [27]:
accuracy_sentiment_rf = accuracy_score(y_test["sentiment"], y_pred_rf[:, 0])
accuracy_emotion_rf = accuracy_score(y_test["emotions"], y_pred_rf[:, 1])

In [28]:
print("\nRandom Forest Results:")
print(f"✅ Sentiment Prediction Accuracy: {accuracy_sentiment_rf:.4f}")
print(f"✅ Emotion Prediction Accuracy: {accuracy_emotion_rf:.4f}")


Random Forest Results:
✅ Sentiment Prediction Accuracy: 0.8601
✅ Emotion Prediction Accuracy: 0.1611


In [29]:
"""# **prediction check**"""

import re
import numpy as np
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer

In [30]:
def predict_sentiment_emotion(input_text, multi_rf_model, le_sentiment, le_emotion, categorical_values, expected_categorical_length, tfidf_vectorizer):


    # Step 1: Preprocess text (clean the text)
    cleaned_text = clean_text(input_text)

    # Step 2: Convert text to TF-IDF features
    tfidf_features = tfidf_vectorizer.transform([cleaned_text]).toarray()
    print(tfidf_features.shape)

    # Step 3: Ensure categorical values are added and match the training data's number of features
    if len(categorical_values) != expected_categorical_length:
        raise ValueError(f"Expected {expected_categorical_length} categorical features, but got {len(categorical_values)}.")

    categorical_array = np.array(categorical_values).reshape(1, -1)  # Convert list to numpy array
    input_features = np.hstack((tfidf_features, categorical_array))  # Combine TF-IDF features and categorical values
    print(categorical_array.shape)
    # Step 4: Ensure the number of features matches the model’s expected input
    print(input_features.shape)


    # Step 5: Predict sentiment and emotion
    prediction = multi_rf_model.predict(tfidf_features)

    # Step 6: Convert prediction back to original labels
    predicted_sentiment = le_sentiment.inverse_transform([prediction[0][0]])[0]
    predicted_emotion = le_emotion.inverse_transform([prediction[0][1]])[0]

    return {
        "sentiment": predicted_sentiment,
        "emotion": predicted_emotion
    }

# Example Usage:
sample_text = "i love this brand very much"
sample_categorical_values = [0]  # Replace with actual categorical values from training
expected_categorical_length = 1  # Adjust this to match the number of categorical features during training

# Assuming tfidf_vectorizer, multi_rf_model, le_sentiment, le_emotion are already trained and available
result = predict_sentiment_emotion(sample_text, multi_rf_model, le_sentiment, le_emotions, sample_categorical_values, expected_categorical_length, tfidf_vectorizer)
print(result)

(1, 500)
(1, 1)
(1, 501)




{'sentiment': 'positive', 'emotion': 'excited'}


In [31]:
le_sentiment.inverse_transform([2])

array(['positive'], dtype=object)

In [32]:
!pip install gradio

Collecting gradio
  Downloading gradio-5.17.1-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.8-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.7.1 (from gradio)
  Downloading gradio_client-1.7.1-py3-none-any.whl.metadata (7.1 kB)
Collecting markupsafe~=2.0 (from gradio)
  Downloading MarkupSafe-2.1.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.0 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.9.7-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.meta

In [33]:
import gradio as gr

def predict_sentiment_emotion_gradio(input_text):
    sample_categorical_values = [0]  # Replace with actual categorical values from training
    expected_categorical_length = 1  # Adjust this to match the number of categorical features during training
    result = predict_sentiment_emotion(input_text, multi_rf_model, le_sentiment, le_emotions, sample_categorical_values, expected_categorical_length, tfidf_vectorizer)
    return result["sentiment"], result["emotion"]

iface = gr.Interface(
    fn=predict_sentiment_emotion_gradio,
    inputs=gr.Textbox(lines=2, placeholder="Enter text here..."),
    outputs=[gr.Textbox(label="Sentiment"), gr.Textbox(label="Emotion")],
    title="Sentiment and Emotion Prediction",
    description="Enter some text, and the model will predict its sentiment and emotion."
)

iface.launch()

Running Gradio in a Colab notebook requires sharing enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://ca23643eb5fd81b7a4.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)




In [34]:
import joblib

# Train the model
#multi_rf_model.fit(X_train, y_train)  # Ensure X_train and y_train are defined

# Save the trained model using joblib
joblib.dump(multi_rf_model, "multi_rf_model.joblib")

['multi_rf_model.joblib']

In [35]:
# Load the saved model
loaded_model = joblib.load("/content/multi_rf_model.joblib")

# Use the loaded model for predictions
predictions = loaded_model.predict(X_test)  # Ensure X_test is defined

In [41]:
from sklearn.metrics import classification_report
# Print classification report for each label separately
for i, label in enumerate(y_test.columns):  # If y_test is a DataFrame
    print(f"Classification Report for {label}:\n")
    print(classification_report(y_test.iloc[:, i], predictions[:, i]))
    print("-" * 50)

Classification Report for sentiment:

              precision    recall  f1-score   support

           0       0.93      0.81      0.86      5437
           1       0.78      0.90      0.83      5961
           2       0.91      0.87      0.89      5602

    accuracy                           0.86     17000
   macro avg       0.87      0.86      0.86     17000
weighted avg       0.87      0.86      0.86     17000

--------------------------------------------------
Classification Report for emotions:

              precision    recall  f1-score   support

           0       0.19      0.14      0.16      1104
           1       0.40      0.00      0.00      1219
           2       0.00      0.00      0.00      1168
           3       0.08      0.85      0.15       426
           4       0.19      0.02      0.04      1124
           5       0.20      0.42      0.27      1116
           6       0.19      0.28      0.23      1075
           7       0.15      0.04      0.06      1077
      

In [None]:






























# y = df_new['sentiment']

# x_train,x_test,y_train,y_test = train_test_split(X,y, test_size=0.2, random_state=42)



# feature = TfidfVectorizer(min_df=1,stop_words='english',lowercase=True)

# x_trian_features =  feature.fit_transform(x_train)
# x_test_features = feature.transform(x_test)

# print(x_trian_features[0:5])











# # Encode Labels
# le_emotion = LabelEncoder()
# y_train["sentiment"] = le_sentiment.fit_transform(y_train["sentiment"])
# y_train["emotion_mining"] = le_emotion.fit_transform(y_train["emotion_mining"])
# y_test["sentiment"] = le_sentiment.transform(y_test["sentiment"])
# y_test["emotion_mining"] = le_emotion.transform(y_test["emotion_mining"])

# print(le_emotion.classes_)



#X_train.shape





















# prompt: analysis the above model and dataset apply gradio for prediction with a text as input



