In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from imblearn.under_sampling import RandomUnderSampler
import joblib
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report,precision_score
from sklearn.metrics import classification_report

# Fetch the NLTK resources requested by this notebook, one package at a time.
for nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(nltk_resource)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pongs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Pongs\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pongs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [15]:
# Combined text preprocessing and cleaning transformer
class TextPreprocessorCleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans, filters and lemmatises review text.

    The transformer has no hyper-parameters and learns nothing in ``fit``; it
    only applies :meth:`clean_and_process` to every document it is given.
    """

    # Compiled once at class creation instead of on every remove_emoji() call.
    # The character ranges are kept exactly as they were so the cleaned output
    # does not change.  Be aware that U+24C2-U+1F251 and U+10000-U+10FFFF are
    # far wider than the emoji blocks, so other characters inside those ranges
    # (e.g. CJK ideographs) are removed as well.
    _EMOJI_PATTERN = re.compile("["
                                u"\U00010000-\U0010ffff"
                                u"\U0001F600-\U0001F64F"
                                u"\U0001F300-\U0001F5FF"
                                u"\U0001F680-\U0001F6FF"
                                u"\U0001F1E0-\U0001F1FF"
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words("english"))

    def remove_emoji(self, text):
        """Return ``text`` with every run of characters matching the emoji pattern removed."""
        return self._EMOJI_PATTERN.sub(r'', text)

    def clean_and_process(self, text):
        """Clean a single document and return its space-joined lemmas.

        Args:
            text: The raw document. ``None`` and float NaN (pandas' marker for
                a missing cell) are treated as an empty document.

        Returns:
            str: The lower-cased text without HTML tags, URLs, e-mail
            addresses, digits, emoji, punctuation and English stop words,
            with each remaining token lemmatised as a verb.
        """
        # Missing values would otherwise make re.sub raise a TypeError.
        if text is None or (isinstance(text, float) and text != text):
            return ""

        text = re.sub(r"<[^>]+>", "", text)  # Remove HTML tags
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
        text = re.sub(r'\S+@\S+', '', text)  # Remove emails
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = text.strip()  # Remove extra spaces
        text = self.remove_emoji(text)  # Remove emojis
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

        # Stop words are filtered *before* lemmatisation, so a token that only
        # becomes a stop word after lemmatisation is kept.
        clean_text = [word for word in text.split() if word not in self.stop_words]
        lemmatized_text = [self.lemmatizer.lemmatize(word, 'v') for word in clean_text]

        return ' '.join(lemmatized_text)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for pipeline chaining."""
        return self

    def transform(self, X):
        """Clean one document or a collection of documents.

        Args:
            X: A single string, a pandas Series of strings, or any other
                iterable of strings (list, tuple, array).

        Returns:
            A cleaned string for string input, a Series for Series input
            (index preserved), otherwise a list of cleaned strings.
        """
        if isinstance(X, str):
            return self.clean_and_process(X)
        if isinstance(X, pd.Series):
            return X.apply(self.clean_and_process)
        # Plain lists (as passed at inference time) have no .apply method.
        return [self.clean_and_process(document) for document in X]

In [4]:
%%time
# Load the dataset
df = pd.read_csv("Review.csv")

CPU times: total: 22.5 s
Wall time: 1min 20s


In [16]:
%%time
# Ensure the column names are correct
X = df["Review"]  # Adjust if needed
Y = df["Sentiment"]  # Adjust if needed

# Split the dataset
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=123)

# Apply undersampling
undersampler = RandomUnderSampler()
x_train_resampled, y_train_resampled = undersampler.fit_resample(x_train.to_frame(), y_train)

CPU times: total: 21.6 s
Wall time: 2min 18s


In [17]:
%%time
# Encode the target labels (y) using LabelEncoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train_resampled)
y_test_encoded = label_encoder.transform(y_test)

# Create the pipeline with preprocessing and classifier
pipeline = make_pipeline(
    TextPreprocessorCleaner(),  # Custom text preprocessing
    TfidfVectorizer(),          # TF-IDF vectorization
    LinearSVC(C=0.1)            # SVM classifier
)

# Train the pipeline
pipeline.fit(x_train_resampled.squeeze(), y_train_encoded)

# Make predictions on the test set
y_pred = pipeline.predict(x_test)

# Decode the predicted labels back to original form
y_pred_decoded = label_encoder.inverse_transform(y_pred)

# Print classification report
print(accuracy_score(y_pred,y_test_encoded) * 100)
print(classification_report(y_test_encoded, y_pred, target_names=label_encoder.classes_))


0.817174960390946
              precision    recall  f1-score   support

    Negative       0.73      0.81      0.77    168023
     Neutral       0.34      0.62      0.44     79971
    Positive       0.96      0.84      0.90    679193

    accuracy                           0.82    927187
   macro avg       0.68      0.76      0.70    927187
weighted avg       0.87      0.82      0.84    927187

CPU times: total: 3min 40s
Wall time: 9min 27s


In [19]:
# Persist the fitted pipeline and the label encoder with joblib,
# each under its own file name in the working directory.
artifacts = (
    (pipeline, 'sentiment_pipeline.joblib'),
    (label_encoder, 'label_encoder.joblib'),
)
for fitted_object, artifact_path in artifacts:
    joblib.dump(fitted_object, artifact_path)


['label_encoder.joblib']

## Write App

In [18]:
%%writefile app.py
import nltk
# Fetch the NLTK resources requested by the app before the corpus imports below.
for nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(nltk_resource)

import gradio as gr
import joblib
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import BaseEstimator, TransformerMixin

# Define the TextPreprocessorCleaner class
class TextPreprocessorCleaner(BaseEstimator, TransformerMixin):
    """Scikit-learn transformer that cleans, filters and lemmatises review text.

    The transformer has no hyper-parameters and learns nothing in ``fit``; it
    only applies :meth:`clean_and_process` to every document it is given.
    """

    # Compiled once at class creation instead of on every remove_emoji() call.
    # The character ranges are kept exactly as they were so the cleaned output
    # does not change.  Be aware that U+24C2-U+1F251 and U+10000-U+10FFFF are
    # far wider than the emoji blocks, so other characters inside those ranges
    # (e.g. CJK ideographs) are removed as well.
    _EMOJI_PATTERN = re.compile("["
                                u"\U00010000-\U0010ffff"
                                u"\U0001F600-\U0001F64F"
                                u"\U0001F300-\U0001F5FF"
                                u"\U0001F680-\U0001F6FF"
                                u"\U0001F1E0-\U0001F1FF"
                                u"\U00002702-\U000027B0"
                                u"\U000024C2-\U0001F251"
                                "]+", flags=re.UNICODE)

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words("english"))

    def remove_emoji(self, text):
        """Return ``text`` with every run of characters matching the emoji pattern removed."""
        return self._EMOJI_PATTERN.sub(r'', text)

    def clean_and_process(self, text):
        """Clean a single document and return its space-joined lemmas.

        Args:
            text: The raw document. ``None`` and float NaN are treated as an
                empty document.

        Returns:
            str: The lower-cased text without HTML tags, URLs, e-mail
            addresses, digits, emoji, punctuation and English stop words,
            with each remaining token lemmatised as a verb.
        """
        # Missing values would otherwise make re.sub raise a TypeError.
        if text is None or (isinstance(text, float) and text != text):
            return ""

        text = re.sub(r"<[^>]+>", "", text)  # Remove HTML tags
        text = text.lower()  # Convert to lowercase
        text = re.sub(r"http\S+|www\S+|https\S+", '', text)  # Remove URLs
        text = re.sub(r'\S+@\S+', '', text)  # Remove emails
        text = re.sub(r'\d+', '', text)  # Remove digits
        text = text.strip()  # Remove surrounding whitespace
        text = self.remove_emoji(text)  # Remove emojis
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation

        # Stop words are filtered *before* lemmatisation, so a token that only
        # becomes a stop word after lemmatisation is kept.
        clean_text = [word for word in text.split() if word not in self.stop_words]
        lemmatized_text = [self.lemmatizer.lemmatize(word, 'v') for word in clean_text]

        return ' '.join(lemmatized_text)

    def fit(self, X, y=None):
        """Nothing is learned from the data; return ``self`` for pipeline chaining."""
        return self

    def transform(self, X):
        """Clean one document or a collection of documents.

        Args:
            X: A single string, a pandas Series of strings, or any other
                iterable of strings (list, tuple, array).

        Returns:
            A cleaned string for string input, a Series for Series input
            (index preserved), otherwise a list of cleaned strings.

        Raises:
            TypeError: If ``X`` is neither a string nor iterable.
        """
        if isinstance(X, str):
            return self.clean_and_process(X)
        if isinstance(X, pd.Series):
            return X.apply(self.clean_and_process)
        # Only the iterability check is guarded, so errors raised while
        # cleaning an individual element are not masked.
        try:
            documents = iter(X)
        except TypeError:
            raise TypeError("Input data should be a string or a list of strings.") from None
        return [self.clean_and_process(document) for document in documents]


# Restore the fitted pipeline and the label encoder from their joblib files
# in the working directory.
PIPELINE_FILE = 'sentiment_pipeline.joblib'
LABEL_ENCODER_FILE = 'label_encoder.joblib'

pipeline, label_encoder = (
    joblib.load(artifact_file)
    for artifact_file in (PIPELINE_FILE, LABEL_ENCODER_FILE)
)

def predict_sentiment(text):
    """Predict the sentiment label for one piece of review text.

    Args:
        text: The raw text entered by the user.

    Returns:
        The decoded sentiment label, or an error message when nothing is
        left of the input after cleaning (or the input is not a string).
    """
    empty_input_message = "Error: Processed text is empty. Please provide valid input."

    # Anything other than a string cannot be cleaned or stripped below.
    if not isinstance(text, str):
        return empty_input_message

    # Clean the input once, using the pipeline's first step.
    processed_text = pipeline.named_steps['textpreprocessorcleaner'].transform(text)
    if processed_text.strip() == "":
        return empty_input_message

    # Predict with the steps *after* the cleaner.  Calling pipeline.predict on
    # the already-cleaned text would clean it a second time, and the cleaner is
    # not idempotent: a token such as "owned" is lemmatised to "own" on the
    # first pass and then removed as a stop word on the second, so the model
    # would see different features than it was trained on.
    prediction = pipeline[1:].predict([processed_text])

    # Decode the prediction to original labels
    decoded_prediction = label_encoder.inverse_transform(prediction)
    return decoded_prediction[0]

# Create the Gradio app with Blocks
with gr.Blocks() as demo:
    # Page heading and a short prompt above the form.
    gr.Markdown("# Sentiment Analysis From Software Review Text")
    gr.Markdown("Your Review Of this Software")

    with gr.Row():
        # Left column: free-text input and the submit button.
        with gr.Column():
            text_input = gr.Textbox(lines=2, placeholder="Enter your text here...")
            submit_button = gr.Button("Submit")

        # Right column: the predicted label (or the error message).
        with gr.Column():
            output_text = gr.Textbox(label="Sentiment")

    # Define the function to be called when the button is clicked
    submit_button.click(fn=predict_sentiment, inputs=text_input, outputs=output_text)

# Launch the app
if __name__ == "__main__":
    # queue() is called without arguments.  The former positional True did not
    # mean "enable the queue": in the gradio 3.x API the first positional
    # parameter of queue() is the integer concurrency_count, so True was being
    # used as the number 1, which is also the default.
    demo.queue().launch(debug=True)


Overwriting app.py


In [19]:
%run app.py

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Pongs\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Pongs\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Running on local URL:  http://127.0.0.1:7865

To create a public link, set `share=True` in `launch()`.


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7862 <> https://c488c6e61669309351.gradio.live
Killing tunnel 127.0.0.1:7863 <> https://d23dcffb12086ed528.gradio.live
Killing tunnel 127.0.0.1:7864 <> https://31fd72e0f0ccf21b5e.gradio.live


In [9]:
%%writefile requirements.txt
# Dependencies written to requirements.txt by this cell.
# Only gradio is pinned; every other package is left unpinned.
# NOTE(review): the saved joblib artifacts are presumably sensitive to the
# scikit-learn version they were created with - consider pinning it; confirm.
gradio==3.30.0
joblib
nltk
scikit-learn
pandas
# NOTE(review): app.py as written above does not import imblearn; it is only
# used by the training cells. The project is published on PyPI as
# "imbalanced-learn" - TODO confirm which name should be listed here.
imblearn


Writing requirements.txt
